diff --git a/.gitattributes b/.gitattributes index 35a60dbd53370325b1a34fec001f68f83726d06f..889bcc198743fdb4c0ba14525351540932446dc5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1782,3 +1782,12 @@ gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0 gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-10000/checkpoint-7456/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-10000/checkpoint-932/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.7-num-4920-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb6e7189f2f644eaba0f037e36f9798ed96f6276 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd96df8ee104e7b3825aa8521a0d4d68984970c2c0b69c6892e57ee4766a75d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2cbc912f02ae90cbdd3e051bdbc30e99a4664ec0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6f14b48c1425500f4059f90fa52d5c760aaa49b2e6f0df8d47abdc03fe0392 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a4fcfc84cc73a03993f1ee6648818b541c388f7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad32c79dc494734b80050adc382efca96e81ae40ef32fa7664ad749678fc349 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0eabab2151ade0cdec20d69d59addc1f2ced891 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3d235f210f46bbb60f6d2548b1e40c44c9a40e073bd92a5a18cd7dd43429f9 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d02231837a1427a329a539ec576fe9acbfd4f5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b6337f8f742e31a4a77df511d26cd45db983f0e338ee991f6efa6198d673f80 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b7c4a41ababa96cb0bb555fd9e2fbb06861116d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/trainer_state.json @@ -0,0 +1,7397 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 6.99966499162479, + "eval_steps": 10, + "global_step": 10447, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + }, + { + "epoch": 3.0016750418760467, + "grad_norm": 0.41169142723083496, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 4480 + }, + { + "epoch": 3.0083752093802345, + "grad_norm": 0.4865207374095917, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4490 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5462028384208679, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4500 + }, + { + "epoch": 3.0217755443886096, + "grad_norm": 0.6169732809066772, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 4510 + }, + { + "epoch": 3.0284757118927974, + "grad_norm": 0.5667954087257385, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4520 + }, + { + "epoch": 3.0351758793969847, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 4530 + }, + { + "epoch": 3.0418760469011725, + "grad_norm": 0.5220064520835876, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4540 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.5469558835029602, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 4550 + }, + { + "epoch": 3.0552763819095476, + "grad_norm": 0.5680848956108093, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 4560 + }, + { + "epoch": 3.0619765494137354, + "grad_norm": 0.5906574726104736, + "learning_rate": 0.0002, + "loss": 1.5187, + "step": 4570 + }, + { + "epoch": 3.0686767169179228, + "grad_norm": 0.4725631773471832, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4580 + }, + { + "epoch": 3.0753768844221105, + "grad_norm": 0.5273477435112, + "learning_rate": 0.0002, + "loss": 1.5083, + "step": 4590 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.5861203074455261, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 4600 + }, + { + "epoch": 3.0887772194304857, + "grad_norm": 0.5343965291976929, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 4610 + }, + { + "epoch": 3.0954773869346734, + "grad_norm": 0.5348150730133057, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4620 + }, + { + "epoch": 3.102177554438861, + "grad_norm": 0.5971846580505371, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4630 + }, + { + "epoch": 3.1088777219430486, + "grad_norm": 0.5203177332878113, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4640 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.55289226770401, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4650 + }, + { + "epoch": 3.1222780569514237, + "grad_norm": 0.6878530979156494, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4660 + }, + { + "epoch": 3.1289782244556115, + "grad_norm": 0.6173256635665894, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 4670 + }, + { + "epoch": 3.135678391959799, + "grad_norm": 0.536796510219574, + "learning_rate": 0.0002, + "loss": 1.51, + "step": 4680 + }, + { + "epoch": 3.1423785594639866, + "grad_norm": 0.58846116065979, + "learning_rate": 0.0002, + "loss": 1.4713, + "step": 4690 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.645889401435852, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 4700 + }, + { + "epoch": 3.1557788944723617, + "grad_norm": 0.6118691563606262, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 4710 + }, + { + "epoch": 3.1624790619765495, + "grad_norm": 0.5189669132232666, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4720 + }, + { + "epoch": 3.169179229480737, + "grad_norm": 0.5794713497161865, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4730 + }, + { + "epoch": 3.1758793969849246, + "grad_norm": 0.6579326391220093, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 4740 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.5822742581367493, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 4750 + }, + { + "epoch": 3.1892797319932997, + "grad_norm": 0.5475956201553345, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 4760 + }, + { + "epoch": 3.1959798994974875, + "grad_norm": 0.6743834018707275, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4770 + }, + { + "epoch": 3.202680067001675, + "grad_norm": 0.6110585927963257, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4780 + }, + { + "epoch": 3.2093802345058626, + "grad_norm": 0.5426181554794312, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4790 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6077824234962463, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 4800 + }, + { + "epoch": 3.2227805695142377, + "grad_norm": 0.5785858631134033, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 4810 + }, + { + "epoch": 3.2294807370184255, + "grad_norm": 0.6425958275794983, + "learning_rate": 0.0002, + "loss": 1.4041, + "step": 4820 + }, + { + "epoch": 3.236180904522613, + "grad_norm": 0.6607080698013306, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 4830 + }, + { + "epoch": 3.2428810720268006, + "grad_norm": 0.5385788679122925, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4840 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.5630403757095337, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 4850 + }, + { + "epoch": 3.2562814070351758, + "grad_norm": 0.6340779662132263, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 4860 + }, + { + "epoch": 3.2629815745393635, + "grad_norm": 0.5305342674255371, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4870 + }, + { + "epoch": 3.2696817420435513, + "grad_norm": 0.597670316696167, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 4880 + }, + { + "epoch": 3.2763819095477387, + "grad_norm": 0.665553867816925, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4890 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.579767644405365, + "learning_rate": 0.0002, + "loss": 1.4607, + "step": 4900 + }, + { + "epoch": 3.289782244556114, + "grad_norm": 0.5512481331825256, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 4910 + }, + { + "epoch": 3.2964824120603016, + "grad_norm": 0.5916532278060913, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 4920 + }, + { + "epoch": 3.3031825795644894, + "grad_norm": 0.7521726489067078, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 4930 + }, + { + "epoch": 3.3098827470686767, + "grad_norm": 0.5352797508239746, + "learning_rate": 0.0002, + "loss": 1.4223, + "step": 4940 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.5950371623039246, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4950 + }, + { + "epoch": 3.323283082077052, + "grad_norm": 0.8020477890968323, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 4960 + }, + { + "epoch": 3.3299832495812396, + "grad_norm": 0.6790024638175964, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4970 + }, + { + "epoch": 3.3366834170854274, + "grad_norm": 0.687627375125885, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4980 + }, + { + "epoch": 3.3433835845896147, + "grad_norm": 0.6094385385513306, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 4990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.6541242003440857, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 5000 + }, + { + "epoch": 3.35678391959799, + "grad_norm": 0.5560880303382874, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.3634840871021776, + "grad_norm": 0.5440094470977783, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 5020 + }, + { + "epoch": 3.3701842546063654, + "grad_norm": 0.5749301314353943, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5030 + }, + { + "epoch": 3.3768844221105527, + "grad_norm": 0.5919716954231262, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 5040 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.6331481337547302, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5050 + }, + { + "epoch": 3.390284757118928, + "grad_norm": 0.5687161684036255, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 5060 + }, + { + "epoch": 3.3969849246231156, + "grad_norm": 0.6718577742576599, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5070 + }, + { + "epoch": 3.4036850921273034, + "grad_norm": 0.5089324116706848, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 5080 + }, + { + "epoch": 3.4103852596314908, + "grad_norm": 0.5710174441337585, + "learning_rate": 0.0002, + "loss": 1.512, + "step": 5090 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6670721173286438, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5100 + }, + { + "epoch": 3.423785594639866, + "grad_norm": 0.6875665187835693, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 5110 + }, + { + "epoch": 3.4304857621440537, + "grad_norm": 0.5375880599021912, + "learning_rate": 0.0002, + "loss": 1.4496, + "step": 5120 + }, + { + "epoch": 3.4371859296482414, + "grad_norm": 0.6550399661064148, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 5130 + }, + { + "epoch": 3.4438860971524288, + "grad_norm": 0.5948067903518677, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 5140 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.6134477257728577, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5150 + }, + { + "epoch": 3.457286432160804, + "grad_norm": 0.6506398320198059, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 5160 + }, + { + "epoch": 3.4639865996649917, + "grad_norm": 0.6060147881507874, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 5170 + }, + { + "epoch": 3.4706867671691795, + "grad_norm": 0.6173806190490723, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 5180 + }, + { + "epoch": 3.477386934673367, + "grad_norm": 0.6032607555389404, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 5190 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5652492046356201, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 5200 + }, + { + "epoch": 3.490787269681742, + "grad_norm": 0.6168607473373413, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 5210 + }, + { + "epoch": 3.4974874371859297, + "grad_norm": 0.6170629262924194, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 5220 + }, + { + "epoch": 3.5041876046901175, + "grad_norm": 0.6926297545433044, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 5230 + }, + { + "epoch": 3.510887772194305, + "grad_norm": 0.6702437996864319, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 5240 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.5421436429023743, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 5250 + }, + { + "epoch": 3.52428810720268, + "grad_norm": 0.5726765990257263, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 5260 + }, + { + "epoch": 3.5309882747068677, + "grad_norm": 0.5685455203056335, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 3.5376884422110555, + "grad_norm": 0.6018396019935608, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 5280 + }, + { + "epoch": 3.544388609715243, + "grad_norm": 0.5731932520866394, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 5290 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5300 + }, + { + "epoch": 3.557788944723618, + "grad_norm": 0.5545530319213867, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5310 + }, + { + "epoch": 3.5644891122278057, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5320 + }, + { + "epoch": 3.5711892797319935, + "grad_norm": 0.5651767253875732, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5330 + }, + { + "epoch": 3.577889447236181, + "grad_norm": 0.7425084114074707, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 5340 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5770602226257324, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 5350 + }, + { + "epoch": 3.591289782244556, + "grad_norm": 0.54723060131073, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 5360 + }, + { + "epoch": 3.5979899497487438, + "grad_norm": 0.6658238172531128, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 5370 + }, + { + "epoch": 3.6046901172529315, + "grad_norm": 0.5787645578384399, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5380 + }, + { + "epoch": 3.611390284757119, + "grad_norm": 0.594913125038147, + "learning_rate": 0.0002, + "loss": 1.5343, + "step": 5390 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.4964977502822876, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5400 + }, + { + "epoch": 3.624790619765494, + "grad_norm": 0.6087527275085449, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 5410 + }, + { + "epoch": 3.6314907872696818, + "grad_norm": 0.6315323710441589, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 5420 + }, + { + "epoch": 3.6381909547738696, + "grad_norm": 0.574799120426178, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 5430 + }, + { + "epoch": 3.644891122278057, + "grad_norm": 0.5949277877807617, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 5440 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.5640677213668823, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 5450 + }, + { + "epoch": 3.658291457286432, + "grad_norm": 0.6198237538337708, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 5460 + }, + { + "epoch": 3.66499162479062, + "grad_norm": 0.6902034878730774, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 5470 + }, + { + "epoch": 3.6716917922948076, + "grad_norm": 0.5686674118041992, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5480 + }, + { + "epoch": 3.678391959798995, + "grad_norm": 0.6532107591629028, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 5490 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 5500 + }, + { + "epoch": 3.69179229480737, + "grad_norm": 0.6055065393447876, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5510 + }, + { + "epoch": 3.698492462311558, + "grad_norm": 0.5630605816841125, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 5520 + }, + { + "epoch": 3.7051926298157456, + "grad_norm": 0.6005825996398926, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 5530 + }, + { + "epoch": 3.711892797319933, + "grad_norm": 0.6553038954734802, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 5540 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5601094961166382, + "learning_rate": 0.0002, + "loss": 1.4943, + "step": 5550 + }, + { + "epoch": 3.725293132328308, + "grad_norm": 0.6598808169364929, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 5560 + }, + { + "epoch": 3.731993299832496, + "grad_norm": 0.5506255626678467, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 5570 + }, + { + "epoch": 3.7386934673366836, + "grad_norm": 0.6001223921775818, + "learning_rate": 0.0002, + "loss": 1.4805, + "step": 5580 + }, + { + "epoch": 3.745393634840871, + "grad_norm": 0.6287297606468201, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 5590 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.6253238916397095, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 5600 + }, + { + "epoch": 3.758793969849246, + "grad_norm": 0.5713174939155579, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 5610 + }, + { + "epoch": 3.765494137353434, + "grad_norm": 0.6198310852050781, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5620 + }, + { + "epoch": 3.7721943048576216, + "grad_norm": 0.5941224098205566, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 5630 + }, + { + "epoch": 3.778894472361809, + "grad_norm": 0.606002151966095, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 5640 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.6540704965591431, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 5650 + }, + { + "epoch": 3.792294807370184, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5660 + }, + { + "epoch": 3.798994974874372, + "grad_norm": 0.5649605393409729, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5670 + }, + { + "epoch": 3.8056951423785597, + "grad_norm": 0.6788773536682129, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 5680 + }, + { + "epoch": 3.812395309882747, + "grad_norm": 0.6581860780715942, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 5690 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.5529348850250244, + "learning_rate": 0.0002, + "loss": 1.4587, + "step": 5700 + }, + { + "epoch": 3.825795644891122, + "grad_norm": 0.6320232152938843, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 5710 + }, + { + "epoch": 3.83249581239531, + "grad_norm": 0.6529698371887207, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 5720 + }, + { + "epoch": 3.8391959798994977, + "grad_norm": 0.5983362793922424, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5730 + }, + { + "epoch": 3.845896147403685, + "grad_norm": 0.6335684061050415, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 5740 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.700446605682373, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5750 + }, + { + "epoch": 3.85929648241206, + "grad_norm": 0.6092597842216492, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 5760 + }, + { + "epoch": 3.865996649916248, + "grad_norm": 0.564146101474762, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 5770 + }, + { + "epoch": 3.8726968174204357, + "grad_norm": 0.615275502204895, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 5780 + }, + { + "epoch": 3.879396984924623, + "grad_norm": 0.6685376763343811, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 5790 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5800 + }, + { + "epoch": 3.892797319932998, + "grad_norm": 0.5486813187599182, + "learning_rate": 0.0002, + "loss": 1.5179, + "step": 5810 + }, + { + "epoch": 3.899497487437186, + "grad_norm": 0.6208204030990601, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 5820 + }, + { + "epoch": 3.9061976549413737, + "grad_norm": 0.6500625014305115, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 5830 + }, + { + "epoch": 3.912897822445561, + "grad_norm": 0.5948089361190796, + "learning_rate": 0.0002, + "loss": 1.4716, + "step": 5840 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.7210732698440552, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 5850 + }, + { + "epoch": 3.926298157453936, + "grad_norm": 0.6662322878837585, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 5860 + }, + { + "epoch": 3.932998324958124, + "grad_norm": 0.5613839626312256, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5870 + }, + { + "epoch": 3.9396984924623117, + "grad_norm": 0.6069002151489258, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5880 + }, + { + "epoch": 3.946398659966499, + "grad_norm": 0.7075562477111816, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 5890 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.6316173076629639, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 5900 + }, + { + "epoch": 3.959798994974874, + "grad_norm": 0.5716308355331421, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 5910 + }, + { + "epoch": 3.966499162479062, + "grad_norm": 0.6800096035003662, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5920 + }, + { + "epoch": 3.9731993299832498, + "grad_norm": 0.6057983040809631, + "learning_rate": 0.0002, + "loss": 1.5189, + "step": 5930 + }, + { + "epoch": 3.979899497487437, + "grad_norm": 0.5938987731933594, + "learning_rate": 0.0002, + "loss": 1.5431, + "step": 5940 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.6963576674461365, + "learning_rate": 0.0002, + "loss": 1.5111, + "step": 5950 + }, + { + "epoch": 3.993299832495812, + "grad_norm": 0.6279940009117126, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 5960 + }, + { + "epoch": 4.0, + "grad_norm": 0.7161159515380859, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 5970 + }, + { + "epoch": 4.0, + "eval_loss": 1.8655421733856201, + "eval_runtime": 37.9276, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.714, + "step": 5970 + }, + { + "epoch": 4.006700167504188, + "grad_norm": 0.7380476593971252, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 5980 + }, + { + "epoch": 4.013400335008376, + "grad_norm": 0.7148947715759277, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 5990 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.6177082657814026, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 6000 + }, + { + "epoch": 4.02680067001675, + "grad_norm": 0.8552946448326111, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 6010 + }, + { + "epoch": 4.033500837520938, + "grad_norm": 0.8033416271209717, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 6020 + }, + { + "epoch": 4.040201005025126, + "grad_norm": 0.8501318097114563, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 6030 + }, + { + "epoch": 4.046901172529314, + "grad_norm": 0.6981393098831177, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6040 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.7227180600166321, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 6050 + }, + { + "epoch": 4.060301507537688, + "grad_norm": 0.6923989653587341, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 6060 + }, + { + "epoch": 4.067001675041876, + "grad_norm": 0.879779040813446, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 6070 + }, + { + "epoch": 4.073701842546064, + "grad_norm": 0.8184754848480225, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 6080 + }, + { + "epoch": 4.080402010050252, + "grad_norm": 0.8211342692375183, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 6090 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.7542396783828735, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 6100 + }, + { + "epoch": 4.093802345058626, + "grad_norm": 0.6631066799163818, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 6110 + }, + { + "epoch": 4.100502512562814, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 6120 + }, + { + "epoch": 4.107202680067002, + "grad_norm": 0.681851863861084, + "learning_rate": 0.0002, + "loss": 1.3443, + "step": 6130 + }, + { + "epoch": 4.11390284757119, + "grad_norm": 0.8757794499397278, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 6140 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.6567301750183105, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 6150 + }, + { + "epoch": 4.127303182579564, + "grad_norm": 0.7950329184532166, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 6160 + }, + { + "epoch": 4.134003350083752, + "grad_norm": 0.7545644044876099, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6170 + }, + { + "epoch": 4.14070351758794, + "grad_norm": 0.7172710299491882, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 6180 + }, + { + "epoch": 4.147403685092128, + "grad_norm": 0.7040584087371826, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6190 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7482913732528687, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 6200 + }, + { + "epoch": 4.160804020100502, + "grad_norm": 0.8523276448249817, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 6210 + }, + { + "epoch": 4.16750418760469, + "grad_norm": 0.6672041416168213, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 6220 + }, + { + "epoch": 4.174204355108878, + "grad_norm": 0.7523500919342041, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 6230 + }, + { + "epoch": 4.180904522613066, + "grad_norm": 0.8085253834724426, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 6240 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.789450466632843, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 6250 + }, + { + "epoch": 4.19430485762144, + "grad_norm": 0.7502310872077942, + "learning_rate": 0.0002, + "loss": 1.3539, + "step": 6260 + }, + { + "epoch": 4.201005025125628, + "grad_norm": 0.7397456765174866, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 6270 + }, + { + "epoch": 4.207705192629816, + "grad_norm": 0.6921947002410889, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6280 + }, + { + "epoch": 4.214405360134004, + "grad_norm": 0.9334571957588196, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6290 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.725799024105072, + "learning_rate": 0.0002, + "loss": 1.3612, + "step": 6300 + }, + { + "epoch": 4.227805695142378, + "grad_norm": 0.8290495872497559, + "learning_rate": 0.0002, + "loss": 1.4217, + "step": 6310 + }, + { + "epoch": 4.234505862646566, + "grad_norm": 0.688983678817749, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 6320 + }, + { + "epoch": 4.241206030150754, + "grad_norm": 0.8620913028717041, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 6330 + }, + { + "epoch": 4.247906197654942, + "grad_norm": 0.8008657693862915, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6340 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7379199266433716, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6350 + }, + { + "epoch": 4.261306532663316, + "grad_norm": 0.7842815518379211, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 6360 + }, + { + "epoch": 4.268006700167504, + "grad_norm": 0.812600314617157, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 6370 + }, + { + "epoch": 4.274706867671692, + "grad_norm": 0.7852841019630432, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 6380 + }, + { + "epoch": 4.28140703517588, + "grad_norm": 1.0377534627914429, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 6390 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 1.03935706615448, + "learning_rate": 0.0002, + "loss": 1.3755, + "step": 6400 + }, + { + "epoch": 4.294807370184254, + "grad_norm": 0.7244732975959778, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 6410 + }, + { + "epoch": 4.301507537688442, + "grad_norm": 0.7137406468391418, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 6420 + }, + { + "epoch": 4.30820770519263, + "grad_norm": 0.7492543458938599, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 6430 + }, + { + "epoch": 4.314907872696818, + "grad_norm": 0.7065439224243164, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 6440 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.7786989808082581, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 6450 + }, + { + "epoch": 4.328308207705192, + "grad_norm": 0.7369208335876465, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6460 + }, + { + "epoch": 4.33500837520938, + "grad_norm": 0.7412346005439758, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 6470 + }, + { + "epoch": 4.341708542713568, + "grad_norm": 0.780927300453186, + "learning_rate": 0.0002, + "loss": 1.4087, + "step": 6480 + }, + { + "epoch": 4.348408710217756, + "grad_norm": 0.8320930600166321, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 6490 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.6871094703674316, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6500 + }, + { + "epoch": 4.36180904522613, + "grad_norm": 0.6751559972763062, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6510 + }, + { + "epoch": 4.368509212730318, + "grad_norm": 0.7723976969718933, + "learning_rate": 0.0002, + "loss": 1.4311, + "step": 6520 + }, + { + "epoch": 4.375209380234506, + "grad_norm": 0.7915401458740234, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 6530 + }, + { + "epoch": 4.381909547738694, + "grad_norm": 0.7329102754592896, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 6540 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.7388760447502136, + "learning_rate": 0.0002, + "loss": 1.447, + "step": 6550 + }, + { + "epoch": 4.3953098827470685, + "grad_norm": 0.8282579183578491, + "learning_rate": 0.0002, + "loss": 1.4378, + "step": 6560 + }, + { + "epoch": 4.402010050251256, + "grad_norm": 0.7192724347114563, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6570 + }, + { + "epoch": 4.408710217755444, + "grad_norm": 0.746526837348938, + "learning_rate": 0.0002, + "loss": 1.4141, + "step": 6580 + }, + { + "epoch": 4.415410385259632, + "grad_norm": 0.8738046288490295, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 6590 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.8408458828926086, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 6600 + }, + { + "epoch": 4.4288107202680065, + "grad_norm": 0.8110666275024414, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 6610 + }, + { + "epoch": 4.435510887772194, + "grad_norm": 0.8602406978607178, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 6620 + }, + { + "epoch": 4.442211055276382, + "grad_norm": 0.7549102902412415, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6630 + }, + { + "epoch": 4.44891122278057, + "grad_norm": 0.7831804156303406, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 6640 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.7269673943519592, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6650 + }, + { + "epoch": 4.4623115577889445, + "grad_norm": 0.7397838830947876, + "learning_rate": 0.0002, + "loss": 1.4132, + "step": 6660 + }, + { + "epoch": 4.469011725293132, + "grad_norm": 0.713707447052002, + "learning_rate": 0.0002, + "loss": 1.3174, + "step": 6670 + }, + { + "epoch": 4.47571189279732, + "grad_norm": 0.7525581121444702, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 6680 + }, + { + "epoch": 4.482412060301508, + "grad_norm": 0.8030191659927368, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6690 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.7469439506530762, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 6700 + }, + { + "epoch": 4.4958123953098825, + "grad_norm": 0.7743868231773376, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 6710 + }, + { + "epoch": 4.50251256281407, + "grad_norm": 0.6539737582206726, + "learning_rate": 0.0002, + "loss": 1.3439, + "step": 6720 + }, + { + "epoch": 4.509212730318258, + "grad_norm": 0.825818657875061, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 6730 + }, + { + "epoch": 4.515912897822446, + "grad_norm": 0.8048575520515442, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 6740 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.7828766107559204, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6750 + }, + { + "epoch": 4.5293132328308205, + "grad_norm": 0.7406010031700134, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 6760 + }, + { + "epoch": 4.536013400335008, + "grad_norm": 0.840345561504364, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 6770 + }, + { + "epoch": 4.542713567839196, + "grad_norm": 0.8492622971534729, + "learning_rate": 0.0002, + "loss": 1.4808, + "step": 6780 + }, + { + "epoch": 4.549413735343384, + "grad_norm": 0.7130163908004761, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 6790 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 0.8454728126525879, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 6800 + }, + { + "epoch": 4.562814070351759, + "grad_norm": 0.7847645282745361, + "learning_rate": 0.0002, + "loss": 1.3239, + "step": 6810 + }, + { + "epoch": 4.569514237855946, + "grad_norm": 0.7245864272117615, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 6820 + }, + { + "epoch": 4.576214405360134, + "grad_norm": 0.768893301486969, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 6830 + }, + { + "epoch": 4.582914572864322, + "grad_norm": 0.8028400540351868, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6840 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.763945460319519, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 6850 + }, + { + "epoch": 4.596314907872697, + "grad_norm": 0.7417685389518738, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 6860 + }, + { + "epoch": 4.603015075376884, + "grad_norm": 0.7603038549423218, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 6870 + }, + { + "epoch": 4.609715242881072, + "grad_norm": 0.7981528043746948, + "learning_rate": 0.0002, + "loss": 1.4095, + "step": 6880 + }, + { + "epoch": 4.61641541038526, + "grad_norm": 0.8077111840248108, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6890 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.8778454065322876, + "learning_rate": 0.0002, + "loss": 1.4721, + "step": 6900 + }, + { + "epoch": 4.629815745393635, + "grad_norm": 0.8620710372924805, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6910 + }, + { + "epoch": 4.636515912897822, + "grad_norm": 0.7486072778701782, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 6920 + }, + { + "epoch": 4.64321608040201, + "grad_norm": 0.7493042945861816, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 6930 + }, + { + "epoch": 4.649916247906198, + "grad_norm": 0.7388978600502014, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 6940 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.798530638217926, + "learning_rate": 0.0002, + "loss": 1.3593, + "step": 6950 + }, + { + "epoch": 4.663316582914573, + "grad_norm": 0.7929500937461853, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 6960 + }, + { + "epoch": 4.67001675041876, + "grad_norm": 0.9186785221099854, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 6970 + }, + { + "epoch": 4.676716917922948, + "grad_norm": 1.1103485822677612, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 6980 + }, + { + "epoch": 4.683417085427136, + "grad_norm": 0.8000466823577881, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 6990 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7520599961280823, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 7000 + }, + { + "epoch": 4.696817420435511, + "grad_norm": 0.7971973419189453, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 7010 + }, + { + "epoch": 4.703517587939698, + "grad_norm": 0.7363343834877014, + "learning_rate": 0.0002, + "loss": 1.3682, + "step": 7020 + }, + { + "epoch": 4.710217755443886, + "grad_norm": 0.8268865942955017, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 7030 + }, + { + "epoch": 4.716917922948074, + "grad_norm": 0.7054963111877441, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 7040 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.8196262121200562, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7050 + }, + { + "epoch": 4.730318257956449, + "grad_norm": 0.8276031017303467, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 7060 + }, + { + "epoch": 4.7370184254606365, + "grad_norm": 0.8248157501220703, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 7070 + }, + { + "epoch": 4.743718592964824, + "grad_norm": 0.8937979936599731, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 7080 + }, + { + "epoch": 4.750418760469012, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 7090 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.9495313763618469, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 7100 + }, + { + "epoch": 4.763819095477387, + "grad_norm": 0.8598204255104065, + "learning_rate": 0.0002, + "loss": 1.4504, + "step": 7110 + }, + { + "epoch": 4.7705192629815745, + "grad_norm": 0.8951472640037537, + "learning_rate": 0.0002, + "loss": 1.3969, + "step": 7120 + }, + { + "epoch": 4.777219430485762, + "grad_norm": 0.9110309481620789, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 7130 + }, + { + "epoch": 4.78391959798995, + "grad_norm": 0.7929584980010986, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 7140 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7415322661399841, + "learning_rate": 0.0002, + "loss": 1.467, + "step": 7150 + }, + { + "epoch": 4.797319932998325, + "grad_norm": 0.7504757046699524, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 7160 + }, + { + "epoch": 4.8040201005025125, + "grad_norm": 0.7166924476623535, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 7170 + }, + { + "epoch": 4.8107202680067, + "grad_norm": 0.7728400826454163, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 7180 + }, + { + "epoch": 4.817420435510888, + "grad_norm": 0.7992154955863953, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 7190 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.8655321002006531, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 7200 + }, + { + "epoch": 4.830820770519263, + "grad_norm": 0.7672632336616516, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 7210 + }, + { + "epoch": 4.8375209380234505, + "grad_norm": 0.708416223526001, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7220 + }, + { + "epoch": 4.844221105527638, + "grad_norm": 0.8914081454277039, + "learning_rate": 0.0002, + "loss": 1.5413, + "step": 7230 + }, + { + "epoch": 4.850921273031826, + "grad_norm": 0.7141931653022766, + "learning_rate": 0.0002, + "loss": 1.3569, + "step": 7240 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.6913040280342102, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 7250 + }, + { + "epoch": 4.864321608040201, + "grad_norm": 0.7871233820915222, + "learning_rate": 0.0002, + "loss": 1.3912, + "step": 7260 + }, + { + "epoch": 4.8710217755443885, + "grad_norm": 0.8466277122497559, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 7270 + }, + { + "epoch": 4.877721943048576, + "grad_norm": 0.8492183685302734, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 7280 + }, + { + "epoch": 4.884422110552764, + "grad_norm": 0.8339574933052063, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 7290 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.787022590637207, + "learning_rate": 0.0002, + "loss": 1.4157, + "step": 7300 + }, + { + "epoch": 4.897822445561139, + "grad_norm": 0.8877332806587219, + "learning_rate": 0.0002, + "loss": 1.3725, + "step": 7310 + }, + { + "epoch": 4.9045226130653266, + "grad_norm": 0.744989812374115, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 7320 + }, + { + "epoch": 4.911222780569514, + "grad_norm": 0.8027268648147583, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 7330 + }, + { + "epoch": 4.917922948073702, + "grad_norm": 0.6437455415725708, + "learning_rate": 0.0002, + "loss": 1.425, + "step": 7340 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.685999870300293, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 7350 + }, + { + "epoch": 4.931323283082077, + "grad_norm": 0.9086187481880188, + "learning_rate": 0.0002, + "loss": 1.4352, + "step": 7360 + }, + { + "epoch": 4.938023450586265, + "grad_norm": 0.8272411227226257, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 7370 + }, + { + "epoch": 4.944723618090452, + "grad_norm": 0.9227852821350098, + "learning_rate": 0.0002, + "loss": 1.4226, + "step": 7380 + }, + { + "epoch": 4.95142378559464, + "grad_norm": 0.7688441276550293, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 7390 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8662643432617188, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 7400 + }, + { + "epoch": 4.964824120603015, + "grad_norm": 0.9234127998352051, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 7410 + }, + { + "epoch": 4.971524288107203, + "grad_norm": 0.9131470918655396, + "learning_rate": 0.0002, + "loss": 1.4009, + "step": 7420 + }, + { + "epoch": 4.97822445561139, + "grad_norm": 0.7377504110336304, + "learning_rate": 0.0002, + "loss": 1.4544, + "step": 7430 + }, + { + "epoch": 4.984924623115578, + "grad_norm": 0.8762801289558411, + "learning_rate": 0.0002, + "loss": 1.4008, + "step": 7440 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7919872999191284, + "learning_rate": 0.0002, + "loss": 1.4304, + "step": 7450 + }, + { + "epoch": 4.998324958123953, + "grad_norm": 0.7144299149513245, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 7460 + }, + { + "epoch": 4.99966499162479, + "eval_loss": 1.9291157722473145, + "eval_runtime": 37.9831, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 7462 + }, + { + "epoch": 5.005025125628141, + "grad_norm": 0.7860151529312134, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 7470 + }, + { + "epoch": 5.011725293132328, + "grad_norm": 0.9418314695358276, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 7480 + }, + { + "epoch": 5.018425460636516, + "grad_norm": 0.8474572896957397, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 7490 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 1.0724040269851685, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 7500 + }, + { + "epoch": 5.031825795644891, + "grad_norm": 0.9109148979187012, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 7510 + }, + { + "epoch": 5.038525963149079, + "grad_norm": 1.0088659524917603, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 7520 + }, + { + "epoch": 5.045226130653266, + "grad_norm": 1.1421623229980469, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 7530 + }, + { + "epoch": 5.051926298157454, + "grad_norm": 0.9219902157783508, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 7540 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 7550 + }, + { + "epoch": 5.065326633165829, + "grad_norm": 0.8889328241348267, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 7560 + }, + { + "epoch": 5.072026800670017, + "grad_norm": 0.9751363396644592, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 7570 + }, + { + "epoch": 5.078726968174204, + "grad_norm": 0.8603123426437378, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 7580 + }, + { + "epoch": 5.085427135678392, + "grad_norm": 0.8910616636276245, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 7590 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.1128392219543457, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 7600 + }, + { + "epoch": 5.098827470686767, + "grad_norm": 0.9480258822441101, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 7610 + }, + { + "epoch": 5.105527638190955, + "grad_norm": 0.906958818435669, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 7620 + }, + { + "epoch": 5.1122278056951425, + "grad_norm": 0.8741167187690735, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 7630 + }, + { + "epoch": 5.11892797319933, + "grad_norm": 0.966268002986908, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 7640 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 0.9124358892440796, + "learning_rate": 0.0002, + "loss": 1.2782, + "step": 7650 + }, + { + "epoch": 5.132328308207705, + "grad_norm": 1.0436606407165527, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 7660 + }, + { + "epoch": 5.139028475711893, + "grad_norm": 0.9217309355735779, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 7670 + }, + { + "epoch": 5.1457286432160805, + "grad_norm": 1.344765543937683, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7680 + }, + { + "epoch": 5.152428810720268, + "grad_norm": 1.0730723142623901, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 7690 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9321247339248657, + "learning_rate": 0.0002, + "loss": 1.1888, + "step": 7700 + }, + { + "epoch": 5.165829145728643, + "grad_norm": 0.8482614755630493, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 7710 + }, + { + "epoch": 5.172529313232831, + "grad_norm": 0.8274452686309814, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 7720 + }, + { + "epoch": 5.1792294807370185, + "grad_norm": 0.9120376706123352, + "learning_rate": 0.0002, + "loss": 1.1972, + "step": 7730 + }, + { + "epoch": 5.185929648241206, + "grad_norm": 1.0062892436981201, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 7740 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 0.9521504640579224, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 7750 + }, + { + "epoch": 5.199329983249581, + "grad_norm": 0.8800198435783386, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 7760 + }, + { + "epoch": 5.206030150753769, + "grad_norm": 0.9749179482460022, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 7770 + }, + { + "epoch": 5.2127303182579565, + "grad_norm": 0.9441686868667603, + "learning_rate": 0.0002, + "loss": 1.2975, + "step": 7780 + }, + { + "epoch": 5.219430485762144, + "grad_norm": 0.9114066362380981, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 7790 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 0.9851446151733398, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 7800 + }, + { + "epoch": 5.232830820770519, + "grad_norm": 0.9526297450065613, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7810 + }, + { + "epoch": 5.239530988274707, + "grad_norm": 1.05986487865448, + "learning_rate": 0.0002, + "loss": 1.1502, + "step": 7820 + }, + { + "epoch": 5.2462311557788945, + "grad_norm": 0.8956538438796997, + "learning_rate": 0.0002, + "loss": 1.2517, + "step": 7830 + }, + { + "epoch": 5.252931323283082, + "grad_norm": 0.9568153619766235, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 7840 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.0035018920898438, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 7850 + }, + { + "epoch": 5.266331658291457, + "grad_norm": 0.8554368615150452, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 7860 + }, + { + "epoch": 5.273031825795645, + "grad_norm": 0.9677708148956299, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 7870 + }, + { + "epoch": 5.279731993299833, + "grad_norm": 0.943606436252594, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 7880 + }, + { + "epoch": 5.28643216080402, + "grad_norm": 1.0029335021972656, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 7890 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 1.0164015293121338, + "learning_rate": 0.0002, + "loss": 1.2494, + "step": 7900 + }, + { + "epoch": 5.299832495812395, + "grad_norm": 0.8908365368843079, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 7910 + }, + { + "epoch": 5.306532663316583, + "grad_norm": 0.9307826161384583, + "learning_rate": 0.0002, + "loss": 1.2832, + "step": 7920 + }, + { + "epoch": 5.313232830820771, + "grad_norm": 1.0730371475219727, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 7930 + }, + { + "epoch": 5.319932998324958, + "grad_norm": 0.844739556312561, + "learning_rate": 0.0002, + "loss": 1.2003, + "step": 7940 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 1.275833010673523, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 7950 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9042661190032959, + "learning_rate": 0.0002, + "loss": 1.2957, + "step": 7960 + }, + { + "epoch": 5.340033500837521, + "grad_norm": 0.9374269247055054, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 7970 + }, + { + "epoch": 5.346733668341709, + "grad_norm": 1.033098578453064, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 7980 + }, + { + "epoch": 5.353433835845896, + "grad_norm": 1.062775731086731, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 7990 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.1064317226409912, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 8000 + }, + { + "epoch": 5.366834170854271, + "grad_norm": 1.1114039421081543, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 8010 + }, + { + "epoch": 5.373534338358459, + "grad_norm": 1.0198014974594116, + "learning_rate": 0.0002, + "loss": 1.2255, + "step": 8020 + }, + { + "epoch": 5.380234505862647, + "grad_norm": 0.8443173170089722, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 8030 + }, + { + "epoch": 5.386934673366834, + "grad_norm": 1.000881314277649, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 8040 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.9874443411827087, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 8050 + }, + { + "epoch": 5.400335008375209, + "grad_norm": 0.9895344972610474, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 8060 + }, + { + "epoch": 5.407035175879397, + "grad_norm": 0.8595236539840698, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 8070 + }, + { + "epoch": 5.413735343383585, + "grad_norm": 0.9523849487304688, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 8080 + }, + { + "epoch": 5.420435510887772, + "grad_norm": 1.0560476779937744, + "learning_rate": 0.0002, + "loss": 1.2343, + "step": 8090 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0893689393997192, + "learning_rate": 0.0002, + "loss": 1.2956, + "step": 8100 + }, + { + "epoch": 5.433835845896147, + "grad_norm": 0.9395513534545898, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 8110 + }, + { + "epoch": 5.440536013400335, + "grad_norm": 0.9364215135574341, + "learning_rate": 0.0002, + "loss": 1.3444, + "step": 8120 + }, + { + "epoch": 5.447236180904523, + "grad_norm": 0.9502208232879639, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 8130 + }, + { + "epoch": 5.45393634840871, + "grad_norm": 0.9559208154678345, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8140 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9261730313301086, + "learning_rate": 0.0002, + "loss": 1.2495, + "step": 8150 + }, + { + "epoch": 5.467336683417085, + "grad_norm": 0.9832326173782349, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 8160 + }, + { + "epoch": 5.474036850921273, + "grad_norm": 1.065953016281128, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 8170 + }, + { + "epoch": 5.480737018425461, + "grad_norm": 0.9139469861984253, + "learning_rate": 0.0002, + "loss": 1.3617, + "step": 8180 + }, + { + "epoch": 5.4874371859296485, + "grad_norm": 1.2322484254837036, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 8190 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.9722974896430969, + "learning_rate": 0.0002, + "loss": 1.2879, + "step": 8200 + }, + { + "epoch": 5.500837520938023, + "grad_norm": 0.9338926076889038, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 8210 + }, + { + "epoch": 5.507537688442211, + "grad_norm": 0.9283728003501892, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 8220 + }, + { + "epoch": 5.514237855946399, + "grad_norm": 1.0489585399627686, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 8230 + }, + { + "epoch": 5.5209380234505865, + "grad_norm": 0.9881814122200012, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 8240 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9274460077285767, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 8250 + }, + { + "epoch": 5.534338358458961, + "grad_norm": 0.8650718331336975, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 8260 + }, + { + "epoch": 5.541038525963149, + "grad_norm": 1.014069676399231, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 8270 + }, + { + "epoch": 5.547738693467337, + "grad_norm": 0.9212974905967712, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 8280 + }, + { + "epoch": 5.5544388609715245, + "grad_norm": 1.1235398054122925, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 8290 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.961954653263092, + "learning_rate": 0.0002, + "loss": 1.306, + "step": 8300 + }, + { + "epoch": 5.567839195979899, + "grad_norm": 0.9386700391769409, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 8310 + }, + { + "epoch": 5.574539363484087, + "grad_norm": 1.01912522315979, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 8320 + }, + { + "epoch": 5.581239530988275, + "grad_norm": 0.9851216077804565, + "learning_rate": 0.0002, + "loss": 1.3121, + "step": 8330 + }, + { + "epoch": 5.5879396984924625, + "grad_norm": 1.0138001441955566, + "learning_rate": 0.0002, + "loss": 1.3071, + "step": 8340 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9262447357177734, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 8350 + }, + { + "epoch": 5.601340033500837, + "grad_norm": 1.1322970390319824, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 8360 + }, + { + "epoch": 5.608040201005025, + "grad_norm": 1.1429349184036255, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 8370 + }, + { + "epoch": 5.614740368509213, + "grad_norm": 0.9130118489265442, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 8380 + }, + { + "epoch": 5.6214405360134005, + "grad_norm": 0.9651545882225037, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 8390 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.9595398306846619, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 8400 + }, + { + "epoch": 5.634840871021775, + "grad_norm": 1.0049372911453247, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 8410 + }, + { + "epoch": 5.641541038525963, + "grad_norm": 1.082804560661316, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 8420 + }, + { + "epoch": 5.648241206030151, + "grad_norm": 0.9489204287528992, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8430 + }, + { + "epoch": 5.654941373534339, + "grad_norm": 0.9470235109329224, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 8440 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 1.0662927627563477, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 8450 + }, + { + "epoch": 5.668341708542713, + "grad_norm": 0.9097877740859985, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 8460 + }, + { + "epoch": 5.675041876046901, + "grad_norm": 0.9740368127822876, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 8470 + }, + { + "epoch": 5.681742043551089, + "grad_norm": 0.9878810048103333, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 8480 + }, + { + "epoch": 5.688442211055277, + "grad_norm": 1.148260474205017, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 8490 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9632558822631836, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 8500 + }, + { + "epoch": 5.701842546063651, + "grad_norm": 0.876812756061554, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 8510 + }, + { + "epoch": 5.708542713567839, + "grad_norm": 1.0730829238891602, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 8520 + }, + { + "epoch": 5.715242881072027, + "grad_norm": 1.2239218950271606, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 8530 + }, + { + "epoch": 5.721943048576215, + "grad_norm": 0.9460835456848145, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 8540 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9086270928382874, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 8550 + }, + { + "epoch": 5.735343383584589, + "grad_norm": 1.0258867740631104, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8560 + }, + { + "epoch": 5.742043551088777, + "grad_norm": 1.0543923377990723, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 8570 + }, + { + "epoch": 5.748743718592965, + "grad_norm": 0.9063900113105774, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 8580 + }, + { + "epoch": 5.755443886097153, + "grad_norm": 1.1838830709457397, + "learning_rate": 0.0002, + "loss": 1.3535, + "step": 8590 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9631859064102173, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 8600 + }, + { + "epoch": 5.768844221105527, + "grad_norm": 0.9702655673027039, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 8610 + }, + { + "epoch": 5.775544388609715, + "grad_norm": 1.0591435432434082, + "learning_rate": 0.0002, + "loss": 1.3196, + "step": 8620 + }, + { + "epoch": 5.782244556113903, + "grad_norm": 0.9989570379257202, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 8630 + }, + { + "epoch": 5.788944723618091, + "grad_norm": 1.0836435556411743, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 8640 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 0.8832896947860718, + "learning_rate": 0.0002, + "loss": 1.3334, + "step": 8650 + }, + { + "epoch": 5.802345058626465, + "grad_norm": 1.0104607343673706, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 8660 + }, + { + "epoch": 5.809045226130653, + "grad_norm": 0.8375084400177002, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 8670 + }, + { + "epoch": 5.815745393634841, + "grad_norm": 1.1300716400146484, + "learning_rate": 0.0002, + "loss": 1.3554, + "step": 8680 + }, + { + "epoch": 5.822445561139029, + "grad_norm": 0.9311910271644592, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 8690 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9488391876220703, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 8700 + }, + { + "epoch": 5.835845896147403, + "grad_norm": 0.9747629761695862, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 8710 + }, + { + "epoch": 5.842546063651591, + "grad_norm": 1.1029598712921143, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 8720 + }, + { + "epoch": 5.849246231155779, + "grad_norm": 1.0396875143051147, + "learning_rate": 0.0002, + "loss": 1.3613, + "step": 8730 + }, + { + "epoch": 5.855946398659967, + "grad_norm": 0.9259780645370483, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 8740 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.020033597946167, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 8750 + }, + { + "epoch": 5.869346733668341, + "grad_norm": 0.9191218614578247, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 8760 + }, + { + "epoch": 5.876046901172529, + "grad_norm": 1.1093107461929321, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 8770 + }, + { + "epoch": 5.882747068676717, + "grad_norm": 1.1626793146133423, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 8780 + }, + { + "epoch": 5.889447236180905, + "grad_norm": 0.9542945027351379, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 8790 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.9086058139801025, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 8800 + }, + { + "epoch": 5.902847571189279, + "grad_norm": 0.9249639511108398, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 8810 + }, + { + "epoch": 5.909547738693467, + "grad_norm": 0.9414396286010742, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 8820 + }, + { + "epoch": 5.916247906197655, + "grad_norm": 0.9086037874221802, + "learning_rate": 0.0002, + "loss": 1.2865, + "step": 8830 + }, + { + "epoch": 5.922948073701843, + "grad_norm": 0.8685907125473022, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 8840 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.036419153213501, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8850 + }, + { + "epoch": 5.936348408710217, + "grad_norm": 1.0183674097061157, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 8860 + }, + { + "epoch": 5.943048576214405, + "grad_norm": 0.966444194316864, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 8870 + }, + { + "epoch": 5.949748743718593, + "grad_norm": 1.125693917274475, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 8880 + }, + { + "epoch": 5.956448911222781, + "grad_norm": 0.9857436418533325, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 8890 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 0.9377069473266602, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 8900 + }, + { + "epoch": 5.969849246231155, + "grad_norm": 0.9493814706802368, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 8910 + }, + { + "epoch": 5.976549413735343, + "grad_norm": 0.8806208372116089, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 8920 + }, + { + "epoch": 5.983249581239531, + "grad_norm": 0.8727600574493408, + "learning_rate": 0.0002, + "loss": 1.2558, + "step": 8930 + }, + { + "epoch": 5.989949748743719, + "grad_norm": 0.9799810647964478, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 8940 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.9866513609886169, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 8950 + }, + { + "epoch": 6.0, + "eval_loss": 2.0282373428344727, + "eval_runtime": 38.0375, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 8955 + }, + { + "epoch": 6.0033500837520934, + "grad_norm": 0.8747885227203369, + "learning_rate": 0.0002, + "loss": 1.1768, + "step": 8960 + }, + { + "epoch": 6.010050251256281, + "grad_norm": 1.2512741088867188, + "learning_rate": 0.0002, + "loss": 1.0677, + "step": 8970 + }, + { + "epoch": 6.016750418760469, + "grad_norm": 1.06855309009552, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 8980 + }, + { + "epoch": 6.023450586264657, + "grad_norm": 1.1868711709976196, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 8990 + }, + { + "epoch": 6.030150753768845, + "grad_norm": 1.2984495162963867, + "learning_rate": 0.0002, + "loss": 1.1377, + "step": 9000 + }, + { + "epoch": 6.0368509212730315, + "grad_norm": 1.1147589683532715, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 9010 + }, + { + "epoch": 6.043551088777219, + "grad_norm": 1.3128414154052734, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 9020 + }, + { + "epoch": 6.050251256281407, + "grad_norm": 1.068290114402771, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 9030 + }, + { + "epoch": 6.056951423785595, + "grad_norm": 1.1890562772750854, + "learning_rate": 0.0002, + "loss": 1.1764, + "step": 9040 + }, + { + "epoch": 6.063651591289783, + "grad_norm": 1.2121573686599731, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 9050 + }, + { + "epoch": 6.0703517587939695, + "grad_norm": 1.0860483646392822, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 9060 + }, + { + "epoch": 6.077051926298157, + "grad_norm": 1.1214599609375, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 9070 + }, + { + "epoch": 6.083752093802345, + "grad_norm": 1.147580862045288, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 9080 + }, + { + "epoch": 6.090452261306533, + "grad_norm": 1.3233155012130737, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 9090 + }, + { + "epoch": 6.097152428810721, + "grad_norm": 1.1869080066680908, + "learning_rate": 0.0002, + "loss": 1.1017, + "step": 9100 + }, + { + "epoch": 6.1038525963149075, + "grad_norm": 1.1695014238357544, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 9110 + }, + { + "epoch": 6.110552763819095, + "grad_norm": 1.1982251405715942, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 9120 + }, + { + "epoch": 6.117252931323283, + "grad_norm": 1.1426950693130493, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 9130 + }, + { + "epoch": 6.123953098827471, + "grad_norm": 1.2257394790649414, + "learning_rate": 0.0002, + "loss": 1.0801, + "step": 9140 + }, + { + "epoch": 6.130653266331659, + "grad_norm": 1.2932263612747192, + "learning_rate": 0.0002, + "loss": 1.1209, + "step": 9150 + }, + { + "epoch": 6.1373534338358455, + "grad_norm": 1.2617030143737793, + "learning_rate": 0.0002, + "loss": 1.0934, + "step": 9160 + }, + { + "epoch": 6.144053601340033, + "grad_norm": 1.1201422214508057, + "learning_rate": 0.0002, + "loss": 1.0551, + "step": 9170 + }, + { + "epoch": 6.150753768844221, + "grad_norm": 0.9625319838523865, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9180 + }, + { + "epoch": 6.157453936348409, + "grad_norm": 1.0290048122406006, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 9190 + }, + { + "epoch": 6.164154103852597, + "grad_norm": 1.1137803792953491, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 9200 + }, + { + "epoch": 6.1708542713567835, + "grad_norm": 1.3674522638320923, + "learning_rate": 0.0002, + "loss": 1.1211, + "step": 9210 + }, + { + "epoch": 6.177554438860971, + "grad_norm": 1.182207703590393, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 9220 + }, + { + "epoch": 6.184254606365159, + "grad_norm": 1.0496711730957031, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 9230 + }, + { + "epoch": 6.190954773869347, + "grad_norm": 1.1899489164352417, + "learning_rate": 0.0002, + "loss": 1.0666, + "step": 9240 + }, + { + "epoch": 6.197654941373535, + "grad_norm": 1.2666147947311401, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 9250 + }, + { + "epoch": 6.204355108877722, + "grad_norm": 1.2013030052185059, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 9260 + }, + { + "epoch": 6.211055276381909, + "grad_norm": 1.3049768209457397, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 9270 + }, + { + "epoch": 6.217755443886097, + "grad_norm": 1.1733006238937378, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 9280 + }, + { + "epoch": 6.224455611390285, + "grad_norm": 1.2742516994476318, + "learning_rate": 0.0002, + "loss": 1.0933, + "step": 9290 + }, + { + "epoch": 6.231155778894473, + "grad_norm": 1.110198974609375, + "learning_rate": 0.0002, + "loss": 1.1028, + "step": 9300 + }, + { + "epoch": 6.23785594639866, + "grad_norm": 1.159963607788086, + "learning_rate": 0.0002, + "loss": 1.1619, + "step": 9310 + }, + { + "epoch": 6.244556113902847, + "grad_norm": 1.302216649055481, + "learning_rate": 0.0002, + "loss": 1.0716, + "step": 9320 + }, + { + "epoch": 6.251256281407035, + "grad_norm": 1.2134063243865967, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 9330 + }, + { + "epoch": 6.257956448911223, + "grad_norm": 1.062682867050171, + "learning_rate": 0.0002, + "loss": 1.2151, + "step": 9340 + }, + { + "epoch": 6.264656616415411, + "grad_norm": 1.1568971872329712, + "learning_rate": 0.0002, + "loss": 1.148, + "step": 9350 + }, + { + "epoch": 6.271356783919598, + "grad_norm": 0.9914957880973816, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 9360 + }, + { + "epoch": 6.278056951423785, + "grad_norm": 1.017250895500183, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 9370 + }, + { + "epoch": 6.284757118927973, + "grad_norm": 1.1862834692001343, + "learning_rate": 0.0002, + "loss": 1.2177, + "step": 9380 + }, + { + "epoch": 6.291457286432161, + "grad_norm": 1.2834911346435547, + "learning_rate": 0.0002, + "loss": 0.9994, + "step": 9390 + }, + { + "epoch": 6.298157453936349, + "grad_norm": 1.3306856155395508, + "learning_rate": 0.0002, + "loss": 1.0922, + "step": 9400 + }, + { + "epoch": 6.304857621440536, + "grad_norm": 1.12908136844635, + "learning_rate": 0.0002, + "loss": 1.1136, + "step": 9410 + }, + { + "epoch": 6.311557788944723, + "grad_norm": 1.2157351970672607, + "learning_rate": 0.0002, + "loss": 1.1406, + "step": 9420 + }, + { + "epoch": 6.318257956448911, + "grad_norm": 1.121882677078247, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 9430 + }, + { + "epoch": 6.324958123953099, + "grad_norm": 1.3144481182098389, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 9440 + }, + { + "epoch": 6.331658291457287, + "grad_norm": 1.1946896314620972, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 9450 + }, + { + "epoch": 6.338358458961474, + "grad_norm": 1.1289668083190918, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 9460 + }, + { + "epoch": 6.345058626465661, + "grad_norm": 1.1065658330917358, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9470 + }, + { + "epoch": 6.351758793969849, + "grad_norm": 1.0881422758102417, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 9480 + }, + { + "epoch": 6.358458961474037, + "grad_norm": 1.242676854133606, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 9490 + }, + { + "epoch": 6.365159128978225, + "grad_norm": 0.9650855660438538, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 9500 + }, + { + "epoch": 6.371859296482412, + "grad_norm": 1.2845722436904907, + "learning_rate": 0.0002, + "loss": 1.0763, + "step": 9510 + }, + { + "epoch": 6.3785594639865995, + "grad_norm": 1.0327043533325195, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 9520 + }, + { + "epoch": 6.385259631490787, + "grad_norm": 1.0780898332595825, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 9530 + }, + { + "epoch": 6.391959798994975, + "grad_norm": 1.4934027194976807, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 9540 + }, + { + "epoch": 6.398659966499163, + "grad_norm": 0.9882908463478088, + "learning_rate": 0.0002, + "loss": 1.1546, + "step": 9550 + }, + { + "epoch": 6.40536013400335, + "grad_norm": 1.3250664472579956, + "learning_rate": 0.0002, + "loss": 1.1145, + "step": 9560 + }, + { + "epoch": 6.4120603015075375, + "grad_norm": 1.1888482570648193, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 9570 + }, + { + "epoch": 6.418760469011725, + "grad_norm": 1.136496901512146, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 9580 + }, + { + "epoch": 6.425460636515913, + "grad_norm": 1.161360502243042, + "learning_rate": 0.0002, + "loss": 1.1674, + "step": 9590 + }, + { + "epoch": 6.432160804020101, + "grad_norm": 1.2034236192703247, + "learning_rate": 0.0002, + "loss": 1.1293, + "step": 9600 + }, + { + "epoch": 6.438860971524288, + "grad_norm": 1.0268361568450928, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9610 + }, + { + "epoch": 6.4455611390284755, + "grad_norm": 1.2132930755615234, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 9620 + }, + { + "epoch": 6.452261306532663, + "grad_norm": 1.0773013830184937, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 9630 + }, + { + "epoch": 6.458961474036851, + "grad_norm": 1.3848375082015991, + "learning_rate": 0.0002, + "loss": 1.0822, + "step": 9640 + }, + { + "epoch": 6.465661641541039, + "grad_norm": 1.110495924949646, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 9650 + }, + { + "epoch": 6.472361809045226, + "grad_norm": 1.118093729019165, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 9660 + }, + { + "epoch": 6.4790619765494135, + "grad_norm": 1.2611900568008423, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 9670 + }, + { + "epoch": 6.485762144053601, + "grad_norm": 0.971754252910614, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 9680 + }, + { + "epoch": 6.492462311557789, + "grad_norm": 1.2615419626235962, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 9690 + }, + { + "epoch": 6.499162479061977, + "grad_norm": 1.1370900869369507, + "learning_rate": 0.0002, + "loss": 1.1412, + "step": 9700 + }, + { + "epoch": 6.505862646566165, + "grad_norm": 1.1815906763076782, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 9710 + }, + { + "epoch": 6.5125628140703515, + "grad_norm": 1.3424339294433594, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 9720 + }, + { + "epoch": 6.519262981574539, + "grad_norm": 1.2858397960662842, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 9730 + }, + { + "epoch": 6.525963149078727, + "grad_norm": 0.9578179121017456, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 9740 + }, + { + "epoch": 6.532663316582915, + "grad_norm": 1.3105167150497437, + "learning_rate": 0.0002, + "loss": 1.1805, + "step": 9750 + }, + { + "epoch": 6.539363484087103, + "grad_norm": 1.0586575269699097, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 9760 + }, + { + "epoch": 6.54606365159129, + "grad_norm": 1.2122068405151367, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 9770 + }, + { + "epoch": 6.552763819095477, + "grad_norm": 1.3088626861572266, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 9780 + }, + { + "epoch": 6.559463986599665, + "grad_norm": 1.194122076034546, + "learning_rate": 0.0002, + "loss": 1.1067, + "step": 9790 + }, + { + "epoch": 6.566164154103853, + "grad_norm": 1.1508387327194214, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 9800 + }, + { + "epoch": 6.572864321608041, + "grad_norm": 1.109228253364563, + "learning_rate": 0.0002, + "loss": 1.1694, + "step": 9810 + }, + { + "epoch": 6.579564489112228, + "grad_norm": 1.1607427597045898, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 9820 + }, + { + "epoch": 6.586264656616415, + "grad_norm": 1.174089789390564, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 9830 + }, + { + "epoch": 6.592964824120603, + "grad_norm": 1.1739521026611328, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 9840 + }, + { + "epoch": 6.599664991624791, + "grad_norm": 1.098528504371643, + "learning_rate": 0.0002, + "loss": 1.155, + "step": 9850 + }, + { + "epoch": 6.606365159128979, + "grad_norm": 1.0397740602493286, + "learning_rate": 0.0002, + "loss": 1.1359, + "step": 9860 + }, + { + "epoch": 6.613065326633166, + "grad_norm": 1.1087969541549683, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 9870 + }, + { + "epoch": 6.619765494137353, + "grad_norm": 1.2070481777191162, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 9880 + }, + { + "epoch": 6.626465661641541, + "grad_norm": 1.1115655899047852, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 9890 + }, + { + "epoch": 6.633165829145729, + "grad_norm": 1.2486097812652588, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 9900 + }, + { + "epoch": 6.639865996649917, + "grad_norm": 1.230380654335022, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 9910 + }, + { + "epoch": 6.646566164154104, + "grad_norm": 1.1479365825653076, + "learning_rate": 0.0002, + "loss": 1.1862, + "step": 9920 + }, + { + "epoch": 6.653266331658291, + "grad_norm": 1.0790960788726807, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 9930 + }, + { + "epoch": 6.659966499162479, + "grad_norm": 1.1157397031784058, + "learning_rate": 0.0002, + "loss": 1.2001, + "step": 9940 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.3104028701782227, + "learning_rate": 0.0002, + "loss": 1.1085, + "step": 9950 + }, + { + "epoch": 6.673366834170855, + "grad_norm": 1.1727646589279175, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 9960 + }, + { + "epoch": 6.680067001675042, + "grad_norm": 1.2104284763336182, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 9970 + }, + { + "epoch": 6.686767169179229, + "grad_norm": 1.2023727893829346, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 9980 + }, + { + "epoch": 6.693467336683417, + "grad_norm": 1.0088225603103638, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 9990 + }, + { + "epoch": 6.700167504187605, + "grad_norm": 1.298015832901001, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 10000 + }, + { + "epoch": 6.706867671691793, + "grad_norm": 1.1315910816192627, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 10010 + }, + { + "epoch": 6.71356783919598, + "grad_norm": 1.1283273696899414, + "learning_rate": 0.0002, + "loss": 1.1679, + "step": 10020 + }, + { + "epoch": 6.720268006700167, + "grad_norm": 1.2564418315887451, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 10030 + }, + { + "epoch": 6.726968174204355, + "grad_norm": 1.0451353788375854, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 10040 + }, + { + "epoch": 6.733668341708543, + "grad_norm": 1.054793357849121, + "learning_rate": 0.0002, + "loss": 1.1905, + "step": 10050 + }, + { + "epoch": 6.740368509212731, + "grad_norm": 1.2741243839263916, + "learning_rate": 0.0002, + "loss": 1.1814, + "step": 10060 + }, + { + "epoch": 6.747068676716918, + "grad_norm": 1.1342514753341675, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 10070 + }, + { + "epoch": 6.7537688442211055, + "grad_norm": 1.0081498622894287, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 10080 + }, + { + "epoch": 6.760469011725293, + "grad_norm": 1.2164603471755981, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 10090 + }, + { + "epoch": 6.767169179229481, + "grad_norm": 1.2062463760375977, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 10100 + }, + { + "epoch": 6.773869346733669, + "grad_norm": 1.2255526781082153, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 10110 + }, + { + "epoch": 6.780569514237856, + "grad_norm": 1.08175790309906, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 10120 + }, + { + "epoch": 6.7872696817420435, + "grad_norm": 1.5781128406524658, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 10130 + }, + { + "epoch": 6.793969849246231, + "grad_norm": 1.0622451305389404, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 10140 + }, + { + "epoch": 6.800670016750419, + "grad_norm": 1.1591497659683228, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 10150 + }, + { + "epoch": 6.807370184254607, + "grad_norm": 1.0398483276367188, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 10160 + }, + { + "epoch": 6.814070351758794, + "grad_norm": 1.229132056236267, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 10170 + }, + { + "epoch": 6.8207705192629815, + "grad_norm": 1.0918090343475342, + "learning_rate": 0.0002, + "loss": 1.1789, + "step": 10180 + }, + { + "epoch": 6.827470686767169, + "grad_norm": 1.1543749570846558, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 10190 + }, + { + "epoch": 6.834170854271357, + "grad_norm": 1.1831817626953125, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 10200 + }, + { + "epoch": 6.840871021775545, + "grad_norm": 1.305327296257019, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 10210 + }, + { + "epoch": 6.847571189279732, + "grad_norm": 1.136720061302185, + "learning_rate": 0.0002, + "loss": 1.2037, + "step": 10220 + }, + { + "epoch": 6.8542713567839195, + "grad_norm": 1.2282346487045288, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 10230 + }, + { + "epoch": 6.860971524288107, + "grad_norm": 1.2457010746002197, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 10240 + }, + { + "epoch": 6.867671691792295, + "grad_norm": 1.2808631658554077, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 10250 + }, + { + "epoch": 6.874371859296483, + "grad_norm": 1.089066743850708, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 10260 + }, + { + "epoch": 6.88107202680067, + "grad_norm": 0.9543178081512451, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 10270 + }, + { + "epoch": 6.8877721943048575, + "grad_norm": 1.1149744987487793, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 10280 + }, + { + "epoch": 6.894472361809045, + "grad_norm": 1.0185538530349731, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 10290 + }, + { + "epoch": 6.901172529313233, + "grad_norm": 0.9954617619514465, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 10300 + }, + { + "epoch": 6.907872696817421, + "grad_norm": 1.2581418752670288, + "learning_rate": 0.0002, + "loss": 1.1524, + "step": 10310 + }, + { + "epoch": 6.914572864321608, + "grad_norm": 1.2430983781814575, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 10320 + }, + { + "epoch": 6.921273031825796, + "grad_norm": 1.4937270879745483, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 10330 + }, + { + "epoch": 6.927973199329983, + "grad_norm": 1.1257144212722778, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 10340 + }, + { + "epoch": 6.934673366834171, + "grad_norm": 1.2068904638290405, + "learning_rate": 0.0002, + "loss": 1.2622, + "step": 10350 + }, + { + "epoch": 6.941373534338359, + "grad_norm": 1.0290757417678833, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 10360 + }, + { + "epoch": 6.948073701842546, + "grad_norm": 1.0070724487304688, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 10370 + }, + { + "epoch": 6.954773869346734, + "grad_norm": 0.9936357140541077, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 10380 + }, + { + "epoch": 6.961474036850921, + "grad_norm": 1.1063416004180908, + "learning_rate": 0.0002, + "loss": 1.2305, + "step": 10390 + }, + { + "epoch": 6.968174204355109, + "grad_norm": 1.5199986696243286, + "learning_rate": 0.0002, + "loss": 1.154, + "step": 10400 + }, + { + "epoch": 6.974874371859297, + "grad_norm": 1.160731554031372, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 10410 + }, + { + "epoch": 6.981574539363484, + "grad_norm": 1.084697961807251, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 10420 + }, + { + "epoch": 6.988274706867672, + "grad_norm": 1.1257576942443848, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 10430 + }, + { + "epoch": 6.994974874371859, + "grad_norm": 1.310616135597229, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 10440 + }, + { + "epoch": 6.99966499162479, + "eval_loss": 2.1203012466430664, + "eval_runtime": 37.936, + "eval_samples_per_second": 13.576, + "eval_steps_per_second": 1.713, + "step": 10447 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.834869071000371e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-10447/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab67859fd10c7865ae28b3816b5b9066ddb6cba0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eef98d6245d2695d63ded5e3f460682daac0c7695520431d04d3d01b5fc9f15 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..245cffb7b85a0008e22b6b24b80a62813244d76e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:849cf840d239789f3f568974fb379b6ddc17b287e99687426e6c464e93ef2bfc +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e92612298d3997192dc6db60274a0a76167c2922 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b830f407e0023c6a767e982fcc0b9c16621f5f1933411e9c416dd8b1e8cd59 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dec1ae9003899f214c493782eb2f9090beb6b746 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5845c9f2080be96d77ec57c3dc3a5e4a524b0af908d8d720010419b1753faf3 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f39e7e95996f652f99a41c8de3df54d67a93298 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/trainer_state.json @@ -0,0 +1,8448 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 7.997319932998325, + "eval_steps": 10, + "global_step": 11936, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + }, + { + "epoch": 3.0016750418760467, + "grad_norm": 0.41169142723083496, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 4480 + }, + { + "epoch": 3.0083752093802345, + "grad_norm": 0.4865207374095917, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4490 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5462028384208679, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4500 + }, + { + "epoch": 3.0217755443886096, + "grad_norm": 0.6169732809066772, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 4510 + }, + { + "epoch": 3.0284757118927974, + "grad_norm": 0.5667954087257385, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4520 + }, + { + "epoch": 3.0351758793969847, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 4530 + }, + { + "epoch": 3.0418760469011725, + "grad_norm": 0.5220064520835876, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4540 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.5469558835029602, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 4550 + }, + { + "epoch": 3.0552763819095476, + "grad_norm": 0.5680848956108093, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 4560 + }, + { + "epoch": 3.0619765494137354, + "grad_norm": 0.5906574726104736, + "learning_rate": 0.0002, + "loss": 1.5187, + "step": 4570 + }, + { + "epoch": 3.0686767169179228, + "grad_norm": 0.4725631773471832, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4580 + }, + { + "epoch": 3.0753768844221105, + "grad_norm": 0.5273477435112, + "learning_rate": 0.0002, + "loss": 1.5083, + "step": 4590 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.5861203074455261, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 4600 + }, + { + "epoch": 3.0887772194304857, + "grad_norm": 0.5343965291976929, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 4610 + }, + { + "epoch": 3.0954773869346734, + "grad_norm": 0.5348150730133057, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4620 + }, + { + "epoch": 3.102177554438861, + "grad_norm": 0.5971846580505371, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4630 + }, + { + "epoch": 3.1088777219430486, + "grad_norm": 0.5203177332878113, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4640 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.55289226770401, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4650 + }, + { + "epoch": 3.1222780569514237, + "grad_norm": 0.6878530979156494, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4660 + }, + { + "epoch": 3.1289782244556115, + "grad_norm": 0.6173256635665894, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 4670 + }, + { + "epoch": 3.135678391959799, + "grad_norm": 0.536796510219574, + "learning_rate": 0.0002, + "loss": 1.51, + "step": 4680 + }, + { + "epoch": 3.1423785594639866, + "grad_norm": 0.58846116065979, + "learning_rate": 0.0002, + "loss": 1.4713, + "step": 4690 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.645889401435852, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 4700 + }, + { + "epoch": 3.1557788944723617, + "grad_norm": 0.6118691563606262, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 4710 + }, + { + "epoch": 3.1624790619765495, + "grad_norm": 0.5189669132232666, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4720 + }, + { + "epoch": 3.169179229480737, + "grad_norm": 0.5794713497161865, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4730 + }, + { + "epoch": 3.1758793969849246, + "grad_norm": 0.6579326391220093, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 4740 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.5822742581367493, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 4750 + }, + { + "epoch": 3.1892797319932997, + "grad_norm": 0.5475956201553345, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 4760 + }, + { + "epoch": 3.1959798994974875, + "grad_norm": 0.6743834018707275, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4770 + }, + { + "epoch": 3.202680067001675, + "grad_norm": 0.6110585927963257, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4780 + }, + { + "epoch": 3.2093802345058626, + "grad_norm": 0.5426181554794312, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4790 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6077824234962463, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 4800 + }, + { + "epoch": 3.2227805695142377, + "grad_norm": 0.5785858631134033, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 4810 + }, + { + "epoch": 3.2294807370184255, + "grad_norm": 0.6425958275794983, + "learning_rate": 0.0002, + "loss": 1.4041, + "step": 4820 + }, + { + "epoch": 3.236180904522613, + "grad_norm": 0.6607080698013306, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 4830 + }, + { + "epoch": 3.2428810720268006, + "grad_norm": 0.5385788679122925, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4840 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.5630403757095337, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 4850 + }, + { + "epoch": 3.2562814070351758, + "grad_norm": 0.6340779662132263, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 4860 + }, + { + "epoch": 3.2629815745393635, + "grad_norm": 0.5305342674255371, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4870 + }, + { + "epoch": 3.2696817420435513, + "grad_norm": 0.597670316696167, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 4880 + }, + { + "epoch": 3.2763819095477387, + "grad_norm": 0.665553867816925, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4890 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.579767644405365, + "learning_rate": 0.0002, + "loss": 1.4607, + "step": 4900 + }, + { + "epoch": 3.289782244556114, + "grad_norm": 0.5512481331825256, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 4910 + }, + { + "epoch": 3.2964824120603016, + "grad_norm": 0.5916532278060913, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 4920 + }, + { + "epoch": 3.3031825795644894, + "grad_norm": 0.7521726489067078, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 4930 + }, + { + "epoch": 3.3098827470686767, + "grad_norm": 0.5352797508239746, + "learning_rate": 0.0002, + "loss": 1.4223, + "step": 4940 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.5950371623039246, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4950 + }, + { + "epoch": 3.323283082077052, + "grad_norm": 0.8020477890968323, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 4960 + }, + { + "epoch": 3.3299832495812396, + "grad_norm": 0.6790024638175964, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4970 + }, + { + "epoch": 3.3366834170854274, + "grad_norm": 0.687627375125885, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4980 + }, + { + "epoch": 3.3433835845896147, + "grad_norm": 0.6094385385513306, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 4990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.6541242003440857, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 5000 + }, + { + "epoch": 3.35678391959799, + "grad_norm": 0.5560880303382874, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.3634840871021776, + "grad_norm": 0.5440094470977783, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 5020 + }, + { + "epoch": 3.3701842546063654, + "grad_norm": 0.5749301314353943, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5030 + }, + { + "epoch": 3.3768844221105527, + "grad_norm": 0.5919716954231262, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 5040 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.6331481337547302, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5050 + }, + { + "epoch": 3.390284757118928, + "grad_norm": 0.5687161684036255, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 5060 + }, + { + "epoch": 3.3969849246231156, + "grad_norm": 0.6718577742576599, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5070 + }, + { + "epoch": 3.4036850921273034, + "grad_norm": 0.5089324116706848, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 5080 + }, + { + "epoch": 3.4103852596314908, + "grad_norm": 0.5710174441337585, + "learning_rate": 0.0002, + "loss": 1.512, + "step": 5090 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6670721173286438, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5100 + }, + { + "epoch": 3.423785594639866, + "grad_norm": 0.6875665187835693, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 5110 + }, + { + "epoch": 3.4304857621440537, + "grad_norm": 0.5375880599021912, + "learning_rate": 0.0002, + "loss": 1.4496, + "step": 5120 + }, + { + "epoch": 3.4371859296482414, + "grad_norm": 0.6550399661064148, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 5130 + }, + { + "epoch": 3.4438860971524288, + "grad_norm": 0.5948067903518677, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 5140 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.6134477257728577, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5150 + }, + { + "epoch": 3.457286432160804, + "grad_norm": 0.6506398320198059, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 5160 + }, + { + "epoch": 3.4639865996649917, + "grad_norm": 0.6060147881507874, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 5170 + }, + { + "epoch": 3.4706867671691795, + "grad_norm": 0.6173806190490723, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 5180 + }, + { + "epoch": 3.477386934673367, + "grad_norm": 0.6032607555389404, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 5190 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5652492046356201, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 5200 + }, + { + "epoch": 3.490787269681742, + "grad_norm": 0.6168607473373413, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 5210 + }, + { + "epoch": 3.4974874371859297, + "grad_norm": 0.6170629262924194, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 5220 + }, + { + "epoch": 3.5041876046901175, + "grad_norm": 0.6926297545433044, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 5230 + }, + { + "epoch": 3.510887772194305, + "grad_norm": 0.6702437996864319, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 5240 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.5421436429023743, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 5250 + }, + { + "epoch": 3.52428810720268, + "grad_norm": 0.5726765990257263, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 5260 + }, + { + "epoch": 3.5309882747068677, + "grad_norm": 0.5685455203056335, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 3.5376884422110555, + "grad_norm": 0.6018396019935608, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 5280 + }, + { + "epoch": 3.544388609715243, + "grad_norm": 0.5731932520866394, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 5290 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5300 + }, + { + "epoch": 3.557788944723618, + "grad_norm": 0.5545530319213867, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5310 + }, + { + "epoch": 3.5644891122278057, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5320 + }, + { + "epoch": 3.5711892797319935, + "grad_norm": 0.5651767253875732, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5330 + }, + { + "epoch": 3.577889447236181, + "grad_norm": 0.7425084114074707, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 5340 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5770602226257324, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 5350 + }, + { + "epoch": 3.591289782244556, + "grad_norm": 0.54723060131073, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 5360 + }, + { + "epoch": 3.5979899497487438, + "grad_norm": 0.6658238172531128, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 5370 + }, + { + "epoch": 3.6046901172529315, + "grad_norm": 0.5787645578384399, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5380 + }, + { + "epoch": 3.611390284757119, + "grad_norm": 0.594913125038147, + "learning_rate": 0.0002, + "loss": 1.5343, + "step": 5390 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.4964977502822876, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5400 + }, + { + "epoch": 3.624790619765494, + "grad_norm": 0.6087527275085449, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 5410 + }, + { + "epoch": 3.6314907872696818, + "grad_norm": 0.6315323710441589, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 5420 + }, + { + "epoch": 3.6381909547738696, + "grad_norm": 0.574799120426178, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 5430 + }, + { + "epoch": 3.644891122278057, + "grad_norm": 0.5949277877807617, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 5440 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.5640677213668823, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 5450 + }, + { + "epoch": 3.658291457286432, + "grad_norm": 0.6198237538337708, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 5460 + }, + { + "epoch": 3.66499162479062, + "grad_norm": 0.6902034878730774, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 5470 + }, + { + "epoch": 3.6716917922948076, + "grad_norm": 0.5686674118041992, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5480 + }, + { + "epoch": 3.678391959798995, + "grad_norm": 0.6532107591629028, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 5490 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 5500 + }, + { + "epoch": 3.69179229480737, + "grad_norm": 0.6055065393447876, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5510 + }, + { + "epoch": 3.698492462311558, + "grad_norm": 0.5630605816841125, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 5520 + }, + { + "epoch": 3.7051926298157456, + "grad_norm": 0.6005825996398926, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 5530 + }, + { + "epoch": 3.711892797319933, + "grad_norm": 0.6553038954734802, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 5540 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5601094961166382, + "learning_rate": 0.0002, + "loss": 1.4943, + "step": 5550 + }, + { + "epoch": 3.725293132328308, + "grad_norm": 0.6598808169364929, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 5560 + }, + { + "epoch": 3.731993299832496, + "grad_norm": 0.5506255626678467, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 5570 + }, + { + "epoch": 3.7386934673366836, + "grad_norm": 0.6001223921775818, + "learning_rate": 0.0002, + "loss": 1.4805, + "step": 5580 + }, + { + "epoch": 3.745393634840871, + "grad_norm": 0.6287297606468201, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 5590 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.6253238916397095, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 5600 + }, + { + "epoch": 3.758793969849246, + "grad_norm": 0.5713174939155579, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 5610 + }, + { + "epoch": 3.765494137353434, + "grad_norm": 0.6198310852050781, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5620 + }, + { + "epoch": 3.7721943048576216, + "grad_norm": 0.5941224098205566, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 5630 + }, + { + "epoch": 3.778894472361809, + "grad_norm": 0.606002151966095, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 5640 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.6540704965591431, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 5650 + }, + { + "epoch": 3.792294807370184, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5660 + }, + { + "epoch": 3.798994974874372, + "grad_norm": 0.5649605393409729, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5670 + }, + { + "epoch": 3.8056951423785597, + "grad_norm": 0.6788773536682129, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 5680 + }, + { + "epoch": 3.812395309882747, + "grad_norm": 0.6581860780715942, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 5690 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.5529348850250244, + "learning_rate": 0.0002, + "loss": 1.4587, + "step": 5700 + }, + { + "epoch": 3.825795644891122, + "grad_norm": 0.6320232152938843, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 5710 + }, + { + "epoch": 3.83249581239531, + "grad_norm": 0.6529698371887207, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 5720 + }, + { + "epoch": 3.8391959798994977, + "grad_norm": 0.5983362793922424, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5730 + }, + { + "epoch": 3.845896147403685, + "grad_norm": 0.6335684061050415, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 5740 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.700446605682373, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5750 + }, + { + "epoch": 3.85929648241206, + "grad_norm": 0.6092597842216492, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 5760 + }, + { + "epoch": 3.865996649916248, + "grad_norm": 0.564146101474762, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 5770 + }, + { + "epoch": 3.8726968174204357, + "grad_norm": 0.615275502204895, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 5780 + }, + { + "epoch": 3.879396984924623, + "grad_norm": 0.6685376763343811, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 5790 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5800 + }, + { + "epoch": 3.892797319932998, + "grad_norm": 0.5486813187599182, + "learning_rate": 0.0002, + "loss": 1.5179, + "step": 5810 + }, + { + "epoch": 3.899497487437186, + "grad_norm": 0.6208204030990601, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 5820 + }, + { + "epoch": 3.9061976549413737, + "grad_norm": 0.6500625014305115, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 5830 + }, + { + "epoch": 3.912897822445561, + "grad_norm": 0.5948089361190796, + "learning_rate": 0.0002, + "loss": 1.4716, + "step": 5840 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.7210732698440552, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 5850 + }, + { + "epoch": 3.926298157453936, + "grad_norm": 0.6662322878837585, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 5860 + }, + { + "epoch": 3.932998324958124, + "grad_norm": 0.5613839626312256, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5870 + }, + { + "epoch": 3.9396984924623117, + "grad_norm": 0.6069002151489258, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5880 + }, + { + "epoch": 3.946398659966499, + "grad_norm": 0.7075562477111816, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 5890 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.6316173076629639, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 5900 + }, + { + "epoch": 3.959798994974874, + "grad_norm": 0.5716308355331421, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 5910 + }, + { + "epoch": 3.966499162479062, + "grad_norm": 0.6800096035003662, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5920 + }, + { + "epoch": 3.9731993299832498, + "grad_norm": 0.6057983040809631, + "learning_rate": 0.0002, + "loss": 1.5189, + "step": 5930 + }, + { + "epoch": 3.979899497487437, + "grad_norm": 0.5938987731933594, + "learning_rate": 0.0002, + "loss": 1.5431, + "step": 5940 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.6963576674461365, + "learning_rate": 0.0002, + "loss": 1.5111, + "step": 5950 + }, + { + "epoch": 3.993299832495812, + "grad_norm": 0.6279940009117126, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 5960 + }, + { + "epoch": 4.0, + "grad_norm": 0.7161159515380859, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 5970 + }, + { + "epoch": 4.0, + "eval_loss": 1.8655421733856201, + "eval_runtime": 37.9276, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.714, + "step": 5970 + }, + { + "epoch": 4.006700167504188, + "grad_norm": 0.7380476593971252, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 5980 + }, + { + "epoch": 4.013400335008376, + "grad_norm": 0.7148947715759277, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 5990 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.6177082657814026, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 6000 + }, + { + "epoch": 4.02680067001675, + "grad_norm": 0.8552946448326111, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 6010 + }, + { + "epoch": 4.033500837520938, + "grad_norm": 0.8033416271209717, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 6020 + }, + { + "epoch": 4.040201005025126, + "grad_norm": 0.8501318097114563, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 6030 + }, + { + "epoch": 4.046901172529314, + "grad_norm": 0.6981393098831177, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6040 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.7227180600166321, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 6050 + }, + { + "epoch": 4.060301507537688, + "grad_norm": 0.6923989653587341, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 6060 + }, + { + "epoch": 4.067001675041876, + "grad_norm": 0.879779040813446, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 6070 + }, + { + "epoch": 4.073701842546064, + "grad_norm": 0.8184754848480225, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 6080 + }, + { + "epoch": 4.080402010050252, + "grad_norm": 0.8211342692375183, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 6090 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.7542396783828735, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 6100 + }, + { + "epoch": 4.093802345058626, + "grad_norm": 0.6631066799163818, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 6110 + }, + { + "epoch": 4.100502512562814, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 6120 + }, + { + "epoch": 4.107202680067002, + "grad_norm": 0.681851863861084, + "learning_rate": 0.0002, + "loss": 1.3443, + "step": 6130 + }, + { + "epoch": 4.11390284757119, + "grad_norm": 0.8757794499397278, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 6140 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.6567301750183105, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 6150 + }, + { + "epoch": 4.127303182579564, + "grad_norm": 0.7950329184532166, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 6160 + }, + { + "epoch": 4.134003350083752, + "grad_norm": 0.7545644044876099, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6170 + }, + { + "epoch": 4.14070351758794, + "grad_norm": 0.7172710299491882, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 6180 + }, + { + "epoch": 4.147403685092128, + "grad_norm": 0.7040584087371826, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6190 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7482913732528687, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 6200 + }, + { + "epoch": 4.160804020100502, + "grad_norm": 0.8523276448249817, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 6210 + }, + { + "epoch": 4.16750418760469, + "grad_norm": 0.6672041416168213, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 6220 + }, + { + "epoch": 4.174204355108878, + "grad_norm": 0.7523500919342041, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 6230 + }, + { + "epoch": 4.180904522613066, + "grad_norm": 0.8085253834724426, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 6240 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.789450466632843, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 6250 + }, + { + "epoch": 4.19430485762144, + "grad_norm": 0.7502310872077942, + "learning_rate": 0.0002, + "loss": 1.3539, + "step": 6260 + }, + { + "epoch": 4.201005025125628, + "grad_norm": 0.7397456765174866, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 6270 + }, + { + "epoch": 4.207705192629816, + "grad_norm": 0.6921947002410889, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6280 + }, + { + "epoch": 4.214405360134004, + "grad_norm": 0.9334571957588196, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6290 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.725799024105072, + "learning_rate": 0.0002, + "loss": 1.3612, + "step": 6300 + }, + { + "epoch": 4.227805695142378, + "grad_norm": 0.8290495872497559, + "learning_rate": 0.0002, + "loss": 1.4217, + "step": 6310 + }, + { + "epoch": 4.234505862646566, + "grad_norm": 0.688983678817749, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 6320 + }, + { + "epoch": 4.241206030150754, + "grad_norm": 0.8620913028717041, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 6330 + }, + { + "epoch": 4.247906197654942, + "grad_norm": 0.8008657693862915, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6340 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7379199266433716, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6350 + }, + { + "epoch": 4.261306532663316, + "grad_norm": 0.7842815518379211, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 6360 + }, + { + "epoch": 4.268006700167504, + "grad_norm": 0.812600314617157, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 6370 + }, + { + "epoch": 4.274706867671692, + "grad_norm": 0.7852841019630432, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 6380 + }, + { + "epoch": 4.28140703517588, + "grad_norm": 1.0377534627914429, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 6390 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 1.03935706615448, + "learning_rate": 0.0002, + "loss": 1.3755, + "step": 6400 + }, + { + "epoch": 4.294807370184254, + "grad_norm": 0.7244732975959778, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 6410 + }, + { + "epoch": 4.301507537688442, + "grad_norm": 0.7137406468391418, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 6420 + }, + { + "epoch": 4.30820770519263, + "grad_norm": 0.7492543458938599, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 6430 + }, + { + "epoch": 4.314907872696818, + "grad_norm": 0.7065439224243164, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 6440 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.7786989808082581, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 6450 + }, + { + "epoch": 4.328308207705192, + "grad_norm": 0.7369208335876465, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6460 + }, + { + "epoch": 4.33500837520938, + "grad_norm": 0.7412346005439758, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 6470 + }, + { + "epoch": 4.341708542713568, + "grad_norm": 0.780927300453186, + "learning_rate": 0.0002, + "loss": 1.4087, + "step": 6480 + }, + { + "epoch": 4.348408710217756, + "grad_norm": 0.8320930600166321, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 6490 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.6871094703674316, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6500 + }, + { + "epoch": 4.36180904522613, + "grad_norm": 0.6751559972763062, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6510 + }, + { + "epoch": 4.368509212730318, + "grad_norm": 0.7723976969718933, + "learning_rate": 0.0002, + "loss": 1.4311, + "step": 6520 + }, + { + "epoch": 4.375209380234506, + "grad_norm": 0.7915401458740234, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 6530 + }, + { + "epoch": 4.381909547738694, + "grad_norm": 0.7329102754592896, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 6540 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.7388760447502136, + "learning_rate": 0.0002, + "loss": 1.447, + "step": 6550 + }, + { + "epoch": 4.3953098827470685, + "grad_norm": 0.8282579183578491, + "learning_rate": 0.0002, + "loss": 1.4378, + "step": 6560 + }, + { + "epoch": 4.402010050251256, + "grad_norm": 0.7192724347114563, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6570 + }, + { + "epoch": 4.408710217755444, + "grad_norm": 0.746526837348938, + "learning_rate": 0.0002, + "loss": 1.4141, + "step": 6580 + }, + { + "epoch": 4.415410385259632, + "grad_norm": 0.8738046288490295, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 6590 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.8408458828926086, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 6600 + }, + { + "epoch": 4.4288107202680065, + "grad_norm": 0.8110666275024414, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 6610 + }, + { + "epoch": 4.435510887772194, + "grad_norm": 0.8602406978607178, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 6620 + }, + { + "epoch": 4.442211055276382, + "grad_norm": 0.7549102902412415, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6630 + }, + { + "epoch": 4.44891122278057, + "grad_norm": 0.7831804156303406, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 6640 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.7269673943519592, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6650 + }, + { + "epoch": 4.4623115577889445, + "grad_norm": 0.7397838830947876, + "learning_rate": 0.0002, + "loss": 1.4132, + "step": 6660 + }, + { + "epoch": 4.469011725293132, + "grad_norm": 0.713707447052002, + "learning_rate": 0.0002, + "loss": 1.3174, + "step": 6670 + }, + { + "epoch": 4.47571189279732, + "grad_norm": 0.7525581121444702, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 6680 + }, + { + "epoch": 4.482412060301508, + "grad_norm": 0.8030191659927368, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6690 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.7469439506530762, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 6700 + }, + { + "epoch": 4.4958123953098825, + "grad_norm": 0.7743868231773376, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 6710 + }, + { + "epoch": 4.50251256281407, + "grad_norm": 0.6539737582206726, + "learning_rate": 0.0002, + "loss": 1.3439, + "step": 6720 + }, + { + "epoch": 4.509212730318258, + "grad_norm": 0.825818657875061, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 6730 + }, + { + "epoch": 4.515912897822446, + "grad_norm": 0.8048575520515442, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 6740 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.7828766107559204, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6750 + }, + { + "epoch": 4.5293132328308205, + "grad_norm": 0.7406010031700134, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 6760 + }, + { + "epoch": 4.536013400335008, + "grad_norm": 0.840345561504364, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 6770 + }, + { + "epoch": 4.542713567839196, + "grad_norm": 0.8492622971534729, + "learning_rate": 0.0002, + "loss": 1.4808, + "step": 6780 + }, + { + "epoch": 4.549413735343384, + "grad_norm": 0.7130163908004761, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 6790 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 0.8454728126525879, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 6800 + }, + { + "epoch": 4.562814070351759, + "grad_norm": 0.7847645282745361, + "learning_rate": 0.0002, + "loss": 1.3239, + "step": 6810 + }, + { + "epoch": 4.569514237855946, + "grad_norm": 0.7245864272117615, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 6820 + }, + { + "epoch": 4.576214405360134, + "grad_norm": 0.768893301486969, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 6830 + }, + { + "epoch": 4.582914572864322, + "grad_norm": 0.8028400540351868, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6840 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.763945460319519, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 6850 + }, + { + "epoch": 4.596314907872697, + "grad_norm": 0.7417685389518738, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 6860 + }, + { + "epoch": 4.603015075376884, + "grad_norm": 0.7603038549423218, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 6870 + }, + { + "epoch": 4.609715242881072, + "grad_norm": 0.7981528043746948, + "learning_rate": 0.0002, + "loss": 1.4095, + "step": 6880 + }, + { + "epoch": 4.61641541038526, + "grad_norm": 0.8077111840248108, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6890 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.8778454065322876, + "learning_rate": 0.0002, + "loss": 1.4721, + "step": 6900 + }, + { + "epoch": 4.629815745393635, + "grad_norm": 0.8620710372924805, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6910 + }, + { + "epoch": 4.636515912897822, + "grad_norm": 0.7486072778701782, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 6920 + }, + { + "epoch": 4.64321608040201, + "grad_norm": 0.7493042945861816, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 6930 + }, + { + "epoch": 4.649916247906198, + "grad_norm": 0.7388978600502014, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 6940 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.798530638217926, + "learning_rate": 0.0002, + "loss": 1.3593, + "step": 6950 + }, + { + "epoch": 4.663316582914573, + "grad_norm": 0.7929500937461853, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 6960 + }, + { + "epoch": 4.67001675041876, + "grad_norm": 0.9186785221099854, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 6970 + }, + { + "epoch": 4.676716917922948, + "grad_norm": 1.1103485822677612, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 6980 + }, + { + "epoch": 4.683417085427136, + "grad_norm": 0.8000466823577881, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 6990 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7520599961280823, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 7000 + }, + { + "epoch": 4.696817420435511, + "grad_norm": 0.7971973419189453, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 7010 + }, + { + "epoch": 4.703517587939698, + "grad_norm": 0.7363343834877014, + "learning_rate": 0.0002, + "loss": 1.3682, + "step": 7020 + }, + { + "epoch": 4.710217755443886, + "grad_norm": 0.8268865942955017, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 7030 + }, + { + "epoch": 4.716917922948074, + "grad_norm": 0.7054963111877441, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 7040 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.8196262121200562, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7050 + }, + { + "epoch": 4.730318257956449, + "grad_norm": 0.8276031017303467, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 7060 + }, + { + "epoch": 4.7370184254606365, + "grad_norm": 0.8248157501220703, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 7070 + }, + { + "epoch": 4.743718592964824, + "grad_norm": 0.8937979936599731, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 7080 + }, + { + "epoch": 4.750418760469012, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 7090 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.9495313763618469, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 7100 + }, + { + "epoch": 4.763819095477387, + "grad_norm": 0.8598204255104065, + "learning_rate": 0.0002, + "loss": 1.4504, + "step": 7110 + }, + { + "epoch": 4.7705192629815745, + "grad_norm": 0.8951472640037537, + "learning_rate": 0.0002, + "loss": 1.3969, + "step": 7120 + }, + { + "epoch": 4.777219430485762, + "grad_norm": 0.9110309481620789, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 7130 + }, + { + "epoch": 4.78391959798995, + "grad_norm": 0.7929584980010986, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 7140 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7415322661399841, + "learning_rate": 0.0002, + "loss": 1.467, + "step": 7150 + }, + { + "epoch": 4.797319932998325, + "grad_norm": 0.7504757046699524, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 7160 + }, + { + "epoch": 4.8040201005025125, + "grad_norm": 0.7166924476623535, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 7170 + }, + { + "epoch": 4.8107202680067, + "grad_norm": 0.7728400826454163, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 7180 + }, + { + "epoch": 4.817420435510888, + "grad_norm": 0.7992154955863953, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 7190 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.8655321002006531, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 7200 + }, + { + "epoch": 4.830820770519263, + "grad_norm": 0.7672632336616516, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 7210 + }, + { + "epoch": 4.8375209380234505, + "grad_norm": 0.708416223526001, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7220 + }, + { + "epoch": 4.844221105527638, + "grad_norm": 0.8914081454277039, + "learning_rate": 0.0002, + "loss": 1.5413, + "step": 7230 + }, + { + "epoch": 4.850921273031826, + "grad_norm": 0.7141931653022766, + "learning_rate": 0.0002, + "loss": 1.3569, + "step": 7240 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.6913040280342102, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 7250 + }, + { + "epoch": 4.864321608040201, + "grad_norm": 0.7871233820915222, + "learning_rate": 0.0002, + "loss": 1.3912, + "step": 7260 + }, + { + "epoch": 4.8710217755443885, + "grad_norm": 0.8466277122497559, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 7270 + }, + { + "epoch": 4.877721943048576, + "grad_norm": 0.8492183685302734, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 7280 + }, + { + "epoch": 4.884422110552764, + "grad_norm": 0.8339574933052063, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 7290 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.787022590637207, + "learning_rate": 0.0002, + "loss": 1.4157, + "step": 7300 + }, + { + "epoch": 4.897822445561139, + "grad_norm": 0.8877332806587219, + "learning_rate": 0.0002, + "loss": 1.3725, + "step": 7310 + }, + { + "epoch": 4.9045226130653266, + "grad_norm": 0.744989812374115, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 7320 + }, + { + "epoch": 4.911222780569514, + "grad_norm": 0.8027268648147583, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 7330 + }, + { + "epoch": 4.917922948073702, + "grad_norm": 0.6437455415725708, + "learning_rate": 0.0002, + "loss": 1.425, + "step": 7340 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.685999870300293, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 7350 + }, + { + "epoch": 4.931323283082077, + "grad_norm": 0.9086187481880188, + "learning_rate": 0.0002, + "loss": 1.4352, + "step": 7360 + }, + { + "epoch": 4.938023450586265, + "grad_norm": 0.8272411227226257, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 7370 + }, + { + "epoch": 4.944723618090452, + "grad_norm": 0.9227852821350098, + "learning_rate": 0.0002, + "loss": 1.4226, + "step": 7380 + }, + { + "epoch": 4.95142378559464, + "grad_norm": 0.7688441276550293, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 7390 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8662643432617188, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 7400 + }, + { + "epoch": 4.964824120603015, + "grad_norm": 0.9234127998352051, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 7410 + }, + { + "epoch": 4.971524288107203, + "grad_norm": 0.9131470918655396, + "learning_rate": 0.0002, + "loss": 1.4009, + "step": 7420 + }, + { + "epoch": 4.97822445561139, + "grad_norm": 0.7377504110336304, + "learning_rate": 0.0002, + "loss": 1.4544, + "step": 7430 + }, + { + "epoch": 4.984924623115578, + "grad_norm": 0.8762801289558411, + "learning_rate": 0.0002, + "loss": 1.4008, + "step": 7440 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7919872999191284, + "learning_rate": 0.0002, + "loss": 1.4304, + "step": 7450 + }, + { + "epoch": 4.998324958123953, + "grad_norm": 0.7144299149513245, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 7460 + }, + { + "epoch": 4.99966499162479, + "eval_loss": 1.9291157722473145, + "eval_runtime": 37.9831, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 7462 + }, + { + "epoch": 5.005025125628141, + "grad_norm": 0.7860151529312134, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 7470 + }, + { + "epoch": 5.011725293132328, + "grad_norm": 0.9418314695358276, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 7480 + }, + { + "epoch": 5.018425460636516, + "grad_norm": 0.8474572896957397, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 7490 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 1.0724040269851685, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 7500 + }, + { + "epoch": 5.031825795644891, + "grad_norm": 0.9109148979187012, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 7510 + }, + { + "epoch": 5.038525963149079, + "grad_norm": 1.0088659524917603, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 7520 + }, + { + "epoch": 5.045226130653266, + "grad_norm": 1.1421623229980469, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 7530 + }, + { + "epoch": 5.051926298157454, + "grad_norm": 0.9219902157783508, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 7540 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 7550 + }, + { + "epoch": 5.065326633165829, + "grad_norm": 0.8889328241348267, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 7560 + }, + { + "epoch": 5.072026800670017, + "grad_norm": 0.9751363396644592, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 7570 + }, + { + "epoch": 5.078726968174204, + "grad_norm": 0.8603123426437378, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 7580 + }, + { + "epoch": 5.085427135678392, + "grad_norm": 0.8910616636276245, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 7590 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.1128392219543457, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 7600 + }, + { + "epoch": 5.098827470686767, + "grad_norm": 0.9480258822441101, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 7610 + }, + { + "epoch": 5.105527638190955, + "grad_norm": 0.906958818435669, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 7620 + }, + { + "epoch": 5.1122278056951425, + "grad_norm": 0.8741167187690735, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 7630 + }, + { + "epoch": 5.11892797319933, + "grad_norm": 0.966268002986908, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 7640 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 0.9124358892440796, + "learning_rate": 0.0002, + "loss": 1.2782, + "step": 7650 + }, + { + "epoch": 5.132328308207705, + "grad_norm": 1.0436606407165527, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 7660 + }, + { + "epoch": 5.139028475711893, + "grad_norm": 0.9217309355735779, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 7670 + }, + { + "epoch": 5.1457286432160805, + "grad_norm": 1.344765543937683, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7680 + }, + { + "epoch": 5.152428810720268, + "grad_norm": 1.0730723142623901, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 7690 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9321247339248657, + "learning_rate": 0.0002, + "loss": 1.1888, + "step": 7700 + }, + { + "epoch": 5.165829145728643, + "grad_norm": 0.8482614755630493, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 7710 + }, + { + "epoch": 5.172529313232831, + "grad_norm": 0.8274452686309814, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 7720 + }, + { + "epoch": 5.1792294807370185, + "grad_norm": 0.9120376706123352, + "learning_rate": 0.0002, + "loss": 1.1972, + "step": 7730 + }, + { + "epoch": 5.185929648241206, + "grad_norm": 1.0062892436981201, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 7740 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 0.9521504640579224, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 7750 + }, + { + "epoch": 5.199329983249581, + "grad_norm": 0.8800198435783386, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 7760 + }, + { + "epoch": 5.206030150753769, + "grad_norm": 0.9749179482460022, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 7770 + }, + { + "epoch": 5.2127303182579565, + "grad_norm": 0.9441686868667603, + "learning_rate": 0.0002, + "loss": 1.2975, + "step": 7780 + }, + { + "epoch": 5.219430485762144, + "grad_norm": 0.9114066362380981, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 7790 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 0.9851446151733398, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 7800 + }, + { + "epoch": 5.232830820770519, + "grad_norm": 0.9526297450065613, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7810 + }, + { + "epoch": 5.239530988274707, + "grad_norm": 1.05986487865448, + "learning_rate": 0.0002, + "loss": 1.1502, + "step": 7820 + }, + { + "epoch": 5.2462311557788945, + "grad_norm": 0.8956538438796997, + "learning_rate": 0.0002, + "loss": 1.2517, + "step": 7830 + }, + { + "epoch": 5.252931323283082, + "grad_norm": 0.9568153619766235, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 7840 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.0035018920898438, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 7850 + }, + { + "epoch": 5.266331658291457, + "grad_norm": 0.8554368615150452, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 7860 + }, + { + "epoch": 5.273031825795645, + "grad_norm": 0.9677708148956299, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 7870 + }, + { + "epoch": 5.279731993299833, + "grad_norm": 0.943606436252594, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 7880 + }, + { + "epoch": 5.28643216080402, + "grad_norm": 1.0029335021972656, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 7890 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 1.0164015293121338, + "learning_rate": 0.0002, + "loss": 1.2494, + "step": 7900 + }, + { + "epoch": 5.299832495812395, + "grad_norm": 0.8908365368843079, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 7910 + }, + { + "epoch": 5.306532663316583, + "grad_norm": 0.9307826161384583, + "learning_rate": 0.0002, + "loss": 1.2832, + "step": 7920 + }, + { + "epoch": 5.313232830820771, + "grad_norm": 1.0730371475219727, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 7930 + }, + { + "epoch": 5.319932998324958, + "grad_norm": 0.844739556312561, + "learning_rate": 0.0002, + "loss": 1.2003, + "step": 7940 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 1.275833010673523, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 7950 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9042661190032959, + "learning_rate": 0.0002, + "loss": 1.2957, + "step": 7960 + }, + { + "epoch": 5.340033500837521, + "grad_norm": 0.9374269247055054, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 7970 + }, + { + "epoch": 5.346733668341709, + "grad_norm": 1.033098578453064, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 7980 + }, + { + "epoch": 5.353433835845896, + "grad_norm": 1.062775731086731, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 7990 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.1064317226409912, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 8000 + }, + { + "epoch": 5.366834170854271, + "grad_norm": 1.1114039421081543, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 8010 + }, + { + "epoch": 5.373534338358459, + "grad_norm": 1.0198014974594116, + "learning_rate": 0.0002, + "loss": 1.2255, + "step": 8020 + }, + { + "epoch": 5.380234505862647, + "grad_norm": 0.8443173170089722, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 8030 + }, + { + "epoch": 5.386934673366834, + "grad_norm": 1.000881314277649, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 8040 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.9874443411827087, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 8050 + }, + { + "epoch": 5.400335008375209, + "grad_norm": 0.9895344972610474, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 8060 + }, + { + "epoch": 5.407035175879397, + "grad_norm": 0.8595236539840698, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 8070 + }, + { + "epoch": 5.413735343383585, + "grad_norm": 0.9523849487304688, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 8080 + }, + { + "epoch": 5.420435510887772, + "grad_norm": 1.0560476779937744, + "learning_rate": 0.0002, + "loss": 1.2343, + "step": 8090 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0893689393997192, + "learning_rate": 0.0002, + "loss": 1.2956, + "step": 8100 + }, + { + "epoch": 5.433835845896147, + "grad_norm": 0.9395513534545898, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 8110 + }, + { + "epoch": 5.440536013400335, + "grad_norm": 0.9364215135574341, + "learning_rate": 0.0002, + "loss": 1.3444, + "step": 8120 + }, + { + "epoch": 5.447236180904523, + "grad_norm": 0.9502208232879639, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 8130 + }, + { + "epoch": 5.45393634840871, + "grad_norm": 0.9559208154678345, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8140 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9261730313301086, + "learning_rate": 0.0002, + "loss": 1.2495, + "step": 8150 + }, + { + "epoch": 5.467336683417085, + "grad_norm": 0.9832326173782349, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 8160 + }, + { + "epoch": 5.474036850921273, + "grad_norm": 1.065953016281128, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 8170 + }, + { + "epoch": 5.480737018425461, + "grad_norm": 0.9139469861984253, + "learning_rate": 0.0002, + "loss": 1.3617, + "step": 8180 + }, + { + "epoch": 5.4874371859296485, + "grad_norm": 1.2322484254837036, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 8190 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.9722974896430969, + "learning_rate": 0.0002, + "loss": 1.2879, + "step": 8200 + }, + { + "epoch": 5.500837520938023, + "grad_norm": 0.9338926076889038, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 8210 + }, + { + "epoch": 5.507537688442211, + "grad_norm": 0.9283728003501892, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 8220 + }, + { + "epoch": 5.514237855946399, + "grad_norm": 1.0489585399627686, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 8230 + }, + { + "epoch": 5.5209380234505865, + "grad_norm": 0.9881814122200012, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 8240 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9274460077285767, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 8250 + }, + { + "epoch": 5.534338358458961, + "grad_norm": 0.8650718331336975, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 8260 + }, + { + "epoch": 5.541038525963149, + "grad_norm": 1.014069676399231, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 8270 + }, + { + "epoch": 5.547738693467337, + "grad_norm": 0.9212974905967712, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 8280 + }, + { + "epoch": 5.5544388609715245, + "grad_norm": 1.1235398054122925, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 8290 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.961954653263092, + "learning_rate": 0.0002, + "loss": 1.306, + "step": 8300 + }, + { + "epoch": 5.567839195979899, + "grad_norm": 0.9386700391769409, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 8310 + }, + { + "epoch": 5.574539363484087, + "grad_norm": 1.01912522315979, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 8320 + }, + { + "epoch": 5.581239530988275, + "grad_norm": 0.9851216077804565, + "learning_rate": 0.0002, + "loss": 1.3121, + "step": 8330 + }, + { + "epoch": 5.5879396984924625, + "grad_norm": 1.0138001441955566, + "learning_rate": 0.0002, + "loss": 1.3071, + "step": 8340 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9262447357177734, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 8350 + }, + { + "epoch": 5.601340033500837, + "grad_norm": 1.1322970390319824, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 8360 + }, + { + "epoch": 5.608040201005025, + "grad_norm": 1.1429349184036255, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 8370 + }, + { + "epoch": 5.614740368509213, + "grad_norm": 0.9130118489265442, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 8380 + }, + { + "epoch": 5.6214405360134005, + "grad_norm": 0.9651545882225037, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 8390 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.9595398306846619, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 8400 + }, + { + "epoch": 5.634840871021775, + "grad_norm": 1.0049372911453247, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 8410 + }, + { + "epoch": 5.641541038525963, + "grad_norm": 1.082804560661316, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 8420 + }, + { + "epoch": 5.648241206030151, + "grad_norm": 0.9489204287528992, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8430 + }, + { + "epoch": 5.654941373534339, + "grad_norm": 0.9470235109329224, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 8440 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 1.0662927627563477, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 8450 + }, + { + "epoch": 5.668341708542713, + "grad_norm": 0.9097877740859985, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 8460 + }, + { + "epoch": 5.675041876046901, + "grad_norm": 0.9740368127822876, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 8470 + }, + { + "epoch": 5.681742043551089, + "grad_norm": 0.9878810048103333, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 8480 + }, + { + "epoch": 5.688442211055277, + "grad_norm": 1.148260474205017, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 8490 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9632558822631836, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 8500 + }, + { + "epoch": 5.701842546063651, + "grad_norm": 0.876812756061554, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 8510 + }, + { + "epoch": 5.708542713567839, + "grad_norm": 1.0730829238891602, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 8520 + }, + { + "epoch": 5.715242881072027, + "grad_norm": 1.2239218950271606, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 8530 + }, + { + "epoch": 5.721943048576215, + "grad_norm": 0.9460835456848145, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 8540 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9086270928382874, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 8550 + }, + { + "epoch": 5.735343383584589, + "grad_norm": 1.0258867740631104, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8560 + }, + { + "epoch": 5.742043551088777, + "grad_norm": 1.0543923377990723, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 8570 + }, + { + "epoch": 5.748743718592965, + "grad_norm": 0.9063900113105774, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 8580 + }, + { + "epoch": 5.755443886097153, + "grad_norm": 1.1838830709457397, + "learning_rate": 0.0002, + "loss": 1.3535, + "step": 8590 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9631859064102173, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 8600 + }, + { + "epoch": 5.768844221105527, + "grad_norm": 0.9702655673027039, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 8610 + }, + { + "epoch": 5.775544388609715, + "grad_norm": 1.0591435432434082, + "learning_rate": 0.0002, + "loss": 1.3196, + "step": 8620 + }, + { + "epoch": 5.782244556113903, + "grad_norm": 0.9989570379257202, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 8630 + }, + { + "epoch": 5.788944723618091, + "grad_norm": 1.0836435556411743, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 8640 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 0.8832896947860718, + "learning_rate": 0.0002, + "loss": 1.3334, + "step": 8650 + }, + { + "epoch": 5.802345058626465, + "grad_norm": 1.0104607343673706, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 8660 + }, + { + "epoch": 5.809045226130653, + "grad_norm": 0.8375084400177002, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 8670 + }, + { + "epoch": 5.815745393634841, + "grad_norm": 1.1300716400146484, + "learning_rate": 0.0002, + "loss": 1.3554, + "step": 8680 + }, + { + "epoch": 5.822445561139029, + "grad_norm": 0.9311910271644592, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 8690 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9488391876220703, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 8700 + }, + { + "epoch": 5.835845896147403, + "grad_norm": 0.9747629761695862, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 8710 + }, + { + "epoch": 5.842546063651591, + "grad_norm": 1.1029598712921143, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 8720 + }, + { + "epoch": 5.849246231155779, + "grad_norm": 1.0396875143051147, + "learning_rate": 0.0002, + "loss": 1.3613, + "step": 8730 + }, + { + "epoch": 5.855946398659967, + "grad_norm": 0.9259780645370483, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 8740 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.020033597946167, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 8750 + }, + { + "epoch": 5.869346733668341, + "grad_norm": 0.9191218614578247, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 8760 + }, + { + "epoch": 5.876046901172529, + "grad_norm": 1.1093107461929321, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 8770 + }, + { + "epoch": 5.882747068676717, + "grad_norm": 1.1626793146133423, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 8780 + }, + { + "epoch": 5.889447236180905, + "grad_norm": 0.9542945027351379, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 8790 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.9086058139801025, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 8800 + }, + { + "epoch": 5.902847571189279, + "grad_norm": 0.9249639511108398, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 8810 + }, + { + "epoch": 5.909547738693467, + "grad_norm": 0.9414396286010742, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 8820 + }, + { + "epoch": 5.916247906197655, + "grad_norm": 0.9086037874221802, + "learning_rate": 0.0002, + "loss": 1.2865, + "step": 8830 + }, + { + "epoch": 5.922948073701843, + "grad_norm": 0.8685907125473022, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 8840 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.036419153213501, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8850 + }, + { + "epoch": 5.936348408710217, + "grad_norm": 1.0183674097061157, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 8860 + }, + { + "epoch": 5.943048576214405, + "grad_norm": 0.966444194316864, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 8870 + }, + { + "epoch": 5.949748743718593, + "grad_norm": 1.125693917274475, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 8880 + }, + { + "epoch": 5.956448911222781, + "grad_norm": 0.9857436418533325, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 8890 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 0.9377069473266602, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 8900 + }, + { + "epoch": 5.969849246231155, + "grad_norm": 0.9493814706802368, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 8910 + }, + { + "epoch": 5.976549413735343, + "grad_norm": 0.8806208372116089, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 8920 + }, + { + "epoch": 5.983249581239531, + "grad_norm": 0.8727600574493408, + "learning_rate": 0.0002, + "loss": 1.2558, + "step": 8930 + }, + { + "epoch": 5.989949748743719, + "grad_norm": 0.9799810647964478, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 8940 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.9866513609886169, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 8950 + }, + { + "epoch": 6.0, + "eval_loss": 2.0282373428344727, + "eval_runtime": 38.0375, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 8955 + }, + { + "epoch": 6.0033500837520934, + "grad_norm": 0.8747885227203369, + "learning_rate": 0.0002, + "loss": 1.1768, + "step": 8960 + }, + { + "epoch": 6.010050251256281, + "grad_norm": 1.2512741088867188, + "learning_rate": 0.0002, + "loss": 1.0677, + "step": 8970 + }, + { + "epoch": 6.016750418760469, + "grad_norm": 1.06855309009552, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 8980 + }, + { + "epoch": 6.023450586264657, + "grad_norm": 1.1868711709976196, + "learning_rate": 0.0002, + "loss": 1.1382, + "step": 8990 + }, + { + "epoch": 6.030150753768845, + "grad_norm": 1.2984495162963867, + "learning_rate": 0.0002, + "loss": 1.1377, + "step": 9000 + }, + { + "epoch": 6.0368509212730315, + "grad_norm": 1.1147589683532715, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 9010 + }, + { + "epoch": 6.043551088777219, + "grad_norm": 1.3128414154052734, + "learning_rate": 0.0002, + "loss": 1.1244, + "step": 9020 + }, + { + "epoch": 6.050251256281407, + "grad_norm": 1.068290114402771, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 9030 + }, + { + "epoch": 6.056951423785595, + "grad_norm": 1.1890562772750854, + "learning_rate": 0.0002, + "loss": 1.1764, + "step": 9040 + }, + { + "epoch": 6.063651591289783, + "grad_norm": 1.2121573686599731, + "learning_rate": 0.0002, + "loss": 1.1239, + "step": 9050 + }, + { + "epoch": 6.0703517587939695, + "grad_norm": 1.0860483646392822, + "learning_rate": 0.0002, + "loss": 1.0683, + "step": 9060 + }, + { + "epoch": 6.077051926298157, + "grad_norm": 1.1214599609375, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 9070 + }, + { + "epoch": 6.083752093802345, + "grad_norm": 1.147580862045288, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 9080 + }, + { + "epoch": 6.090452261306533, + "grad_norm": 1.3233155012130737, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 9090 + }, + { + "epoch": 6.097152428810721, + "grad_norm": 1.1869080066680908, + "learning_rate": 0.0002, + "loss": 1.1017, + "step": 9100 + }, + { + "epoch": 6.1038525963149075, + "grad_norm": 1.1695014238357544, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 9110 + }, + { + "epoch": 6.110552763819095, + "grad_norm": 1.1982251405715942, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 9120 + }, + { + "epoch": 6.117252931323283, + "grad_norm": 1.1426950693130493, + "learning_rate": 0.0002, + "loss": 1.0805, + "step": 9130 + }, + { + "epoch": 6.123953098827471, + "grad_norm": 1.2257394790649414, + "learning_rate": 0.0002, + "loss": 1.0801, + "step": 9140 + }, + { + "epoch": 6.130653266331659, + "grad_norm": 1.2932263612747192, + "learning_rate": 0.0002, + "loss": 1.1209, + "step": 9150 + }, + { + "epoch": 6.1373534338358455, + "grad_norm": 1.2617030143737793, + "learning_rate": 0.0002, + "loss": 1.0934, + "step": 9160 + }, + { + "epoch": 6.144053601340033, + "grad_norm": 1.1201422214508057, + "learning_rate": 0.0002, + "loss": 1.0551, + "step": 9170 + }, + { + "epoch": 6.150753768844221, + "grad_norm": 0.9625319838523865, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9180 + }, + { + "epoch": 6.157453936348409, + "grad_norm": 1.0290048122406006, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 9190 + }, + { + "epoch": 6.164154103852597, + "grad_norm": 1.1137803792953491, + "learning_rate": 0.0002, + "loss": 1.1257, + "step": 9200 + }, + { + "epoch": 6.1708542713567835, + "grad_norm": 1.3674522638320923, + "learning_rate": 0.0002, + "loss": 1.1211, + "step": 9210 + }, + { + "epoch": 6.177554438860971, + "grad_norm": 1.182207703590393, + "learning_rate": 0.0002, + "loss": 1.0947, + "step": 9220 + }, + { + "epoch": 6.184254606365159, + "grad_norm": 1.0496711730957031, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 9230 + }, + { + "epoch": 6.190954773869347, + "grad_norm": 1.1899489164352417, + "learning_rate": 0.0002, + "loss": 1.0666, + "step": 9240 + }, + { + "epoch": 6.197654941373535, + "grad_norm": 1.2666147947311401, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 9250 + }, + { + "epoch": 6.204355108877722, + "grad_norm": 1.2013030052185059, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 9260 + }, + { + "epoch": 6.211055276381909, + "grad_norm": 1.3049768209457397, + "learning_rate": 0.0002, + "loss": 1.151, + "step": 9270 + }, + { + "epoch": 6.217755443886097, + "grad_norm": 1.1733006238937378, + "learning_rate": 0.0002, + "loss": 1.0741, + "step": 9280 + }, + { + "epoch": 6.224455611390285, + "grad_norm": 1.2742516994476318, + "learning_rate": 0.0002, + "loss": 1.0933, + "step": 9290 + }, + { + "epoch": 6.231155778894473, + "grad_norm": 1.110198974609375, + "learning_rate": 0.0002, + "loss": 1.1028, + "step": 9300 + }, + { + "epoch": 6.23785594639866, + "grad_norm": 1.159963607788086, + "learning_rate": 0.0002, + "loss": 1.1619, + "step": 9310 + }, + { + "epoch": 6.244556113902847, + "grad_norm": 1.302216649055481, + "learning_rate": 0.0002, + "loss": 1.0716, + "step": 9320 + }, + { + "epoch": 6.251256281407035, + "grad_norm": 1.2134063243865967, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 9330 + }, + { + "epoch": 6.257956448911223, + "grad_norm": 1.062682867050171, + "learning_rate": 0.0002, + "loss": 1.2151, + "step": 9340 + }, + { + "epoch": 6.264656616415411, + "grad_norm": 1.1568971872329712, + "learning_rate": 0.0002, + "loss": 1.148, + "step": 9350 + }, + { + "epoch": 6.271356783919598, + "grad_norm": 0.9914957880973816, + "learning_rate": 0.0002, + "loss": 1.094, + "step": 9360 + }, + { + "epoch": 6.278056951423785, + "grad_norm": 1.017250895500183, + "learning_rate": 0.0002, + "loss": 1.125, + "step": 9370 + }, + { + "epoch": 6.284757118927973, + "grad_norm": 1.1862834692001343, + "learning_rate": 0.0002, + "loss": 1.2177, + "step": 9380 + }, + { + "epoch": 6.291457286432161, + "grad_norm": 1.2834911346435547, + "learning_rate": 0.0002, + "loss": 0.9994, + "step": 9390 + }, + { + "epoch": 6.298157453936349, + "grad_norm": 1.3306856155395508, + "learning_rate": 0.0002, + "loss": 1.0922, + "step": 9400 + }, + { + "epoch": 6.304857621440536, + "grad_norm": 1.12908136844635, + "learning_rate": 0.0002, + "loss": 1.1136, + "step": 9410 + }, + { + "epoch": 6.311557788944723, + "grad_norm": 1.2157351970672607, + "learning_rate": 0.0002, + "loss": 1.1406, + "step": 9420 + }, + { + "epoch": 6.318257956448911, + "grad_norm": 1.121882677078247, + "learning_rate": 0.0002, + "loss": 1.1388, + "step": 9430 + }, + { + "epoch": 6.324958123953099, + "grad_norm": 1.3144481182098389, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 9440 + }, + { + "epoch": 6.331658291457287, + "grad_norm": 1.1946896314620972, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 9450 + }, + { + "epoch": 6.338358458961474, + "grad_norm": 1.1289668083190918, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 9460 + }, + { + "epoch": 6.345058626465661, + "grad_norm": 1.1065658330917358, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9470 + }, + { + "epoch": 6.351758793969849, + "grad_norm": 1.0881422758102417, + "learning_rate": 0.0002, + "loss": 1.1431, + "step": 9480 + }, + { + "epoch": 6.358458961474037, + "grad_norm": 1.242676854133606, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 9490 + }, + { + "epoch": 6.365159128978225, + "grad_norm": 0.9650855660438538, + "learning_rate": 0.0002, + "loss": 1.1379, + "step": 9500 + }, + { + "epoch": 6.371859296482412, + "grad_norm": 1.2845722436904907, + "learning_rate": 0.0002, + "loss": 1.0763, + "step": 9510 + }, + { + "epoch": 6.3785594639865995, + "grad_norm": 1.0327043533325195, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 9520 + }, + { + "epoch": 6.385259631490787, + "grad_norm": 1.0780898332595825, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 9530 + }, + { + "epoch": 6.391959798994975, + "grad_norm": 1.4934027194976807, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 9540 + }, + { + "epoch": 6.398659966499163, + "grad_norm": 0.9882908463478088, + "learning_rate": 0.0002, + "loss": 1.1546, + "step": 9550 + }, + { + "epoch": 6.40536013400335, + "grad_norm": 1.3250664472579956, + "learning_rate": 0.0002, + "loss": 1.1145, + "step": 9560 + }, + { + "epoch": 6.4120603015075375, + "grad_norm": 1.1888482570648193, + "learning_rate": 0.0002, + "loss": 1.2333, + "step": 9570 + }, + { + "epoch": 6.418760469011725, + "grad_norm": 1.136496901512146, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 9580 + }, + { + "epoch": 6.425460636515913, + "grad_norm": 1.161360502243042, + "learning_rate": 0.0002, + "loss": 1.1674, + "step": 9590 + }, + { + "epoch": 6.432160804020101, + "grad_norm": 1.2034236192703247, + "learning_rate": 0.0002, + "loss": 1.1293, + "step": 9600 + }, + { + "epoch": 6.438860971524288, + "grad_norm": 1.0268361568450928, + "learning_rate": 0.0002, + "loss": 1.1059, + "step": 9610 + }, + { + "epoch": 6.4455611390284755, + "grad_norm": 1.2132930755615234, + "learning_rate": 0.0002, + "loss": 1.1732, + "step": 9620 + }, + { + "epoch": 6.452261306532663, + "grad_norm": 1.0773013830184937, + "learning_rate": 0.0002, + "loss": 1.1329, + "step": 9630 + }, + { + "epoch": 6.458961474036851, + "grad_norm": 1.3848375082015991, + "learning_rate": 0.0002, + "loss": 1.0822, + "step": 9640 + }, + { + "epoch": 6.465661641541039, + "grad_norm": 1.110495924949646, + "learning_rate": 0.0002, + "loss": 1.1778, + "step": 9650 + }, + { + "epoch": 6.472361809045226, + "grad_norm": 1.118093729019165, + "learning_rate": 0.0002, + "loss": 1.2022, + "step": 9660 + }, + { + "epoch": 6.4790619765494135, + "grad_norm": 1.2611900568008423, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 9670 + }, + { + "epoch": 6.485762144053601, + "grad_norm": 0.971754252910614, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 9680 + }, + { + "epoch": 6.492462311557789, + "grad_norm": 1.2615419626235962, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 9690 + }, + { + "epoch": 6.499162479061977, + "grad_norm": 1.1370900869369507, + "learning_rate": 0.0002, + "loss": 1.1412, + "step": 9700 + }, + { + "epoch": 6.505862646566165, + "grad_norm": 1.1815906763076782, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 9710 + }, + { + "epoch": 6.5125628140703515, + "grad_norm": 1.3424339294433594, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 9720 + }, + { + "epoch": 6.519262981574539, + "grad_norm": 1.2858397960662842, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 9730 + }, + { + "epoch": 6.525963149078727, + "grad_norm": 0.9578179121017456, + "learning_rate": 0.0002, + "loss": 1.178, + "step": 9740 + }, + { + "epoch": 6.532663316582915, + "grad_norm": 1.3105167150497437, + "learning_rate": 0.0002, + "loss": 1.1805, + "step": 9750 + }, + { + "epoch": 6.539363484087103, + "grad_norm": 1.0586575269699097, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 9760 + }, + { + "epoch": 6.54606365159129, + "grad_norm": 1.2122068405151367, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 9770 + }, + { + "epoch": 6.552763819095477, + "grad_norm": 1.3088626861572266, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 9780 + }, + { + "epoch": 6.559463986599665, + "grad_norm": 1.194122076034546, + "learning_rate": 0.0002, + "loss": 1.1067, + "step": 9790 + }, + { + "epoch": 6.566164154103853, + "grad_norm": 1.1508387327194214, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 9800 + }, + { + "epoch": 6.572864321608041, + "grad_norm": 1.109228253364563, + "learning_rate": 0.0002, + "loss": 1.1694, + "step": 9810 + }, + { + "epoch": 6.579564489112228, + "grad_norm": 1.1607427597045898, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 9820 + }, + { + "epoch": 6.586264656616415, + "grad_norm": 1.174089789390564, + "learning_rate": 0.0002, + "loss": 1.1585, + "step": 9830 + }, + { + "epoch": 6.592964824120603, + "grad_norm": 1.1739521026611328, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 9840 + }, + { + "epoch": 6.599664991624791, + "grad_norm": 1.098528504371643, + "learning_rate": 0.0002, + "loss": 1.155, + "step": 9850 + }, + { + "epoch": 6.606365159128979, + "grad_norm": 1.0397740602493286, + "learning_rate": 0.0002, + "loss": 1.1359, + "step": 9860 + }, + { + "epoch": 6.613065326633166, + "grad_norm": 1.1087969541549683, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 9870 + }, + { + "epoch": 6.619765494137353, + "grad_norm": 1.2070481777191162, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 9880 + }, + { + "epoch": 6.626465661641541, + "grad_norm": 1.1115655899047852, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 9890 + }, + { + "epoch": 6.633165829145729, + "grad_norm": 1.2486097812652588, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 9900 + }, + { + "epoch": 6.639865996649917, + "grad_norm": 1.230380654335022, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 9910 + }, + { + "epoch": 6.646566164154104, + "grad_norm": 1.1479365825653076, + "learning_rate": 0.0002, + "loss": 1.1862, + "step": 9920 + }, + { + "epoch": 6.653266331658291, + "grad_norm": 1.0790960788726807, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 9930 + }, + { + "epoch": 6.659966499162479, + "grad_norm": 1.1157397031784058, + "learning_rate": 0.0002, + "loss": 1.2001, + "step": 9940 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.3104028701782227, + "learning_rate": 0.0002, + "loss": 1.1085, + "step": 9950 + }, + { + "epoch": 6.673366834170855, + "grad_norm": 1.1727646589279175, + "learning_rate": 0.0002, + "loss": 1.2209, + "step": 9960 + }, + { + "epoch": 6.680067001675042, + "grad_norm": 1.2104284763336182, + "learning_rate": 0.0002, + "loss": 1.1671, + "step": 9970 + }, + { + "epoch": 6.686767169179229, + "grad_norm": 1.2023727893829346, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 9980 + }, + { + "epoch": 6.693467336683417, + "grad_norm": 1.0088225603103638, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 9990 + }, + { + "epoch": 6.700167504187605, + "grad_norm": 1.298015832901001, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 10000 + }, + { + "epoch": 6.706867671691793, + "grad_norm": 1.1315910816192627, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 10010 + }, + { + "epoch": 6.71356783919598, + "grad_norm": 1.1283273696899414, + "learning_rate": 0.0002, + "loss": 1.1679, + "step": 10020 + }, + { + "epoch": 6.720268006700167, + "grad_norm": 1.2564418315887451, + "learning_rate": 0.0002, + "loss": 1.1917, + "step": 10030 + }, + { + "epoch": 6.726968174204355, + "grad_norm": 1.0451353788375854, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 10040 + }, + { + "epoch": 6.733668341708543, + "grad_norm": 1.054793357849121, + "learning_rate": 0.0002, + "loss": 1.1905, + "step": 10050 + }, + { + "epoch": 6.740368509212731, + "grad_norm": 1.2741243839263916, + "learning_rate": 0.0002, + "loss": 1.1814, + "step": 10060 + }, + { + "epoch": 6.747068676716918, + "grad_norm": 1.1342514753341675, + "learning_rate": 0.0002, + "loss": 1.2015, + "step": 10070 + }, + { + "epoch": 6.7537688442211055, + "grad_norm": 1.0081498622894287, + "learning_rate": 0.0002, + "loss": 1.2587, + "step": 10080 + }, + { + "epoch": 6.760469011725293, + "grad_norm": 1.2164603471755981, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 10090 + }, + { + "epoch": 6.767169179229481, + "grad_norm": 1.2062463760375977, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 10100 + }, + { + "epoch": 6.773869346733669, + "grad_norm": 1.2255526781082153, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 10110 + }, + { + "epoch": 6.780569514237856, + "grad_norm": 1.08175790309906, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 10120 + }, + { + "epoch": 6.7872696817420435, + "grad_norm": 1.5781128406524658, + "learning_rate": 0.0002, + "loss": 1.1983, + "step": 10130 + }, + { + "epoch": 6.793969849246231, + "grad_norm": 1.0622451305389404, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 10140 + }, + { + "epoch": 6.800670016750419, + "grad_norm": 1.1591497659683228, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 10150 + }, + { + "epoch": 6.807370184254607, + "grad_norm": 1.0398483276367188, + "learning_rate": 0.0002, + "loss": 1.2203, + "step": 10160 + }, + { + "epoch": 6.814070351758794, + "grad_norm": 1.229132056236267, + "learning_rate": 0.0002, + "loss": 1.2249, + "step": 10170 + }, + { + "epoch": 6.8207705192629815, + "grad_norm": 1.0918090343475342, + "learning_rate": 0.0002, + "loss": 1.1789, + "step": 10180 + }, + { + "epoch": 6.827470686767169, + "grad_norm": 1.1543749570846558, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 10190 + }, + { + "epoch": 6.834170854271357, + "grad_norm": 1.1831817626953125, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 10200 + }, + { + "epoch": 6.840871021775545, + "grad_norm": 1.305327296257019, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 10210 + }, + { + "epoch": 6.847571189279732, + "grad_norm": 1.136720061302185, + "learning_rate": 0.0002, + "loss": 1.2037, + "step": 10220 + }, + { + "epoch": 6.8542713567839195, + "grad_norm": 1.2282346487045288, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 10230 + }, + { + "epoch": 6.860971524288107, + "grad_norm": 1.2457010746002197, + "learning_rate": 0.0002, + "loss": 1.1281, + "step": 10240 + }, + { + "epoch": 6.867671691792295, + "grad_norm": 1.2808631658554077, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 10250 + }, + { + "epoch": 6.874371859296483, + "grad_norm": 1.089066743850708, + "learning_rate": 0.0002, + "loss": 1.2213, + "step": 10260 + }, + { + "epoch": 6.88107202680067, + "grad_norm": 0.9543178081512451, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 10270 + }, + { + "epoch": 6.8877721943048575, + "grad_norm": 1.1149744987487793, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 10280 + }, + { + "epoch": 6.894472361809045, + "grad_norm": 1.0185538530349731, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 10290 + }, + { + "epoch": 6.901172529313233, + "grad_norm": 0.9954617619514465, + "learning_rate": 0.0002, + "loss": 1.217, + "step": 10300 + }, + { + "epoch": 6.907872696817421, + "grad_norm": 1.2581418752670288, + "learning_rate": 0.0002, + "loss": 1.1524, + "step": 10310 + }, + { + "epoch": 6.914572864321608, + "grad_norm": 1.2430983781814575, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 10320 + }, + { + "epoch": 6.921273031825796, + "grad_norm": 1.4937270879745483, + "learning_rate": 0.0002, + "loss": 1.1254, + "step": 10330 + }, + { + "epoch": 6.927973199329983, + "grad_norm": 1.1257144212722778, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 10340 + }, + { + "epoch": 6.934673366834171, + "grad_norm": 1.2068904638290405, + "learning_rate": 0.0002, + "loss": 1.2622, + "step": 10350 + }, + { + "epoch": 6.941373534338359, + "grad_norm": 1.0290757417678833, + "learning_rate": 0.0002, + "loss": 1.1349, + "step": 10360 + }, + { + "epoch": 6.948073701842546, + "grad_norm": 1.0070724487304688, + "learning_rate": 0.0002, + "loss": 1.1752, + "step": 10370 + }, + { + "epoch": 6.954773869346734, + "grad_norm": 0.9936357140541077, + "learning_rate": 0.0002, + "loss": 1.1838, + "step": 10380 + }, + { + "epoch": 6.961474036850921, + "grad_norm": 1.1063416004180908, + "learning_rate": 0.0002, + "loss": 1.2305, + "step": 10390 + }, + { + "epoch": 6.968174204355109, + "grad_norm": 1.5199986696243286, + "learning_rate": 0.0002, + "loss": 1.154, + "step": 10400 + }, + { + "epoch": 6.974874371859297, + "grad_norm": 1.160731554031372, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 10410 + }, + { + "epoch": 6.981574539363484, + "grad_norm": 1.084697961807251, + "learning_rate": 0.0002, + "loss": 1.2132, + "step": 10420 + }, + { + "epoch": 6.988274706867672, + "grad_norm": 1.1257576942443848, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 10430 + }, + { + "epoch": 6.994974874371859, + "grad_norm": 1.310616135597229, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 10440 + }, + { + "epoch": 6.99966499162479, + "eval_loss": 2.1203012466430664, + "eval_runtime": 37.936, + "eval_samples_per_second": 13.576, + "eval_steps_per_second": 1.713, + "step": 10447 + }, + { + "epoch": 7.001675041876047, + "grad_norm": 1.0176491737365723, + "learning_rate": 0.0002, + "loss": 1.1118, + "step": 10450 + }, + { + "epoch": 7.008375209380235, + "grad_norm": 1.602665662765503, + "learning_rate": 0.0002, + "loss": 0.9932, + "step": 10460 + }, + { + "epoch": 7.015075376884422, + "grad_norm": 1.2909572124481201, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 10470 + }, + { + "epoch": 7.02177554438861, + "grad_norm": 1.2601855993270874, + "learning_rate": 0.0002, + "loss": 0.9398, + "step": 10480 + }, + { + "epoch": 7.028475711892797, + "grad_norm": 1.3437587022781372, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 10490 + }, + { + "epoch": 7.035175879396985, + "grad_norm": 1.2220089435577393, + "learning_rate": 0.0002, + "loss": 1.0031, + "step": 10500 + }, + { + "epoch": 7.041876046901173, + "grad_norm": 1.3392685651779175, + "learning_rate": 0.0002, + "loss": 0.9428, + "step": 10510 + }, + { + "epoch": 7.04857621440536, + "grad_norm": 1.3902767896652222, + "learning_rate": 0.0002, + "loss": 0.9566, + "step": 10520 + }, + { + "epoch": 7.055276381909548, + "grad_norm": 1.4098035097122192, + "learning_rate": 0.0002, + "loss": 0.9993, + "step": 10530 + }, + { + "epoch": 7.061976549413735, + "grad_norm": 1.38866126537323, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 10540 + }, + { + "epoch": 7.068676716917923, + "grad_norm": 1.3638999462127686, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 10550 + }, + { + "epoch": 7.075376884422111, + "grad_norm": 1.3181349039077759, + "learning_rate": 0.0002, + "loss": 0.9698, + "step": 10560 + }, + { + "epoch": 7.082077051926298, + "grad_norm": 1.2528657913208008, + "learning_rate": 0.0002, + "loss": 0.9963, + "step": 10570 + }, + { + "epoch": 7.088777219430486, + "grad_norm": 1.4326589107513428, + "learning_rate": 0.0002, + "loss": 0.9624, + "step": 10580 + }, + { + "epoch": 7.0954773869346734, + "grad_norm": 1.4312337636947632, + "learning_rate": 0.0002, + "loss": 1.0432, + "step": 10590 + }, + { + "epoch": 7.102177554438861, + "grad_norm": 1.7286990880966187, + "learning_rate": 0.0002, + "loss": 0.9704, + "step": 10600 + }, + { + "epoch": 7.108877721943049, + "grad_norm": 1.1248762607574463, + "learning_rate": 0.0002, + "loss": 0.9954, + "step": 10610 + }, + { + "epoch": 7.115577889447236, + "grad_norm": 1.278731346130371, + "learning_rate": 0.0002, + "loss": 0.9736, + "step": 10620 + }, + { + "epoch": 7.122278056951424, + "grad_norm": 1.53670072555542, + "learning_rate": 0.0002, + "loss": 0.9885, + "step": 10630 + }, + { + "epoch": 7.1289782244556115, + "grad_norm": 1.268069863319397, + "learning_rate": 0.0002, + "loss": 0.9573, + "step": 10640 + }, + { + "epoch": 7.135678391959799, + "grad_norm": 1.5072290897369385, + "learning_rate": 0.0002, + "loss": 0.9716, + "step": 10650 + }, + { + "epoch": 7.142378559463987, + "grad_norm": 1.5552845001220703, + "learning_rate": 0.0002, + "loss": 0.9871, + "step": 10660 + }, + { + "epoch": 7.149078726968174, + "grad_norm": 1.2643769979476929, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 10670 + }, + { + "epoch": 7.155778894472362, + "grad_norm": 1.347589373588562, + "learning_rate": 0.0002, + "loss": 0.9848, + "step": 10680 + }, + { + "epoch": 7.1624790619765495, + "grad_norm": 1.364700436592102, + "learning_rate": 0.0002, + "loss": 1.0193, + "step": 10690 + }, + { + "epoch": 7.169179229480737, + "grad_norm": 1.4375768899917603, + "learning_rate": 0.0002, + "loss": 1.0057, + "step": 10700 + }, + { + "epoch": 7.175879396984925, + "grad_norm": 1.426047444343567, + "learning_rate": 0.0002, + "loss": 1.0108, + "step": 10710 + }, + { + "epoch": 7.182579564489112, + "grad_norm": 1.5208740234375, + "learning_rate": 0.0002, + "loss": 0.9985, + "step": 10720 + }, + { + "epoch": 7.1892797319933, + "grad_norm": 1.4713337421417236, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 10730 + }, + { + "epoch": 7.1959798994974875, + "grad_norm": 1.3042446374893188, + "learning_rate": 0.0002, + "loss": 0.9481, + "step": 10740 + }, + { + "epoch": 7.202680067001675, + "grad_norm": 1.2290682792663574, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 10750 + }, + { + "epoch": 7.209380234505863, + "grad_norm": 1.1152390241622925, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 10760 + }, + { + "epoch": 7.21608040201005, + "grad_norm": 1.3632242679595947, + "learning_rate": 0.0002, + "loss": 0.9557, + "step": 10770 + }, + { + "epoch": 7.222780569514238, + "grad_norm": 1.2406541109085083, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 10780 + }, + { + "epoch": 7.2294807370184255, + "grad_norm": 1.1965205669403076, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 10790 + }, + { + "epoch": 7.236180904522613, + "grad_norm": 1.2895352840423584, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 10800 + }, + { + "epoch": 7.242881072026801, + "grad_norm": 1.388366937637329, + "learning_rate": 0.0002, + "loss": 0.9616, + "step": 10810 + }, + { + "epoch": 7.249581239530988, + "grad_norm": 1.1411796808242798, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 10820 + }, + { + "epoch": 7.256281407035176, + "grad_norm": 1.2220646142959595, + "learning_rate": 0.0002, + "loss": 0.942, + "step": 10830 + }, + { + "epoch": 7.2629815745393635, + "grad_norm": 1.495492696762085, + "learning_rate": 0.0002, + "loss": 1.0136, + "step": 10840 + }, + { + "epoch": 7.269681742043551, + "grad_norm": 1.1395213603973389, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 10850 + }, + { + "epoch": 7.276381909547739, + "grad_norm": 1.3826487064361572, + "learning_rate": 0.0002, + "loss": 1.0134, + "step": 10860 + }, + { + "epoch": 7.283082077051926, + "grad_norm": 1.4356474876403809, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 10870 + }, + { + "epoch": 7.289782244556114, + "grad_norm": 1.3617557287216187, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 10880 + }, + { + "epoch": 7.296482412060302, + "grad_norm": 1.585394024848938, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 10890 + }, + { + "epoch": 7.303182579564489, + "grad_norm": 1.1442821025848389, + "learning_rate": 0.0002, + "loss": 1.0204, + "step": 10900 + }, + { + "epoch": 7.309882747068677, + "grad_norm": 1.4712985754013062, + "learning_rate": 0.0002, + "loss": 0.9608, + "step": 10910 + }, + { + "epoch": 7.316582914572864, + "grad_norm": 1.2864325046539307, + "learning_rate": 0.0002, + "loss": 1.0027, + "step": 10920 + }, + { + "epoch": 7.323283082077052, + "grad_norm": 1.308010458946228, + "learning_rate": 0.0002, + "loss": 1.0341, + "step": 10930 + }, + { + "epoch": 7.32998324958124, + "grad_norm": 1.4371414184570312, + "learning_rate": 0.0002, + "loss": 1.0096, + "step": 10940 + }, + { + "epoch": 7.336683417085427, + "grad_norm": 1.5968806743621826, + "learning_rate": 0.0002, + "loss": 0.9999, + "step": 10950 + }, + { + "epoch": 7.343383584589615, + "grad_norm": 1.3592339754104614, + "learning_rate": 0.0002, + "loss": 0.9611, + "step": 10960 + }, + { + "epoch": 7.350083752093802, + "grad_norm": 1.225520133972168, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 10970 + }, + { + "epoch": 7.35678391959799, + "grad_norm": 1.3138031959533691, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 10980 + }, + { + "epoch": 7.363484087102178, + "grad_norm": 1.2601540088653564, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 10990 + }, + { + "epoch": 7.370184254606365, + "grad_norm": 1.3437113761901855, + "learning_rate": 0.0002, + "loss": 1.0508, + "step": 11000 + }, + { + "epoch": 7.376884422110553, + "grad_norm": 1.0681836605072021, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 11010 + }, + { + "epoch": 7.38358458961474, + "grad_norm": 1.415852665901184, + "learning_rate": 0.0002, + "loss": 1.0025, + "step": 11020 + }, + { + "epoch": 7.390284757118928, + "grad_norm": 1.3147039413452148, + "learning_rate": 0.0002, + "loss": 1.0532, + "step": 11030 + }, + { + "epoch": 7.396984924623116, + "grad_norm": 1.4778614044189453, + "learning_rate": 0.0002, + "loss": 1.1188, + "step": 11040 + }, + { + "epoch": 7.403685092127303, + "grad_norm": 1.3137797117233276, + "learning_rate": 0.0002, + "loss": 0.9917, + "step": 11050 + }, + { + "epoch": 7.410385259631491, + "grad_norm": 1.1917701959609985, + "learning_rate": 0.0002, + "loss": 1.0115, + "step": 11060 + }, + { + "epoch": 7.417085427135678, + "grad_norm": 1.3999699354171753, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 11070 + }, + { + "epoch": 7.423785594639866, + "grad_norm": 1.3817322254180908, + "learning_rate": 0.0002, + "loss": 1.0311, + "step": 11080 + }, + { + "epoch": 7.430485762144054, + "grad_norm": 1.2086812257766724, + "learning_rate": 0.0002, + "loss": 1.0086, + "step": 11090 + }, + { + "epoch": 7.437185929648241, + "grad_norm": 1.1938024759292603, + "learning_rate": 0.0002, + "loss": 1.0002, + "step": 11100 + }, + { + "epoch": 7.443886097152429, + "grad_norm": 1.1543669700622559, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 11110 + }, + { + "epoch": 7.450586264656616, + "grad_norm": 1.158841848373413, + "learning_rate": 0.0002, + "loss": 1.0814, + "step": 11120 + }, + { + "epoch": 7.457286432160804, + "grad_norm": 1.2777763605117798, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 11130 + }, + { + "epoch": 7.463986599664992, + "grad_norm": 1.3375903367996216, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 11140 + }, + { + "epoch": 7.4706867671691795, + "grad_norm": 1.5573794841766357, + "learning_rate": 0.0002, + "loss": 0.9995, + "step": 11150 + }, + { + "epoch": 7.477386934673367, + "grad_norm": 1.3869640827178955, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 11160 + }, + { + "epoch": 7.484087102177554, + "grad_norm": 1.310341238975525, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 11170 + }, + { + "epoch": 7.490787269681742, + "grad_norm": 1.4249892234802246, + "learning_rate": 0.0002, + "loss": 1.0365, + "step": 11180 + }, + { + "epoch": 7.49748743718593, + "grad_norm": 1.4025905132293701, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 11190 + }, + { + "epoch": 7.5041876046901175, + "grad_norm": 1.4361881017684937, + "learning_rate": 0.0002, + "loss": 1.0328, + "step": 11200 + }, + { + "epoch": 7.510887772194305, + "grad_norm": 1.1791380643844604, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 11210 + }, + { + "epoch": 7.517587939698492, + "grad_norm": 1.415075421333313, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 11220 + }, + { + "epoch": 7.52428810720268, + "grad_norm": 1.3633697032928467, + "learning_rate": 0.0002, + "loss": 1.0265, + "step": 11230 + }, + { + "epoch": 7.530988274706868, + "grad_norm": 1.2803648710250854, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 11240 + }, + { + "epoch": 7.5376884422110555, + "grad_norm": 1.4032878875732422, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 11250 + }, + { + "epoch": 7.544388609715243, + "grad_norm": 1.4507378339767456, + "learning_rate": 0.0002, + "loss": 1.032, + "step": 11260 + }, + { + "epoch": 7.55108877721943, + "grad_norm": 1.227613925933838, + "learning_rate": 0.0002, + "loss": 1.0221, + "step": 11270 + }, + { + "epoch": 7.557788944723618, + "grad_norm": 1.2620965242385864, + "learning_rate": 0.0002, + "loss": 1.0288, + "step": 11280 + }, + { + "epoch": 7.564489112227806, + "grad_norm": 1.3917821645736694, + "learning_rate": 0.0002, + "loss": 1.1263, + "step": 11290 + }, + { + "epoch": 7.5711892797319935, + "grad_norm": 1.5720019340515137, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 11300 + }, + { + "epoch": 7.577889447236181, + "grad_norm": 1.376694917678833, + "learning_rate": 0.0002, + "loss": 1.0537, + "step": 11310 + }, + { + "epoch": 7.584589614740368, + "grad_norm": 1.4403680562973022, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 11320 + }, + { + "epoch": 7.591289782244556, + "grad_norm": 1.6306934356689453, + "learning_rate": 0.0002, + "loss": 1.0318, + "step": 11330 + }, + { + "epoch": 7.597989949748744, + "grad_norm": 1.2361583709716797, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 11340 + }, + { + "epoch": 7.6046901172529315, + "grad_norm": 1.2658058404922485, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 11350 + }, + { + "epoch": 7.611390284757119, + "grad_norm": 1.5335173606872559, + "learning_rate": 0.0002, + "loss": 1.0357, + "step": 11360 + }, + { + "epoch": 7.618090452261306, + "grad_norm": 1.3432948589324951, + "learning_rate": 0.0002, + "loss": 1.0689, + "step": 11370 + }, + { + "epoch": 7.624790619765494, + "grad_norm": 1.374617338180542, + "learning_rate": 0.0002, + "loss": 1.0613, + "step": 11380 + }, + { + "epoch": 7.631490787269682, + "grad_norm": 1.3790302276611328, + "learning_rate": 0.0002, + "loss": 1.0722, + "step": 11390 + }, + { + "epoch": 7.63819095477387, + "grad_norm": 1.4256713390350342, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 11400 + }, + { + "epoch": 7.644891122278057, + "grad_norm": 1.271228313446045, + "learning_rate": 0.0002, + "loss": 1.0314, + "step": 11410 + }, + { + "epoch": 7.651591289782244, + "grad_norm": 1.432146668434143, + "learning_rate": 0.0002, + "loss": 1.0678, + "step": 11420 + }, + { + "epoch": 7.658291457286432, + "grad_norm": 1.2698006629943848, + "learning_rate": 0.0002, + "loss": 1.0496, + "step": 11430 + }, + { + "epoch": 7.66499162479062, + "grad_norm": 1.439496636390686, + "learning_rate": 0.0002, + "loss": 1.0678, + "step": 11440 + }, + { + "epoch": 7.671691792294808, + "grad_norm": 1.2079370021820068, + "learning_rate": 0.0002, + "loss": 1.0344, + "step": 11450 + }, + { + "epoch": 7.678391959798995, + "grad_norm": 1.310412049293518, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 11460 + }, + { + "epoch": 7.685092127303182, + "grad_norm": 1.413438320159912, + "learning_rate": 0.0002, + "loss": 1.0524, + "step": 11470 + }, + { + "epoch": 7.69179229480737, + "grad_norm": 1.2390344142913818, + "learning_rate": 0.0002, + "loss": 0.9965, + "step": 11480 + }, + { + "epoch": 7.698492462311558, + "grad_norm": 1.3902971744537354, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 11490 + }, + { + "epoch": 7.705192629815746, + "grad_norm": 1.1194908618927002, + "learning_rate": 0.0002, + "loss": 0.9811, + "step": 11500 + }, + { + "epoch": 7.711892797319933, + "grad_norm": 1.625697374343872, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 11510 + }, + { + "epoch": 7.71859296482412, + "grad_norm": 1.339687466621399, + "learning_rate": 0.0002, + "loss": 1.0285, + "step": 11520 + }, + { + "epoch": 7.725293132328308, + "grad_norm": 1.336680293083191, + "learning_rate": 0.0002, + "loss": 1.032, + "step": 11530 + }, + { + "epoch": 7.731993299832496, + "grad_norm": 1.3316529989242554, + "learning_rate": 0.0002, + "loss": 1.0915, + "step": 11540 + }, + { + "epoch": 7.738693467336684, + "grad_norm": 1.2593837976455688, + "learning_rate": 0.0002, + "loss": 1.0031, + "step": 11550 + }, + { + "epoch": 7.745393634840871, + "grad_norm": 1.2159652709960938, + "learning_rate": 0.0002, + "loss": 1.019, + "step": 11560 + }, + { + "epoch": 7.752093802345058, + "grad_norm": 1.2919223308563232, + "learning_rate": 0.0002, + "loss": 1.0554, + "step": 11570 + }, + { + "epoch": 7.758793969849246, + "grad_norm": 1.2574092149734497, + "learning_rate": 0.0002, + "loss": 1.0072, + "step": 11580 + }, + { + "epoch": 7.765494137353434, + "grad_norm": 1.228236436843872, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 11590 + }, + { + "epoch": 7.772194304857622, + "grad_norm": 1.3790936470031738, + "learning_rate": 0.0002, + "loss": 1.0494, + "step": 11600 + }, + { + "epoch": 7.778894472361809, + "grad_norm": 1.419376015663147, + "learning_rate": 0.0002, + "loss": 1.0256, + "step": 11610 + }, + { + "epoch": 7.785594639865996, + "grad_norm": 1.3336344957351685, + "learning_rate": 0.0002, + "loss": 1.0985, + "step": 11620 + }, + { + "epoch": 7.792294807370184, + "grad_norm": 1.5279520750045776, + "learning_rate": 0.0002, + "loss": 1.107, + "step": 11630 + }, + { + "epoch": 7.798994974874372, + "grad_norm": 1.4296824932098389, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 11640 + }, + { + "epoch": 7.80569514237856, + "grad_norm": 1.279316782951355, + "learning_rate": 0.0002, + "loss": 1.128, + "step": 11650 + }, + { + "epoch": 7.812395309882747, + "grad_norm": 1.2764557600021362, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 11660 + }, + { + "epoch": 7.819095477386934, + "grad_norm": 1.1745330095291138, + "learning_rate": 0.0002, + "loss": 1.1157, + "step": 11670 + }, + { + "epoch": 7.825795644891122, + "grad_norm": 1.3440991640090942, + "learning_rate": 0.0002, + "loss": 1.1071, + "step": 11680 + }, + { + "epoch": 7.83249581239531, + "grad_norm": 1.1923167705535889, + "learning_rate": 0.0002, + "loss": 1.0751, + "step": 11690 + }, + { + "epoch": 7.839195979899498, + "grad_norm": 1.2679530382156372, + "learning_rate": 0.0002, + "loss": 1.0656, + "step": 11700 + }, + { + "epoch": 7.8458961474036855, + "grad_norm": 1.3027020692825317, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 11710 + }, + { + "epoch": 7.852596314907872, + "grad_norm": 1.4565616846084595, + "learning_rate": 0.0002, + "loss": 1.0496, + "step": 11720 + }, + { + "epoch": 7.85929648241206, + "grad_norm": 1.3157920837402344, + "learning_rate": 0.0002, + "loss": 1.0487, + "step": 11730 + }, + { + "epoch": 7.865996649916248, + "grad_norm": 1.3120285272598267, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 11740 + }, + { + "epoch": 7.872696817420436, + "grad_norm": 1.2625858783721924, + "learning_rate": 0.0002, + "loss": 1.0248, + "step": 11750 + }, + { + "epoch": 7.8793969849246235, + "grad_norm": 1.3911317586898804, + "learning_rate": 0.0002, + "loss": 1.0047, + "step": 11760 + }, + { + "epoch": 7.88609715242881, + "grad_norm": 1.2151232957839966, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 11770 + }, + { + "epoch": 7.892797319932998, + "grad_norm": 1.5171650648117065, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 11780 + }, + { + "epoch": 7.899497487437186, + "grad_norm": 1.4308419227600098, + "learning_rate": 0.0002, + "loss": 1.0957, + "step": 11790 + }, + { + "epoch": 7.906197654941374, + "grad_norm": 1.2683558464050293, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 11800 + }, + { + "epoch": 7.9128978224455615, + "grad_norm": 1.6872674226760864, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 11810 + }, + { + "epoch": 7.919597989949748, + "grad_norm": 1.3805692195892334, + "learning_rate": 0.0002, + "loss": 1.0077, + "step": 11820 + }, + { + "epoch": 7.926298157453936, + "grad_norm": 1.3342738151550293, + "learning_rate": 0.0002, + "loss": 1.1098, + "step": 11830 + }, + { + "epoch": 7.932998324958124, + "grad_norm": 1.308137059211731, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 11840 + }, + { + "epoch": 7.939698492462312, + "grad_norm": 1.3725523948669434, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 11850 + }, + { + "epoch": 7.9463986599664995, + "grad_norm": 1.1458995342254639, + "learning_rate": 0.0002, + "loss": 1.0492, + "step": 11860 + }, + { + "epoch": 7.953098827470686, + "grad_norm": 1.4715759754180908, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 11870 + }, + { + "epoch": 7.959798994974874, + "grad_norm": 1.1897743940353394, + "learning_rate": 0.0002, + "loss": 1.1408, + "step": 11880 + }, + { + "epoch": 7.966499162479062, + "grad_norm": 1.3386842012405396, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 11890 + }, + { + "epoch": 7.97319932998325, + "grad_norm": 1.3611114025115967, + "learning_rate": 0.0002, + "loss": 1.0463, + "step": 11900 + }, + { + "epoch": 7.9798994974874375, + "grad_norm": 1.1429232358932495, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 11910 + }, + { + "epoch": 7.986599664991624, + "grad_norm": 1.4848406314849854, + "learning_rate": 0.0002, + "loss": 1.1153, + "step": 11920 + }, + { + "epoch": 7.993299832495812, + "grad_norm": 1.3205432891845703, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 11930 + }, + { + "epoch": 7.997319932998325, + "eval_loss": 2.273319959640503, + "eval_runtime": 37.9333, + "eval_samples_per_second": 13.576, + "eval_steps_per_second": 1.714, + "step": 11936 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.52371354213548e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-11936/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dd239e42afc491dece8497032500d4eb9b33a024 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73e5e994f25eeb647e9eaf23119c1032c2dfc7e32d980c8862eff32d0a60cd25 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..620fa8c94b3f0e09c590632f8933d9f7426387b1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6239078bcb5ed53dc80356c24ddbd2bac0fcb98c4f6db364b6bf14f0e94f37 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ff627ce7a7574c57726bcd7f8dd33ef923e735b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d416b20e85177f56c04997a90b643a2900ee505cb41f99c2d2b2e23d88b8dd8f +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..574851d73f1f423bc7fd3b74ef9bc88c905f3754 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c77dbd02d51b31cf00f6638f73d175ce25655ac869b78c1454b4a1c11a6fdae +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b48ac6f9b0866be7a6a8882bab01a12c6bafe113 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/trainer_state.json @@ -0,0 +1,1084 @@ +{ + "best_metric": 1.8036354780197144, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492", + "epoch": 0.9996649916247906, + "eval_steps": 10, + "global_step": 1492, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.906955815714816e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb6e7189f2f644eaba0f037e36f9798ed96f6276 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd96df8ee104e7b3825aa8521a0d4d68984970c2c0b69c6892e57ee4766a75d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ed0048a717946e1b10e202a18778ce3546f396f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636991cdadb24d5433244c04a05ff58a08f7790337aee34bee6465e0857fcf64 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..feb1aca0d4082b105b3b7b388da02b234494d28a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:334295595baafc3a59c6c9ae498247797c3e2460976c08b7917b610520d97d56 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..79e491e499bd9b2ad182b3c4b2638bd93a77c563 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6e6b7e1db1e8263475596c6777128aabdf2ae74656407a474b8e1ecceb9c9d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbb477671bbe8bee88068baadcbb083f6c7bc743 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/trainer_state.json @@ -0,0 +1,2135 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2985, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3813911631429632e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9db6a6e39dad5a1aeda5f62bb72072dfee810f36 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6250b488c4c4dbef4033e080f9cdecb7ac533c6ad1de44a10fbfdcb20da6bf +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e45c82df3bf4ccf8bcd0253221f3b6ecc030d86f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c993acdc25f216c826a96c6519846f644d89b0060cca1151dbb3e0f3226eaea5 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0fefd235b9fd5d25f75ddf110535ddec5c539fe4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fbc45d51dbc1471231cadd0032584621fb86eecfcde0d03ab47e4dcd3c68af5 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..feb46514db6c2bb40a9629c6d795c96be175d35b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c1d004afcbbbebcfb68a9727530fff405d175a6ca8b3f7efcd34b9885fe831 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..591961ec116b7fa7cf876189377ccdbd3b9f8172 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/trainer_state.json @@ -0,0 +1,3186 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 2.9996649916247904, + "eval_steps": 10, + "global_step": 4477, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0720867447144448e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-4477/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..187483575890af464d9fc34a5c6041a1d913c030 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09843f37ac51f23607d4fc013c537ae32d4a4610a3e68536ef0c6fd0de56067f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bae881a0c8a101634bb2e8482068e7815fd9da88 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce883ace8bdd886f8b3a5dcf93aa4a8675f7bd47a446c777aeecb5881fad069 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..943a6bd2ee8cbed6782eaa5a484edf6695650762 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a4e09a09c5f85de9778f65259b862561985aa5a2edaf77c99c39901e891ff3 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dfc75f10e9f51117e5f561a5d0b15bda67213a3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001d71f239fa6dc6a8238c31d4375718af287b278554a1c1582559d2cad2bb41 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9f9251c68727836b1d9a7287d6fd442eeaf6d65f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/trainer_state.json @@ -0,0 +1,4244 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5970, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + }, + { + "epoch": 3.0016750418760467, + "grad_norm": 0.41169142723083496, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 4480 + }, + { + "epoch": 3.0083752093802345, + "grad_norm": 0.4865207374095917, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4490 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5462028384208679, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4500 + }, + { + "epoch": 3.0217755443886096, + "grad_norm": 0.6169732809066772, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 4510 + }, + { + "epoch": 3.0284757118927974, + "grad_norm": 0.5667954087257385, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4520 + }, + { + "epoch": 3.0351758793969847, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 4530 + }, + { + "epoch": 3.0418760469011725, + "grad_norm": 0.5220064520835876, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4540 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.5469558835029602, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 4550 + }, + { + "epoch": 3.0552763819095476, + "grad_norm": 0.5680848956108093, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 4560 + }, + { + "epoch": 3.0619765494137354, + "grad_norm": 0.5906574726104736, + "learning_rate": 0.0002, + "loss": 1.5187, + "step": 4570 + }, + { + "epoch": 3.0686767169179228, + "grad_norm": 0.4725631773471832, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4580 + }, + { + "epoch": 3.0753768844221105, + "grad_norm": 0.5273477435112, + "learning_rate": 0.0002, + "loss": 1.5083, + "step": 4590 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.5861203074455261, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 4600 + }, + { + "epoch": 3.0887772194304857, + "grad_norm": 0.5343965291976929, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 4610 + }, + { + "epoch": 3.0954773869346734, + "grad_norm": 0.5348150730133057, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4620 + }, + { + "epoch": 3.102177554438861, + "grad_norm": 0.5971846580505371, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4630 + }, + { + "epoch": 3.1088777219430486, + "grad_norm": 0.5203177332878113, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4640 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.55289226770401, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4650 + }, + { + "epoch": 3.1222780569514237, + "grad_norm": 0.6878530979156494, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4660 + }, + { + "epoch": 3.1289782244556115, + "grad_norm": 0.6173256635665894, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 4670 + }, + { + "epoch": 3.135678391959799, + "grad_norm": 0.536796510219574, + "learning_rate": 0.0002, + "loss": 1.51, + "step": 4680 + }, + { + "epoch": 3.1423785594639866, + "grad_norm": 0.58846116065979, + "learning_rate": 0.0002, + "loss": 1.4713, + "step": 4690 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.645889401435852, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 4700 + }, + { + "epoch": 3.1557788944723617, + "grad_norm": 0.6118691563606262, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 4710 + }, + { + "epoch": 3.1624790619765495, + "grad_norm": 0.5189669132232666, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4720 + }, + { + "epoch": 3.169179229480737, + "grad_norm": 0.5794713497161865, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4730 + }, + { + "epoch": 3.1758793969849246, + "grad_norm": 0.6579326391220093, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 4740 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.5822742581367493, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 4750 + }, + { + "epoch": 3.1892797319932997, + "grad_norm": 0.5475956201553345, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 4760 + }, + { + "epoch": 3.1959798994974875, + "grad_norm": 0.6743834018707275, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4770 + }, + { + "epoch": 3.202680067001675, + "grad_norm": 0.6110585927963257, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4780 + }, + { + "epoch": 3.2093802345058626, + "grad_norm": 0.5426181554794312, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4790 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6077824234962463, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 4800 + }, + { + "epoch": 3.2227805695142377, + "grad_norm": 0.5785858631134033, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 4810 + }, + { + "epoch": 3.2294807370184255, + "grad_norm": 0.6425958275794983, + "learning_rate": 0.0002, + "loss": 1.4041, + "step": 4820 + }, + { + "epoch": 3.236180904522613, + "grad_norm": 0.6607080698013306, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 4830 + }, + { + "epoch": 3.2428810720268006, + "grad_norm": 0.5385788679122925, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4840 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.5630403757095337, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 4850 + }, + { + "epoch": 3.2562814070351758, + "grad_norm": 0.6340779662132263, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 4860 + }, + { + "epoch": 3.2629815745393635, + "grad_norm": 0.5305342674255371, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4870 + }, + { + "epoch": 3.2696817420435513, + "grad_norm": 0.597670316696167, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 4880 + }, + { + "epoch": 3.2763819095477387, + "grad_norm": 0.665553867816925, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4890 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.579767644405365, + "learning_rate": 0.0002, + "loss": 1.4607, + "step": 4900 + }, + { + "epoch": 3.289782244556114, + "grad_norm": 0.5512481331825256, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 4910 + }, + { + "epoch": 3.2964824120603016, + "grad_norm": 0.5916532278060913, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 4920 + }, + { + "epoch": 3.3031825795644894, + "grad_norm": 0.7521726489067078, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 4930 + }, + { + "epoch": 3.3098827470686767, + "grad_norm": 0.5352797508239746, + "learning_rate": 0.0002, + "loss": 1.4223, + "step": 4940 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.5950371623039246, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4950 + }, + { + "epoch": 3.323283082077052, + "grad_norm": 0.8020477890968323, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 4960 + }, + { + "epoch": 3.3299832495812396, + "grad_norm": 0.6790024638175964, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4970 + }, + { + "epoch": 3.3366834170854274, + "grad_norm": 0.687627375125885, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4980 + }, + { + "epoch": 3.3433835845896147, + "grad_norm": 0.6094385385513306, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 4990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.6541242003440857, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 5000 + }, + { + "epoch": 3.35678391959799, + "grad_norm": 0.5560880303382874, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.3634840871021776, + "grad_norm": 0.5440094470977783, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 5020 + }, + { + "epoch": 3.3701842546063654, + "grad_norm": 0.5749301314353943, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5030 + }, + { + "epoch": 3.3768844221105527, + "grad_norm": 0.5919716954231262, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 5040 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.6331481337547302, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5050 + }, + { + "epoch": 3.390284757118928, + "grad_norm": 0.5687161684036255, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 5060 + }, + { + "epoch": 3.3969849246231156, + "grad_norm": 0.6718577742576599, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5070 + }, + { + "epoch": 3.4036850921273034, + "grad_norm": 0.5089324116706848, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 5080 + }, + { + "epoch": 3.4103852596314908, + "grad_norm": 0.5710174441337585, + "learning_rate": 0.0002, + "loss": 1.512, + "step": 5090 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6670721173286438, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5100 + }, + { + "epoch": 3.423785594639866, + "grad_norm": 0.6875665187835693, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 5110 + }, + { + "epoch": 3.4304857621440537, + "grad_norm": 0.5375880599021912, + "learning_rate": 0.0002, + "loss": 1.4496, + "step": 5120 + }, + { + "epoch": 3.4371859296482414, + "grad_norm": 0.6550399661064148, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 5130 + }, + { + "epoch": 3.4438860971524288, + "grad_norm": 0.5948067903518677, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 5140 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.6134477257728577, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5150 + }, + { + "epoch": 3.457286432160804, + "grad_norm": 0.6506398320198059, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 5160 + }, + { + "epoch": 3.4639865996649917, + "grad_norm": 0.6060147881507874, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 5170 + }, + { + "epoch": 3.4706867671691795, + "grad_norm": 0.6173806190490723, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 5180 + }, + { + "epoch": 3.477386934673367, + "grad_norm": 0.6032607555389404, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 5190 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5652492046356201, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 5200 + }, + { + "epoch": 3.490787269681742, + "grad_norm": 0.6168607473373413, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 5210 + }, + { + "epoch": 3.4974874371859297, + "grad_norm": 0.6170629262924194, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 5220 + }, + { + "epoch": 3.5041876046901175, + "grad_norm": 0.6926297545433044, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 5230 + }, + { + "epoch": 3.510887772194305, + "grad_norm": 0.6702437996864319, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 5240 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.5421436429023743, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 5250 + }, + { + "epoch": 3.52428810720268, + "grad_norm": 0.5726765990257263, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 5260 + }, + { + "epoch": 3.5309882747068677, + "grad_norm": 0.5685455203056335, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 3.5376884422110555, + "grad_norm": 0.6018396019935608, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 5280 + }, + { + "epoch": 3.544388609715243, + "grad_norm": 0.5731932520866394, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 5290 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5300 + }, + { + "epoch": 3.557788944723618, + "grad_norm": 0.5545530319213867, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5310 + }, + { + "epoch": 3.5644891122278057, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5320 + }, + { + "epoch": 3.5711892797319935, + "grad_norm": 0.5651767253875732, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5330 + }, + { + "epoch": 3.577889447236181, + "grad_norm": 0.7425084114074707, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 5340 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5770602226257324, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 5350 + }, + { + "epoch": 3.591289782244556, + "grad_norm": 0.54723060131073, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 5360 + }, + { + "epoch": 3.5979899497487438, + "grad_norm": 0.6658238172531128, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 5370 + }, + { + "epoch": 3.6046901172529315, + "grad_norm": 0.5787645578384399, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5380 + }, + { + "epoch": 3.611390284757119, + "grad_norm": 0.594913125038147, + "learning_rate": 0.0002, + "loss": 1.5343, + "step": 5390 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.4964977502822876, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5400 + }, + { + "epoch": 3.624790619765494, + "grad_norm": 0.6087527275085449, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 5410 + }, + { + "epoch": 3.6314907872696818, + "grad_norm": 0.6315323710441589, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 5420 + }, + { + "epoch": 3.6381909547738696, + "grad_norm": 0.574799120426178, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 5430 + }, + { + "epoch": 3.644891122278057, + "grad_norm": 0.5949277877807617, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 5440 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.5640677213668823, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 5450 + }, + { + "epoch": 3.658291457286432, + "grad_norm": 0.6198237538337708, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 5460 + }, + { + "epoch": 3.66499162479062, + "grad_norm": 0.6902034878730774, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 5470 + }, + { + "epoch": 3.6716917922948076, + "grad_norm": 0.5686674118041992, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5480 + }, + { + "epoch": 3.678391959798995, + "grad_norm": 0.6532107591629028, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 5490 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 5500 + }, + { + "epoch": 3.69179229480737, + "grad_norm": 0.6055065393447876, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5510 + }, + { + "epoch": 3.698492462311558, + "grad_norm": 0.5630605816841125, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 5520 + }, + { + "epoch": 3.7051926298157456, + "grad_norm": 0.6005825996398926, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 5530 + }, + { + "epoch": 3.711892797319933, + "grad_norm": 0.6553038954734802, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 5540 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5601094961166382, + "learning_rate": 0.0002, + "loss": 1.4943, + "step": 5550 + }, + { + "epoch": 3.725293132328308, + "grad_norm": 0.6598808169364929, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 5560 + }, + { + "epoch": 3.731993299832496, + "grad_norm": 0.5506255626678467, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 5570 + }, + { + "epoch": 3.7386934673366836, + "grad_norm": 0.6001223921775818, + "learning_rate": 0.0002, + "loss": 1.4805, + "step": 5580 + }, + { + "epoch": 3.745393634840871, + "grad_norm": 0.6287297606468201, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 5590 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.6253238916397095, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 5600 + }, + { + "epoch": 3.758793969849246, + "grad_norm": 0.5713174939155579, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 5610 + }, + { + "epoch": 3.765494137353434, + "grad_norm": 0.6198310852050781, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5620 + }, + { + "epoch": 3.7721943048576216, + "grad_norm": 0.5941224098205566, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 5630 + }, + { + "epoch": 3.778894472361809, + "grad_norm": 0.606002151966095, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 5640 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.6540704965591431, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 5650 + }, + { + "epoch": 3.792294807370184, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5660 + }, + { + "epoch": 3.798994974874372, + "grad_norm": 0.5649605393409729, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5670 + }, + { + "epoch": 3.8056951423785597, + "grad_norm": 0.6788773536682129, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 5680 + }, + { + "epoch": 3.812395309882747, + "grad_norm": 0.6581860780715942, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 5690 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.5529348850250244, + "learning_rate": 0.0002, + "loss": 1.4587, + "step": 5700 + }, + { + "epoch": 3.825795644891122, + "grad_norm": 0.6320232152938843, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 5710 + }, + { + "epoch": 3.83249581239531, + "grad_norm": 0.6529698371887207, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 5720 + }, + { + "epoch": 3.8391959798994977, + "grad_norm": 0.5983362793922424, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5730 + }, + { + "epoch": 3.845896147403685, + "grad_norm": 0.6335684061050415, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 5740 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.700446605682373, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5750 + }, + { + "epoch": 3.85929648241206, + "grad_norm": 0.6092597842216492, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 5760 + }, + { + "epoch": 3.865996649916248, + "grad_norm": 0.564146101474762, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 5770 + }, + { + "epoch": 3.8726968174204357, + "grad_norm": 0.615275502204895, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 5780 + }, + { + "epoch": 3.879396984924623, + "grad_norm": 0.6685376763343811, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 5790 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5800 + }, + { + "epoch": 3.892797319932998, + "grad_norm": 0.5486813187599182, + "learning_rate": 0.0002, + "loss": 1.5179, + "step": 5810 + }, + { + "epoch": 3.899497487437186, + "grad_norm": 0.6208204030990601, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 5820 + }, + { + "epoch": 3.9061976549413737, + "grad_norm": 0.6500625014305115, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 5830 + }, + { + "epoch": 3.912897822445561, + "grad_norm": 0.5948089361190796, + "learning_rate": 0.0002, + "loss": 1.4716, + "step": 5840 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.7210732698440552, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 5850 + }, + { + "epoch": 3.926298157453936, + "grad_norm": 0.6662322878837585, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 5860 + }, + { + "epoch": 3.932998324958124, + "grad_norm": 0.5613839626312256, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5870 + }, + { + "epoch": 3.9396984924623117, + "grad_norm": 0.6069002151489258, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5880 + }, + { + "epoch": 3.946398659966499, + "grad_norm": 0.7075562477111816, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 5890 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.6316173076629639, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 5900 + }, + { + "epoch": 3.959798994974874, + "grad_norm": 0.5716308355331421, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 5910 + }, + { + "epoch": 3.966499162479062, + "grad_norm": 0.6800096035003662, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5920 + }, + { + "epoch": 3.9731993299832498, + "grad_norm": 0.6057983040809631, + "learning_rate": 0.0002, + "loss": 1.5189, + "step": 5930 + }, + { + "epoch": 3.979899497487437, + "grad_norm": 0.5938987731933594, + "learning_rate": 0.0002, + "loss": 1.5431, + "step": 5940 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.6963576674461365, + "learning_rate": 0.0002, + "loss": 1.5111, + "step": 5950 + }, + { + "epoch": 3.993299832495812, + "grad_norm": 0.6279940009117126, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 5960 + }, + { + "epoch": 4.0, + "grad_norm": 0.7161159515380859, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 5970 + }, + { + "epoch": 4.0, + "eval_loss": 1.8655421733856201, + "eval_runtime": 37.9276, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.714, + "step": 5970 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7627823262859264e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-5970/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2091ccdd675cfe239b2d7d6ac2a9c115915cf019 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a6dc8f1eb63b254d61cc48121629221832e83035d7cbbea32f7a1c7e5613918 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7970f235a5307df1b83c9903f9352cdde671a838 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6af6af9b3e1d5fc139cd7f77d43795b275d0cffe9c1e94b132559552babf620b +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a68ba21ac26150c6f30a31fadacd57f2eb69cb77 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:111d23dcfd260fbf001922a777fa691a7c513e3e31685b60c2713a289382a171 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17f10eb44ce7821cb7f52d06750abb17355b54c3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27652f182b842c7ed5e0b2abe3cf6cd30fac1baf0f5622b65fbc7d55d4a9a6c6 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27ab2ae4a415f5a320f4d7807d8619740f871418 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/trainer_state.json @@ -0,0 +1,5295 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 4.99966499162479, + "eval_steps": 10, + "global_step": 7462, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + }, + { + "epoch": 3.0016750418760467, + "grad_norm": 0.41169142723083496, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 4480 + }, + { + "epoch": 3.0083752093802345, + "grad_norm": 0.4865207374095917, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4490 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5462028384208679, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4500 + }, + { + "epoch": 3.0217755443886096, + "grad_norm": 0.6169732809066772, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 4510 + }, + { + "epoch": 3.0284757118927974, + "grad_norm": 0.5667954087257385, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4520 + }, + { + "epoch": 3.0351758793969847, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 4530 + }, + { + "epoch": 3.0418760469011725, + "grad_norm": 0.5220064520835876, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4540 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.5469558835029602, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 4550 + }, + { + "epoch": 3.0552763819095476, + "grad_norm": 0.5680848956108093, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 4560 + }, + { + "epoch": 3.0619765494137354, + "grad_norm": 0.5906574726104736, + "learning_rate": 0.0002, + "loss": 1.5187, + "step": 4570 + }, + { + "epoch": 3.0686767169179228, + "grad_norm": 0.4725631773471832, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4580 + }, + { + "epoch": 3.0753768844221105, + "grad_norm": 0.5273477435112, + "learning_rate": 0.0002, + "loss": 1.5083, + "step": 4590 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.5861203074455261, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 4600 + }, + { + "epoch": 3.0887772194304857, + "grad_norm": 0.5343965291976929, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 4610 + }, + { + "epoch": 3.0954773869346734, + "grad_norm": 0.5348150730133057, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4620 + }, + { + "epoch": 3.102177554438861, + "grad_norm": 0.5971846580505371, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4630 + }, + { + "epoch": 3.1088777219430486, + "grad_norm": 0.5203177332878113, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4640 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.55289226770401, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4650 + }, + { + "epoch": 3.1222780569514237, + "grad_norm": 0.6878530979156494, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4660 + }, + { + "epoch": 3.1289782244556115, + "grad_norm": 0.6173256635665894, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 4670 + }, + { + "epoch": 3.135678391959799, + "grad_norm": 0.536796510219574, + "learning_rate": 0.0002, + "loss": 1.51, + "step": 4680 + }, + { + "epoch": 3.1423785594639866, + "grad_norm": 0.58846116065979, + "learning_rate": 0.0002, + "loss": 1.4713, + "step": 4690 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.645889401435852, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 4700 + }, + { + "epoch": 3.1557788944723617, + "grad_norm": 0.6118691563606262, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 4710 + }, + { + "epoch": 3.1624790619765495, + "grad_norm": 0.5189669132232666, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4720 + }, + { + "epoch": 3.169179229480737, + "grad_norm": 0.5794713497161865, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4730 + }, + { + "epoch": 3.1758793969849246, + "grad_norm": 0.6579326391220093, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 4740 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.5822742581367493, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 4750 + }, + { + "epoch": 3.1892797319932997, + "grad_norm": 0.5475956201553345, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 4760 + }, + { + "epoch": 3.1959798994974875, + "grad_norm": 0.6743834018707275, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4770 + }, + { + "epoch": 3.202680067001675, + "grad_norm": 0.6110585927963257, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4780 + }, + { + "epoch": 3.2093802345058626, + "grad_norm": 0.5426181554794312, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4790 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6077824234962463, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 4800 + }, + { + "epoch": 3.2227805695142377, + "grad_norm": 0.5785858631134033, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 4810 + }, + { + "epoch": 3.2294807370184255, + "grad_norm": 0.6425958275794983, + "learning_rate": 0.0002, + "loss": 1.4041, + "step": 4820 + }, + { + "epoch": 3.236180904522613, + "grad_norm": 0.6607080698013306, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 4830 + }, + { + "epoch": 3.2428810720268006, + "grad_norm": 0.5385788679122925, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4840 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.5630403757095337, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 4850 + }, + { + "epoch": 3.2562814070351758, + "grad_norm": 0.6340779662132263, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 4860 + }, + { + "epoch": 3.2629815745393635, + "grad_norm": 0.5305342674255371, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4870 + }, + { + "epoch": 3.2696817420435513, + "grad_norm": 0.597670316696167, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 4880 + }, + { + "epoch": 3.2763819095477387, + "grad_norm": 0.665553867816925, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4890 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.579767644405365, + "learning_rate": 0.0002, + "loss": 1.4607, + "step": 4900 + }, + { + "epoch": 3.289782244556114, + "grad_norm": 0.5512481331825256, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 4910 + }, + { + "epoch": 3.2964824120603016, + "grad_norm": 0.5916532278060913, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 4920 + }, + { + "epoch": 3.3031825795644894, + "grad_norm": 0.7521726489067078, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 4930 + }, + { + "epoch": 3.3098827470686767, + "grad_norm": 0.5352797508239746, + "learning_rate": 0.0002, + "loss": 1.4223, + "step": 4940 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.5950371623039246, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4950 + }, + { + "epoch": 3.323283082077052, + "grad_norm": 0.8020477890968323, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 4960 + }, + { + "epoch": 3.3299832495812396, + "grad_norm": 0.6790024638175964, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4970 + }, + { + "epoch": 3.3366834170854274, + "grad_norm": 0.687627375125885, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4980 + }, + { + "epoch": 3.3433835845896147, + "grad_norm": 0.6094385385513306, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 4990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.6541242003440857, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 5000 + }, + { + "epoch": 3.35678391959799, + "grad_norm": 0.5560880303382874, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.3634840871021776, + "grad_norm": 0.5440094470977783, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 5020 + }, + { + "epoch": 3.3701842546063654, + "grad_norm": 0.5749301314353943, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5030 + }, + { + "epoch": 3.3768844221105527, + "grad_norm": 0.5919716954231262, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 5040 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.6331481337547302, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5050 + }, + { + "epoch": 3.390284757118928, + "grad_norm": 0.5687161684036255, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 5060 + }, + { + "epoch": 3.3969849246231156, + "grad_norm": 0.6718577742576599, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5070 + }, + { + "epoch": 3.4036850921273034, + "grad_norm": 0.5089324116706848, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 5080 + }, + { + "epoch": 3.4103852596314908, + "grad_norm": 0.5710174441337585, + "learning_rate": 0.0002, + "loss": 1.512, + "step": 5090 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6670721173286438, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5100 + }, + { + "epoch": 3.423785594639866, + "grad_norm": 0.6875665187835693, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 5110 + }, + { + "epoch": 3.4304857621440537, + "grad_norm": 0.5375880599021912, + "learning_rate": 0.0002, + "loss": 1.4496, + "step": 5120 + }, + { + "epoch": 3.4371859296482414, + "grad_norm": 0.6550399661064148, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 5130 + }, + { + "epoch": 3.4438860971524288, + "grad_norm": 0.5948067903518677, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 5140 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.6134477257728577, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5150 + }, + { + "epoch": 3.457286432160804, + "grad_norm": 0.6506398320198059, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 5160 + }, + { + "epoch": 3.4639865996649917, + "grad_norm": 0.6060147881507874, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 5170 + }, + { + "epoch": 3.4706867671691795, + "grad_norm": 0.6173806190490723, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 5180 + }, + { + "epoch": 3.477386934673367, + "grad_norm": 0.6032607555389404, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 5190 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5652492046356201, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 5200 + }, + { + "epoch": 3.490787269681742, + "grad_norm": 0.6168607473373413, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 5210 + }, + { + "epoch": 3.4974874371859297, + "grad_norm": 0.6170629262924194, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 5220 + }, + { + "epoch": 3.5041876046901175, + "grad_norm": 0.6926297545433044, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 5230 + }, + { + "epoch": 3.510887772194305, + "grad_norm": 0.6702437996864319, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 5240 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.5421436429023743, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 5250 + }, + { + "epoch": 3.52428810720268, + "grad_norm": 0.5726765990257263, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 5260 + }, + { + "epoch": 3.5309882747068677, + "grad_norm": 0.5685455203056335, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 3.5376884422110555, + "grad_norm": 0.6018396019935608, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 5280 + }, + { + "epoch": 3.544388609715243, + "grad_norm": 0.5731932520866394, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 5290 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5300 + }, + { + "epoch": 3.557788944723618, + "grad_norm": 0.5545530319213867, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5310 + }, + { + "epoch": 3.5644891122278057, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5320 + }, + { + "epoch": 3.5711892797319935, + "grad_norm": 0.5651767253875732, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5330 + }, + { + "epoch": 3.577889447236181, + "grad_norm": 0.7425084114074707, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 5340 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5770602226257324, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 5350 + }, + { + "epoch": 3.591289782244556, + "grad_norm": 0.54723060131073, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 5360 + }, + { + "epoch": 3.5979899497487438, + "grad_norm": 0.6658238172531128, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 5370 + }, + { + "epoch": 3.6046901172529315, + "grad_norm": 0.5787645578384399, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5380 + }, + { + "epoch": 3.611390284757119, + "grad_norm": 0.594913125038147, + "learning_rate": 0.0002, + "loss": 1.5343, + "step": 5390 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.4964977502822876, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5400 + }, + { + "epoch": 3.624790619765494, + "grad_norm": 0.6087527275085449, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 5410 + }, + { + "epoch": 3.6314907872696818, + "grad_norm": 0.6315323710441589, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 5420 + }, + { + "epoch": 3.6381909547738696, + "grad_norm": 0.574799120426178, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 5430 + }, + { + "epoch": 3.644891122278057, + "grad_norm": 0.5949277877807617, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 5440 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.5640677213668823, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 5450 + }, + { + "epoch": 3.658291457286432, + "grad_norm": 0.6198237538337708, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 5460 + }, + { + "epoch": 3.66499162479062, + "grad_norm": 0.6902034878730774, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 5470 + }, + { + "epoch": 3.6716917922948076, + "grad_norm": 0.5686674118041992, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5480 + }, + { + "epoch": 3.678391959798995, + "grad_norm": 0.6532107591629028, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 5490 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 5500 + }, + { + "epoch": 3.69179229480737, + "grad_norm": 0.6055065393447876, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5510 + }, + { + "epoch": 3.698492462311558, + "grad_norm": 0.5630605816841125, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 5520 + }, + { + "epoch": 3.7051926298157456, + "grad_norm": 0.6005825996398926, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 5530 + }, + { + "epoch": 3.711892797319933, + "grad_norm": 0.6553038954734802, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 5540 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5601094961166382, + "learning_rate": 0.0002, + "loss": 1.4943, + "step": 5550 + }, + { + "epoch": 3.725293132328308, + "grad_norm": 0.6598808169364929, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 5560 + }, + { + "epoch": 3.731993299832496, + "grad_norm": 0.5506255626678467, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 5570 + }, + { + "epoch": 3.7386934673366836, + "grad_norm": 0.6001223921775818, + "learning_rate": 0.0002, + "loss": 1.4805, + "step": 5580 + }, + { + "epoch": 3.745393634840871, + "grad_norm": 0.6287297606468201, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 5590 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.6253238916397095, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 5600 + }, + { + "epoch": 3.758793969849246, + "grad_norm": 0.5713174939155579, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 5610 + }, + { + "epoch": 3.765494137353434, + "grad_norm": 0.6198310852050781, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5620 + }, + { + "epoch": 3.7721943048576216, + "grad_norm": 0.5941224098205566, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 5630 + }, + { + "epoch": 3.778894472361809, + "grad_norm": 0.606002151966095, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 5640 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.6540704965591431, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 5650 + }, + { + "epoch": 3.792294807370184, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5660 + }, + { + "epoch": 3.798994974874372, + "grad_norm": 0.5649605393409729, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5670 + }, + { + "epoch": 3.8056951423785597, + "grad_norm": 0.6788773536682129, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 5680 + }, + { + "epoch": 3.812395309882747, + "grad_norm": 0.6581860780715942, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 5690 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.5529348850250244, + "learning_rate": 0.0002, + "loss": 1.4587, + "step": 5700 + }, + { + "epoch": 3.825795644891122, + "grad_norm": 0.6320232152938843, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 5710 + }, + { + "epoch": 3.83249581239531, + "grad_norm": 0.6529698371887207, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 5720 + }, + { + "epoch": 3.8391959798994977, + "grad_norm": 0.5983362793922424, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5730 + }, + { + "epoch": 3.845896147403685, + "grad_norm": 0.6335684061050415, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 5740 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.700446605682373, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5750 + }, + { + "epoch": 3.85929648241206, + "grad_norm": 0.6092597842216492, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 5760 + }, + { + "epoch": 3.865996649916248, + "grad_norm": 0.564146101474762, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 5770 + }, + { + "epoch": 3.8726968174204357, + "grad_norm": 0.615275502204895, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 5780 + }, + { + "epoch": 3.879396984924623, + "grad_norm": 0.6685376763343811, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 5790 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5800 + }, + { + "epoch": 3.892797319932998, + "grad_norm": 0.5486813187599182, + "learning_rate": 0.0002, + "loss": 1.5179, + "step": 5810 + }, + { + "epoch": 3.899497487437186, + "grad_norm": 0.6208204030990601, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 5820 + }, + { + "epoch": 3.9061976549413737, + "grad_norm": 0.6500625014305115, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 5830 + }, + { + "epoch": 3.912897822445561, + "grad_norm": 0.5948089361190796, + "learning_rate": 0.0002, + "loss": 1.4716, + "step": 5840 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.7210732698440552, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 5850 + }, + { + "epoch": 3.926298157453936, + "grad_norm": 0.6662322878837585, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 5860 + }, + { + "epoch": 3.932998324958124, + "grad_norm": 0.5613839626312256, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5870 + }, + { + "epoch": 3.9396984924623117, + "grad_norm": 0.6069002151489258, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5880 + }, + { + "epoch": 3.946398659966499, + "grad_norm": 0.7075562477111816, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 5890 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.6316173076629639, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 5900 + }, + { + "epoch": 3.959798994974874, + "grad_norm": 0.5716308355331421, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 5910 + }, + { + "epoch": 3.966499162479062, + "grad_norm": 0.6800096035003662, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5920 + }, + { + "epoch": 3.9731993299832498, + "grad_norm": 0.6057983040809631, + "learning_rate": 0.0002, + "loss": 1.5189, + "step": 5930 + }, + { + "epoch": 3.979899497487437, + "grad_norm": 0.5938987731933594, + "learning_rate": 0.0002, + "loss": 1.5431, + "step": 5940 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.6963576674461365, + "learning_rate": 0.0002, + "loss": 1.5111, + "step": 5950 + }, + { + "epoch": 3.993299832495812, + "grad_norm": 0.6279940009117126, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 5960 + }, + { + "epoch": 4.0, + "grad_norm": 0.7161159515380859, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 5970 + }, + { + "epoch": 4.0, + "eval_loss": 1.8655421733856201, + "eval_runtime": 37.9276, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.714, + "step": 5970 + }, + { + "epoch": 4.006700167504188, + "grad_norm": 0.7380476593971252, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 5980 + }, + { + "epoch": 4.013400335008376, + "grad_norm": 0.7148947715759277, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 5990 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.6177082657814026, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 6000 + }, + { + "epoch": 4.02680067001675, + "grad_norm": 0.8552946448326111, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 6010 + }, + { + "epoch": 4.033500837520938, + "grad_norm": 0.8033416271209717, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 6020 + }, + { + "epoch": 4.040201005025126, + "grad_norm": 0.8501318097114563, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 6030 + }, + { + "epoch": 4.046901172529314, + "grad_norm": 0.6981393098831177, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6040 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.7227180600166321, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 6050 + }, + { + "epoch": 4.060301507537688, + "grad_norm": 0.6923989653587341, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 6060 + }, + { + "epoch": 4.067001675041876, + "grad_norm": 0.879779040813446, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 6070 + }, + { + "epoch": 4.073701842546064, + "grad_norm": 0.8184754848480225, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 6080 + }, + { + "epoch": 4.080402010050252, + "grad_norm": 0.8211342692375183, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 6090 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.7542396783828735, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 6100 + }, + { + "epoch": 4.093802345058626, + "grad_norm": 0.6631066799163818, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 6110 + }, + { + "epoch": 4.100502512562814, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 6120 + }, + { + "epoch": 4.107202680067002, + "grad_norm": 0.681851863861084, + "learning_rate": 0.0002, + "loss": 1.3443, + "step": 6130 + }, + { + "epoch": 4.11390284757119, + "grad_norm": 0.8757794499397278, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 6140 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.6567301750183105, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 6150 + }, + { + "epoch": 4.127303182579564, + "grad_norm": 0.7950329184532166, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 6160 + }, + { + "epoch": 4.134003350083752, + "grad_norm": 0.7545644044876099, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6170 + }, + { + "epoch": 4.14070351758794, + "grad_norm": 0.7172710299491882, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 6180 + }, + { + "epoch": 4.147403685092128, + "grad_norm": 0.7040584087371826, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6190 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7482913732528687, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 6200 + }, + { + "epoch": 4.160804020100502, + "grad_norm": 0.8523276448249817, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 6210 + }, + { + "epoch": 4.16750418760469, + "grad_norm": 0.6672041416168213, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 6220 + }, + { + "epoch": 4.174204355108878, + "grad_norm": 0.7523500919342041, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 6230 + }, + { + "epoch": 4.180904522613066, + "grad_norm": 0.8085253834724426, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 6240 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.789450466632843, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 6250 + }, + { + "epoch": 4.19430485762144, + "grad_norm": 0.7502310872077942, + "learning_rate": 0.0002, + "loss": 1.3539, + "step": 6260 + }, + { + "epoch": 4.201005025125628, + "grad_norm": 0.7397456765174866, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 6270 + }, + { + "epoch": 4.207705192629816, + "grad_norm": 0.6921947002410889, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6280 + }, + { + "epoch": 4.214405360134004, + "grad_norm": 0.9334571957588196, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6290 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.725799024105072, + "learning_rate": 0.0002, + "loss": 1.3612, + "step": 6300 + }, + { + "epoch": 4.227805695142378, + "grad_norm": 0.8290495872497559, + "learning_rate": 0.0002, + "loss": 1.4217, + "step": 6310 + }, + { + "epoch": 4.234505862646566, + "grad_norm": 0.688983678817749, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 6320 + }, + { + "epoch": 4.241206030150754, + "grad_norm": 0.8620913028717041, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 6330 + }, + { + "epoch": 4.247906197654942, + "grad_norm": 0.8008657693862915, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6340 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7379199266433716, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6350 + }, + { + "epoch": 4.261306532663316, + "grad_norm": 0.7842815518379211, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 6360 + }, + { + "epoch": 4.268006700167504, + "grad_norm": 0.812600314617157, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 6370 + }, + { + "epoch": 4.274706867671692, + "grad_norm": 0.7852841019630432, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 6380 + }, + { + "epoch": 4.28140703517588, + "grad_norm": 1.0377534627914429, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 6390 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 1.03935706615448, + "learning_rate": 0.0002, + "loss": 1.3755, + "step": 6400 + }, + { + "epoch": 4.294807370184254, + "grad_norm": 0.7244732975959778, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 6410 + }, + { + "epoch": 4.301507537688442, + "grad_norm": 0.7137406468391418, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 6420 + }, + { + "epoch": 4.30820770519263, + "grad_norm": 0.7492543458938599, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 6430 + }, + { + "epoch": 4.314907872696818, + "grad_norm": 0.7065439224243164, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 6440 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.7786989808082581, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 6450 + }, + { + "epoch": 4.328308207705192, + "grad_norm": 0.7369208335876465, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6460 + }, + { + "epoch": 4.33500837520938, + "grad_norm": 0.7412346005439758, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 6470 + }, + { + "epoch": 4.341708542713568, + "grad_norm": 0.780927300453186, + "learning_rate": 0.0002, + "loss": 1.4087, + "step": 6480 + }, + { + "epoch": 4.348408710217756, + "grad_norm": 0.8320930600166321, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 6490 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.6871094703674316, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6500 + }, + { + "epoch": 4.36180904522613, + "grad_norm": 0.6751559972763062, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6510 + }, + { + "epoch": 4.368509212730318, + "grad_norm": 0.7723976969718933, + "learning_rate": 0.0002, + "loss": 1.4311, + "step": 6520 + }, + { + "epoch": 4.375209380234506, + "grad_norm": 0.7915401458740234, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 6530 + }, + { + "epoch": 4.381909547738694, + "grad_norm": 0.7329102754592896, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 6540 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.7388760447502136, + "learning_rate": 0.0002, + "loss": 1.447, + "step": 6550 + }, + { + "epoch": 4.3953098827470685, + "grad_norm": 0.8282579183578491, + "learning_rate": 0.0002, + "loss": 1.4378, + "step": 6560 + }, + { + "epoch": 4.402010050251256, + "grad_norm": 0.7192724347114563, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6570 + }, + { + "epoch": 4.408710217755444, + "grad_norm": 0.746526837348938, + "learning_rate": 0.0002, + "loss": 1.4141, + "step": 6580 + }, + { + "epoch": 4.415410385259632, + "grad_norm": 0.8738046288490295, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 6590 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.8408458828926086, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 6600 + }, + { + "epoch": 4.4288107202680065, + "grad_norm": 0.8110666275024414, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 6610 + }, + { + "epoch": 4.435510887772194, + "grad_norm": 0.8602406978607178, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 6620 + }, + { + "epoch": 4.442211055276382, + "grad_norm": 0.7549102902412415, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6630 + }, + { + "epoch": 4.44891122278057, + "grad_norm": 0.7831804156303406, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 6640 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.7269673943519592, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6650 + }, + { + "epoch": 4.4623115577889445, + "grad_norm": 0.7397838830947876, + "learning_rate": 0.0002, + "loss": 1.4132, + "step": 6660 + }, + { + "epoch": 4.469011725293132, + "grad_norm": 0.713707447052002, + "learning_rate": 0.0002, + "loss": 1.3174, + "step": 6670 + }, + { + "epoch": 4.47571189279732, + "grad_norm": 0.7525581121444702, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 6680 + }, + { + "epoch": 4.482412060301508, + "grad_norm": 0.8030191659927368, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6690 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.7469439506530762, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 6700 + }, + { + "epoch": 4.4958123953098825, + "grad_norm": 0.7743868231773376, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 6710 + }, + { + "epoch": 4.50251256281407, + "grad_norm": 0.6539737582206726, + "learning_rate": 0.0002, + "loss": 1.3439, + "step": 6720 + }, + { + "epoch": 4.509212730318258, + "grad_norm": 0.825818657875061, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 6730 + }, + { + "epoch": 4.515912897822446, + "grad_norm": 0.8048575520515442, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 6740 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.7828766107559204, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6750 + }, + { + "epoch": 4.5293132328308205, + "grad_norm": 0.7406010031700134, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 6760 + }, + { + "epoch": 4.536013400335008, + "grad_norm": 0.840345561504364, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 6770 + }, + { + "epoch": 4.542713567839196, + "grad_norm": 0.8492622971534729, + "learning_rate": 0.0002, + "loss": 1.4808, + "step": 6780 + }, + { + "epoch": 4.549413735343384, + "grad_norm": 0.7130163908004761, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 6790 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 0.8454728126525879, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 6800 + }, + { + "epoch": 4.562814070351759, + "grad_norm": 0.7847645282745361, + "learning_rate": 0.0002, + "loss": 1.3239, + "step": 6810 + }, + { + "epoch": 4.569514237855946, + "grad_norm": 0.7245864272117615, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 6820 + }, + { + "epoch": 4.576214405360134, + "grad_norm": 0.768893301486969, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 6830 + }, + { + "epoch": 4.582914572864322, + "grad_norm": 0.8028400540351868, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6840 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.763945460319519, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 6850 + }, + { + "epoch": 4.596314907872697, + "grad_norm": 0.7417685389518738, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 6860 + }, + { + "epoch": 4.603015075376884, + "grad_norm": 0.7603038549423218, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 6870 + }, + { + "epoch": 4.609715242881072, + "grad_norm": 0.7981528043746948, + "learning_rate": 0.0002, + "loss": 1.4095, + "step": 6880 + }, + { + "epoch": 4.61641541038526, + "grad_norm": 0.8077111840248108, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6890 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.8778454065322876, + "learning_rate": 0.0002, + "loss": 1.4721, + "step": 6900 + }, + { + "epoch": 4.629815745393635, + "grad_norm": 0.8620710372924805, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6910 + }, + { + "epoch": 4.636515912897822, + "grad_norm": 0.7486072778701782, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 6920 + }, + { + "epoch": 4.64321608040201, + "grad_norm": 0.7493042945861816, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 6930 + }, + { + "epoch": 4.649916247906198, + "grad_norm": 0.7388978600502014, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 6940 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.798530638217926, + "learning_rate": 0.0002, + "loss": 1.3593, + "step": 6950 + }, + { + "epoch": 4.663316582914573, + "grad_norm": 0.7929500937461853, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 6960 + }, + { + "epoch": 4.67001675041876, + "grad_norm": 0.9186785221099854, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 6970 + }, + { + "epoch": 4.676716917922948, + "grad_norm": 1.1103485822677612, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 6980 + }, + { + "epoch": 4.683417085427136, + "grad_norm": 0.8000466823577881, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 6990 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7520599961280823, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 7000 + }, + { + "epoch": 4.696817420435511, + "grad_norm": 0.7971973419189453, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 7010 + }, + { + "epoch": 4.703517587939698, + "grad_norm": 0.7363343834877014, + "learning_rate": 0.0002, + "loss": 1.3682, + "step": 7020 + }, + { + "epoch": 4.710217755443886, + "grad_norm": 0.8268865942955017, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 7030 + }, + { + "epoch": 4.716917922948074, + "grad_norm": 0.7054963111877441, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 7040 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.8196262121200562, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7050 + }, + { + "epoch": 4.730318257956449, + "grad_norm": 0.8276031017303467, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 7060 + }, + { + "epoch": 4.7370184254606365, + "grad_norm": 0.8248157501220703, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 7070 + }, + { + "epoch": 4.743718592964824, + "grad_norm": 0.8937979936599731, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 7080 + }, + { + "epoch": 4.750418760469012, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 7090 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.9495313763618469, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 7100 + }, + { + "epoch": 4.763819095477387, + "grad_norm": 0.8598204255104065, + "learning_rate": 0.0002, + "loss": 1.4504, + "step": 7110 + }, + { + "epoch": 4.7705192629815745, + "grad_norm": 0.8951472640037537, + "learning_rate": 0.0002, + "loss": 1.3969, + "step": 7120 + }, + { + "epoch": 4.777219430485762, + "grad_norm": 0.9110309481620789, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 7130 + }, + { + "epoch": 4.78391959798995, + "grad_norm": 0.7929584980010986, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 7140 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7415322661399841, + "learning_rate": 0.0002, + "loss": 1.467, + "step": 7150 + }, + { + "epoch": 4.797319932998325, + "grad_norm": 0.7504757046699524, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 7160 + }, + { + "epoch": 4.8040201005025125, + "grad_norm": 0.7166924476623535, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 7170 + }, + { + "epoch": 4.8107202680067, + "grad_norm": 0.7728400826454163, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 7180 + }, + { + "epoch": 4.817420435510888, + "grad_norm": 0.7992154955863953, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 7190 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.8655321002006531, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 7200 + }, + { + "epoch": 4.830820770519263, + "grad_norm": 0.7672632336616516, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 7210 + }, + { + "epoch": 4.8375209380234505, + "grad_norm": 0.708416223526001, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7220 + }, + { + "epoch": 4.844221105527638, + "grad_norm": 0.8914081454277039, + "learning_rate": 0.0002, + "loss": 1.5413, + "step": 7230 + }, + { + "epoch": 4.850921273031826, + "grad_norm": 0.7141931653022766, + "learning_rate": 0.0002, + "loss": 1.3569, + "step": 7240 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.6913040280342102, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 7250 + }, + { + "epoch": 4.864321608040201, + "grad_norm": 0.7871233820915222, + "learning_rate": 0.0002, + "loss": 1.3912, + "step": 7260 + }, + { + "epoch": 4.8710217755443885, + "grad_norm": 0.8466277122497559, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 7270 + }, + { + "epoch": 4.877721943048576, + "grad_norm": 0.8492183685302734, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 7280 + }, + { + "epoch": 4.884422110552764, + "grad_norm": 0.8339574933052063, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 7290 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.787022590637207, + "learning_rate": 0.0002, + "loss": 1.4157, + "step": 7300 + }, + { + "epoch": 4.897822445561139, + "grad_norm": 0.8877332806587219, + "learning_rate": 0.0002, + "loss": 1.3725, + "step": 7310 + }, + { + "epoch": 4.9045226130653266, + "grad_norm": 0.744989812374115, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 7320 + }, + { + "epoch": 4.911222780569514, + "grad_norm": 0.8027268648147583, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 7330 + }, + { + "epoch": 4.917922948073702, + "grad_norm": 0.6437455415725708, + "learning_rate": 0.0002, + "loss": 1.425, + "step": 7340 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.685999870300293, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 7350 + }, + { + "epoch": 4.931323283082077, + "grad_norm": 0.9086187481880188, + "learning_rate": 0.0002, + "loss": 1.4352, + "step": 7360 + }, + { + "epoch": 4.938023450586265, + "grad_norm": 0.8272411227226257, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 7370 + }, + { + "epoch": 4.944723618090452, + "grad_norm": 0.9227852821350098, + "learning_rate": 0.0002, + "loss": 1.4226, + "step": 7380 + }, + { + "epoch": 4.95142378559464, + "grad_norm": 0.7688441276550293, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 7390 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8662643432617188, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 7400 + }, + { + "epoch": 4.964824120603015, + "grad_norm": 0.9234127998352051, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 7410 + }, + { + "epoch": 4.971524288107203, + "grad_norm": 0.9131470918655396, + "learning_rate": 0.0002, + "loss": 1.4009, + "step": 7420 + }, + { + "epoch": 4.97822445561139, + "grad_norm": 0.7377504110336304, + "learning_rate": 0.0002, + "loss": 1.4544, + "step": 7430 + }, + { + "epoch": 4.984924623115578, + "grad_norm": 0.8762801289558411, + "learning_rate": 0.0002, + "loss": 1.4008, + "step": 7440 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7919872999191284, + "learning_rate": 0.0002, + "loss": 1.4304, + "step": 7450 + }, + { + "epoch": 4.998324958123953, + "grad_norm": 0.7144299149513245, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 7460 + }, + { + "epoch": 4.99966499162479, + "eval_loss": 1.9291157722473145, + "eval_runtime": 37.9831, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 7462 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.453477907857408e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-7462/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5b40d70adec9032ad731a772b6872ee7ddd87f63 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c763944653cdd95b76a02dcbf86e28f4756a6d4cc892f064a27d4ddc8f9133 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9ef194f0a312c74510403471b8a43ca27a9ac3e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d79583f48db1e65beafca5194fdbbcf716952c7ca105d9d491cbfac04d6e093 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6263b5766bc70d4dcd4ab4a3290016f260d99227 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb6a4d75a6ec8bc5a512db762e8603fc7707c39c1e931dd5c514a1b914b1314 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..145d0f09e381e5a002d1f525781535e205bde3a8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea89e9fb267ad0bf2939fed06eb40079c33ce20fc2f9224d95bbb89c5510bab1 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16edd060a824a69118f566a12abeebbab3325edf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/trainer_state.json @@ -0,0 +1,6346 @@ +{ + "best_metric": 1.8028968572616577, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8955, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006700167504187605, + "grad_norm": 0.565915048122406, + "learning_rate": 0.0002, + "loss": 2.6189, + "step": 10 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5004463791847229, + "learning_rate": 0.0002, + "loss": 2.3162, + "step": 20 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.511043906211853, + "learning_rate": 0.0002, + "loss": 2.0576, + "step": 30 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.47327178716659546, + "learning_rate": 0.0002, + "loss": 2.0085, + "step": 40 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.5511676669120789, + "learning_rate": 0.0002, + "loss": 2.0276, + "step": 50 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.4666278064250946, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 60 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 0.5310961008071899, + "learning_rate": 0.0002, + "loss": 1.8413, + "step": 70 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.5606027245521545, + "learning_rate": 0.0002, + "loss": 1.8711, + "step": 80 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.4934779703617096, + "learning_rate": 0.0002, + "loss": 1.9282, + "step": 90 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4821869730949402, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 100 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 0.5262084603309631, + "learning_rate": 0.0002, + "loss": 1.8628, + "step": 110 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.3774230182170868, + "learning_rate": 0.0002, + "loss": 1.8347, + "step": 120 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 0.34137430787086487, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 130 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.407272070646286, + "learning_rate": 0.0002, + "loss": 1.861, + "step": 140 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.4011937975883484, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 150 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.4432467222213745, + "learning_rate": 0.0002, + "loss": 1.9317, + "step": 160 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 0.44030463695526123, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 170 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3799569308757782, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 180 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 0.33721521496772766, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 190 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4096226692199707, + "learning_rate": 0.0002, + "loss": 1.8269, + "step": 200 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.37374693155288696, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 210 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3249480128288269, + "learning_rate": 0.0002, + "loss": 1.8901, + "step": 220 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 0.3612042963504791, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 230 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3686671257019043, + "learning_rate": 0.0002, + "loss": 1.7585, + "step": 240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.3521044850349426, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 250 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.4073677361011505, + "learning_rate": 0.0002, + "loss": 1.8623, + "step": 260 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.34522193670272827, + "learning_rate": 0.0002, + "loss": 1.8026, + "step": 270 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.4121900498867035, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 280 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 0.3544778525829315, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 290 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3482133448123932, + "learning_rate": 0.0002, + "loss": 1.8787, + "step": 300 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 0.3421826660633087, + "learning_rate": 0.0002, + "loss": 1.8578, + "step": 310 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.5024696588516235, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 320 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.36013063788414, + "learning_rate": 0.0002, + "loss": 1.8607, + "step": 330 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.3611244857311249, + "learning_rate": 0.0002, + "loss": 1.9075, + "step": 340 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.39244529604911804, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 350 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3299325704574585, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 360 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 0.3994322419166565, + "learning_rate": 0.0002, + "loss": 1.8028, + "step": 370 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.3559151887893677, + "learning_rate": 0.0002, + "loss": 1.8321, + "step": 380 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.3873756229877472, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 390 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3710744082927704, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 400 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 0.3618465065956116, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 410 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.30063769221305847, + "learning_rate": 0.0002, + "loss": 1.8529, + "step": 420 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 0.3695628345012665, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 430 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.31451135873794556, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 440 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 450 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.33475354313850403, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 460 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 0.33933115005493164, + "learning_rate": 0.0002, + "loss": 1.8805, + "step": 470 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.3264943063259125, + "learning_rate": 0.0002, + "loss": 1.7564, + "step": 480 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 0.40188100934028625, + "learning_rate": 0.0002, + "loss": 1.8428, + "step": 490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.37408649921417236, + "learning_rate": 0.0002, + "loss": 1.7624, + "step": 500 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.33925938606262207, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 510 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.36836713552474976, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 520 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 0.37284499406814575, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 530 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.3192278742790222, + "learning_rate": 0.0002, + "loss": 1.8379, + "step": 540 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.30233290791511536, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 550 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.3340817391872406, + "learning_rate": 0.0002, + "loss": 1.8799, + "step": 560 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.32600095868110657, + "learning_rate": 0.0002, + "loss": 1.8404, + "step": 570 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.33711278438568115, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 580 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 0.34890690445899963, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 590 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.38238924741744995, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 600 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 0.34399354457855225, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 610 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.3346073627471924, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 620 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.3545648157596588, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 630 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.3378899097442627, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 640 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3255569040775299, + "learning_rate": 0.0002, + "loss": 1.804, + "step": 650 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.34880587458610535, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 660 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 0.3402383625507355, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 670 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.3594033718109131, + "learning_rate": 0.0002, + "loss": 1.8131, + "step": 680 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.31000566482543945, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 690 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.37229061126708984, + "learning_rate": 0.0002, + "loss": 1.7521, + "step": 700 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 0.315801739692688, + "learning_rate": 0.0002, + "loss": 1.7779, + "step": 710 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.3220832645893097, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 720 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 0.3435456156730652, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 730 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.30380892753601074, + "learning_rate": 0.0002, + "loss": 1.8844, + "step": 740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3555026054382324, + "learning_rate": 0.0002, + "loss": 1.7792, + "step": 750 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.3019855320453644, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 760 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 0.309111088514328, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 770 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.366020530462265, + "learning_rate": 0.0002, + "loss": 1.7913, + "step": 780 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 0.3267050087451935, + "learning_rate": 0.0002, + "loss": 1.8008, + "step": 790 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.34265750646591187, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 800 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.313669890165329, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 810 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.3355236053466797, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 820 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 0.3186608552932739, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 830 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.30357518792152405, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 840 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.3990040123462677, + "learning_rate": 0.0002, + "loss": 1.769, + "step": 850 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.34363803267478943, + "learning_rate": 0.0002, + "loss": 1.7482, + "step": 860 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.3757908046245575, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 870 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.3359757661819458, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 880 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 0.5555329918861389, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 890 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.4046323895454407, + "learning_rate": 0.0002, + "loss": 1.7715, + "step": 900 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 0.29834219813346863, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 910 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.3241238594055176, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 920 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.35154739022254944, + "learning_rate": 0.0002, + "loss": 1.8342, + "step": 930 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.3287706673145294, + "learning_rate": 0.0002, + "loss": 1.8076, + "step": 940 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.35670626163482666, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 950 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.6114104986190796, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 960 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 0.3186565041542053, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 970 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.27164125442504883, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 980 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.34407344460487366, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.368415892124176, + "learning_rate": 0.0002, + "loss": 1.855, + "step": 1000 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 0.3306390643119812, + "learning_rate": 0.0002, + "loss": 1.7821, + "step": 1010 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.3198648989200592, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 1020 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 0.3092987537384033, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 1030 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.3090653419494629, + "learning_rate": 0.0002, + "loss": 1.7689, + "step": 1040 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.3485880196094513, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 1050 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.35782721638679504, + "learning_rate": 0.0002, + "loss": 1.8187, + "step": 1060 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 0.34256869554519653, + "learning_rate": 0.0002, + "loss": 1.806, + "step": 1070 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.30461037158966064, + "learning_rate": 0.0002, + "loss": 1.7873, + "step": 1080 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 0.3398691713809967, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1090 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.3180808126926422, + "learning_rate": 0.0002, + "loss": 1.8756, + "step": 1100 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.34400665760040283, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1110 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.34244877099990845, + "learning_rate": 0.0002, + "loss": 1.7851, + "step": 1120 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 0.29946693778038025, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1130 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.37547236680984497, + "learning_rate": 0.0002, + "loss": 1.7456, + "step": 1140 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.3263005018234253, + "learning_rate": 0.0002, + "loss": 1.8425, + "step": 1150 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.41363608837127686, + "learning_rate": 0.0002, + "loss": 1.7222, + "step": 1160 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.36267954111099243, + "learning_rate": 0.0002, + "loss": 1.7836, + "step": 1170 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.31789499521255493, + "learning_rate": 0.0002, + "loss": 1.9183, + "step": 1180 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 0.5708149075508118, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1190 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.322099506855011, + "learning_rate": 0.0002, + "loss": 1.6908, + "step": 1200 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 0.3419909179210663, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1210 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.36286255717277527, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 1220 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.33992862701416016, + "learning_rate": 0.0002, + "loss": 1.8409, + "step": 1230 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.32622793316841125, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 1240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3036167621612549, + "learning_rate": 0.0002, + "loss": 1.8098, + "step": 1250 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.3182215392589569, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 1260 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 0.3270018696784973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1270 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.32652342319488525, + "learning_rate": 0.0002, + "loss": 1.798, + "step": 1280 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.3631329834461212, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 1290 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.36706018447875977, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 1300 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 0.3347418010234833, + "learning_rate": 0.0002, + "loss": 1.8178, + "step": 1310 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.34371060132980347, + "learning_rate": 0.0002, + "loss": 1.7824, + "step": 1320 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 0.3029090166091919, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.34700682759284973, + "learning_rate": 0.0002, + "loss": 1.8017, + "step": 1340 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.35574328899383545, + "learning_rate": 0.0002, + "loss": 1.7998, + "step": 1350 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.30928221344947815, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 0.30652928352355957, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 1370 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.3838157653808594, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 1380 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 0.31655240058898926, + "learning_rate": 0.0002, + "loss": 1.7977, + "step": 1390 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.41737303137779236, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1400 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.3227267861366272, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1410 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.3729925751686096, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1420 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 0.30779409408569336, + "learning_rate": 0.0002, + "loss": 1.8221, + "step": 1430 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.334379643201828, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1440 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.3568236231803894, + "learning_rate": 0.0002, + "loss": 1.7141, + "step": 1450 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.33310577273368835, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1460 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.2972261905670166, + "learning_rate": 0.0002, + "loss": 1.8511, + "step": 1470 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.3322717845439911, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 1480 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 0.3276330828666687, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 1490 + }, + { + "epoch": 0.9996649916247906, + "eval_loss": 1.8036354780197144, + "eval_runtime": 37.8949, + "eval_samples_per_second": 13.59, + "eval_steps_per_second": 1.715, + "step": 1492 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.29252371191978455, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1500 + }, + { + "epoch": 1.0117252931323284, + "grad_norm": 0.31607162952423096, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 1510 + }, + { + "epoch": 1.018425460636516, + "grad_norm": 0.32294467091560364, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1520 + }, + { + "epoch": 1.0251256281407035, + "grad_norm": 0.3868017792701721, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 1530 + }, + { + "epoch": 1.031825795644891, + "grad_norm": 0.3178282082080841, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 1540 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.3706750273704529, + "learning_rate": 0.0002, + "loss": 1.7136, + "step": 1550 + }, + { + "epoch": 1.0452261306532664, + "grad_norm": 0.33930912613868713, + "learning_rate": 0.0002, + "loss": 1.7382, + "step": 1560 + }, + { + "epoch": 1.051926298157454, + "grad_norm": 0.33970504999160767, + "learning_rate": 0.0002, + "loss": 1.7602, + "step": 1570 + }, + { + "epoch": 1.0586264656616415, + "grad_norm": 0.42553383111953735, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1580 + }, + { + "epoch": 1.065326633165829, + "grad_norm": 0.3772421181201935, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 1590 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.34212902188301086, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1600 + }, + { + "epoch": 1.0787269681742044, + "grad_norm": 0.3798283338546753, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1610 + }, + { + "epoch": 1.085427135678392, + "grad_norm": 0.36909598112106323, + "learning_rate": 0.0002, + "loss": 1.7468, + "step": 1620 + }, + { + "epoch": 1.0921273031825796, + "grad_norm": 0.3344230651855469, + "learning_rate": 0.0002, + "loss": 1.7807, + "step": 1630 + }, + { + "epoch": 1.0988274706867671, + "grad_norm": 0.3862569332122803, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1640 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.31188511848449707, + "learning_rate": 0.0002, + "loss": 1.7163, + "step": 1650 + }, + { + "epoch": 1.1122278056951425, + "grad_norm": 0.3563670814037323, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 1660 + }, + { + "epoch": 1.11892797319933, + "grad_norm": 0.35052165389060974, + "learning_rate": 0.0002, + "loss": 1.7718, + "step": 1670 + }, + { + "epoch": 1.1256281407035176, + "grad_norm": 0.3285699188709259, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 1680 + }, + { + "epoch": 1.1323283082077051, + "grad_norm": 0.3639393746852875, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1690 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.3842753767967224, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 1700 + }, + { + "epoch": 1.1457286432160805, + "grad_norm": 0.3624933063983917, + "learning_rate": 0.0002, + "loss": 1.7002, + "step": 1710 + }, + { + "epoch": 1.152428810720268, + "grad_norm": 0.3641220033168793, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1720 + }, + { + "epoch": 1.1591289782244556, + "grad_norm": 0.32765355706214905, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1730 + }, + { + "epoch": 1.1658291457286432, + "grad_norm": 0.34974896907806396, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 1740 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3910926580429077, + "learning_rate": 0.0002, + "loss": 1.7273, + "step": 1750 + }, + { + "epoch": 1.1792294807370185, + "grad_norm": 0.3564300537109375, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 1760 + }, + { + "epoch": 1.185929648241206, + "grad_norm": 0.34822574257850647, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1770 + }, + { + "epoch": 1.1926298157453936, + "grad_norm": 0.36185044050216675, + "learning_rate": 0.0002, + "loss": 1.7256, + "step": 1780 + }, + { + "epoch": 1.1993299832495812, + "grad_norm": 0.34866711497306824, + "learning_rate": 0.0002, + "loss": 1.6431, + "step": 1790 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.4017769992351532, + "learning_rate": 0.0002, + "loss": 1.8084, + "step": 1800 + }, + { + "epoch": 1.2127303182579565, + "grad_norm": 0.32930681109428406, + "learning_rate": 0.0002, + "loss": 1.6985, + "step": 1810 + }, + { + "epoch": 1.219430485762144, + "grad_norm": 0.35951921343803406, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1820 + }, + { + "epoch": 1.2261306532663316, + "grad_norm": 0.37366992235183716, + "learning_rate": 0.0002, + "loss": 1.6933, + "step": 1830 + }, + { + "epoch": 1.2328308207705192, + "grad_norm": 0.3565689027309418, + "learning_rate": 0.0002, + "loss": 1.6737, + "step": 1840 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.3692343533039093, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 1850 + }, + { + "epoch": 1.2462311557788945, + "grad_norm": 0.38426971435546875, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 1860 + }, + { + "epoch": 1.252931323283082, + "grad_norm": 0.33559855818748474, + "learning_rate": 0.0002, + "loss": 1.7031, + "step": 1870 + }, + { + "epoch": 1.2596314907872697, + "grad_norm": 0.34181106090545654, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1880 + }, + { + "epoch": 1.2663316582914572, + "grad_norm": 0.3916318416595459, + "learning_rate": 0.0002, + "loss": 1.7707, + "step": 1890 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3887825012207031, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 1900 + }, + { + "epoch": 1.2797319932998326, + "grad_norm": 0.33583927154541016, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1910 + }, + { + "epoch": 1.2864321608040201, + "grad_norm": 0.37639349699020386, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1920 + }, + { + "epoch": 1.2931323283082077, + "grad_norm": 0.38059428334236145, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1930 + }, + { + "epoch": 1.2998324958123952, + "grad_norm": 0.37253183126449585, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 1940 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.37371566891670227, + "learning_rate": 0.0002, + "loss": 1.6758, + "step": 1950 + }, + { + "epoch": 1.3132328308207706, + "grad_norm": 0.4080910086631775, + "learning_rate": 0.0002, + "loss": 1.6788, + "step": 1960 + }, + { + "epoch": 1.3199329983249581, + "grad_norm": 0.3174354135990143, + "learning_rate": 0.0002, + "loss": 1.6518, + "step": 1970 + }, + { + "epoch": 1.3266331658291457, + "grad_norm": 0.4518888294696808, + "learning_rate": 0.0002, + "loss": 1.7925, + "step": 1980 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3627921938896179, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 1990 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3655930161476135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 2000 + }, + { + "epoch": 1.3467336683417086, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.0002, + "loss": 1.7016, + "step": 2010 + }, + { + "epoch": 1.3534338358458962, + "grad_norm": 0.4281129240989685, + "learning_rate": 0.0002, + "loss": 1.7359, + "step": 2020 + }, + { + "epoch": 1.3601340033500837, + "grad_norm": 0.3821414113044739, + "learning_rate": 0.0002, + "loss": 1.6884, + "step": 2030 + }, + { + "epoch": 1.3668341708542713, + "grad_norm": 0.3907586336135864, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 2040 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37792932987213135, + "learning_rate": 0.0002, + "loss": 1.7424, + "step": 2050 + }, + { + "epoch": 1.3802345058626466, + "grad_norm": 0.3693985641002655, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 2060 + }, + { + "epoch": 1.3869346733668342, + "grad_norm": 0.32275936007499695, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 2070 + }, + { + "epoch": 1.3936348408710217, + "grad_norm": 0.3789440095424652, + "learning_rate": 0.0002, + "loss": 1.6677, + "step": 2080 + }, + { + "epoch": 1.4003350083752093, + "grad_norm": 0.3638380467891693, + "learning_rate": 0.0002, + "loss": 1.6825, + "step": 2090 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3495481610298157, + "learning_rate": 0.0002, + "loss": 1.6542, + "step": 2100 + }, + { + "epoch": 1.4137353433835846, + "grad_norm": 0.37920597195625305, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 2110 + }, + { + "epoch": 1.4204355108877722, + "grad_norm": 0.37218064069747925, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 2120 + }, + { + "epoch": 1.4271356783919598, + "grad_norm": 0.38074082136154175, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 2130 + }, + { + "epoch": 1.4338358458961473, + "grad_norm": 0.3455527126789093, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 2140 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.3712003529071808, + "learning_rate": 0.0002, + "loss": 1.776, + "step": 2150 + }, + { + "epoch": 1.4472361809045227, + "grad_norm": 0.3786754906177521, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2160 + }, + { + "epoch": 1.4539363484087102, + "grad_norm": 0.3879223167896271, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 2170 + }, + { + "epoch": 1.4606365159128978, + "grad_norm": 0.38738805055618286, + "learning_rate": 0.0002, + "loss": 1.7, + "step": 2180 + }, + { + "epoch": 1.4673366834170856, + "grad_norm": 0.39768800139427185, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2190 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.4172441065311432, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 2200 + }, + { + "epoch": 1.4807370184254607, + "grad_norm": 0.4043174982070923, + "learning_rate": 0.0002, + "loss": 1.6736, + "step": 2210 + }, + { + "epoch": 1.4874371859296482, + "grad_norm": 0.3750883936882019, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 2220 + }, + { + "epoch": 1.4941373534338358, + "grad_norm": 0.3552253246307373, + "learning_rate": 0.0002, + "loss": 1.6861, + "step": 2230 + }, + { + "epoch": 1.5008375209380236, + "grad_norm": 0.34607139229774475, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2240 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.3406706750392914, + "learning_rate": 0.0002, + "loss": 1.6962, + "step": 2250 + }, + { + "epoch": 1.5142378559463987, + "grad_norm": 0.36654895544052124, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 2260 + }, + { + "epoch": 1.5209380234505863, + "grad_norm": 0.3914054334163666, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2270 + }, + { + "epoch": 1.5276381909547738, + "grad_norm": 0.42012137174606323, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 2280 + }, + { + "epoch": 1.5343383584589616, + "grad_norm": 0.39563435316085815, + "learning_rate": 0.0002, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.3508438766002655, + "learning_rate": 0.0002, + "loss": 1.7491, + "step": 2300 + }, + { + "epoch": 1.5477386934673367, + "grad_norm": 0.3785218596458435, + "learning_rate": 0.0002, + "loss": 1.7727, + "step": 2310 + }, + { + "epoch": 1.5544388609715243, + "grad_norm": 0.39377647638320923, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 2320 + }, + { + "epoch": 1.5611390284757118, + "grad_norm": 0.3391438126564026, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2330 + }, + { + "epoch": 1.5678391959798996, + "grad_norm": 0.37944263219833374, + "learning_rate": 0.0002, + "loss": 1.7722, + "step": 2340 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3523491322994232, + "learning_rate": 0.0002, + "loss": 1.6371, + "step": 2350 + }, + { + "epoch": 1.5812395309882747, + "grad_norm": 0.3911575973033905, + "learning_rate": 0.0002, + "loss": 1.7583, + "step": 2360 + }, + { + "epoch": 1.5879396984924623, + "grad_norm": 0.33832186460494995, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 2370 + }, + { + "epoch": 1.5946398659966499, + "grad_norm": 0.3665979206562042, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2380 + }, + { + "epoch": 1.6013400335008376, + "grad_norm": 0.3871748149394989, + "learning_rate": 0.0002, + "loss": 1.779, + "step": 2390 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3586967885494232, + "learning_rate": 0.0002, + "loss": 1.7109, + "step": 2400 + }, + { + "epoch": 1.6147403685092128, + "grad_norm": 0.3563673198223114, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 2410 + }, + { + "epoch": 1.6214405360134003, + "grad_norm": 0.37588971853256226, + "learning_rate": 0.0002, + "loss": 1.745, + "step": 2420 + }, + { + "epoch": 1.6281407035175879, + "grad_norm": 0.352556437253952, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 2430 + }, + { + "epoch": 1.6348408710217757, + "grad_norm": 0.3716259300708771, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2440 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.372001975774765, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2450 + }, + { + "epoch": 1.6482412060301508, + "grad_norm": 0.3430042862892151, + "learning_rate": 0.0002, + "loss": 1.6584, + "step": 2460 + }, + { + "epoch": 1.6549413735343383, + "grad_norm": 0.3741483688354492, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2470 + }, + { + "epoch": 1.661641541038526, + "grad_norm": 0.3610571324825287, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 2480 + }, + { + "epoch": 1.6683417085427137, + "grad_norm": 0.4204719066619873, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2490 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3938186466693878, + "learning_rate": 0.0002, + "loss": 1.7954, + "step": 2500 + }, + { + "epoch": 1.6817420435510888, + "grad_norm": 0.3421435058116913, + "learning_rate": 0.0002, + "loss": 1.6633, + "step": 2510 + }, + { + "epoch": 1.6884422110552764, + "grad_norm": 0.42441412806510925, + "learning_rate": 0.0002, + "loss": 1.7996, + "step": 2520 + }, + { + "epoch": 1.695142378559464, + "grad_norm": 0.38071519136428833, + "learning_rate": 0.0002, + "loss": 1.7142, + "step": 2530 + }, + { + "epoch": 1.7018425460636517, + "grad_norm": 0.34078919887542725, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 2540 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.412844181060791, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 2550 + }, + { + "epoch": 1.7152428810720268, + "grad_norm": 0.3753604292869568, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 2560 + }, + { + "epoch": 1.7219430485762144, + "grad_norm": 0.41588476300239563, + "learning_rate": 0.0002, + "loss": 1.7011, + "step": 2570 + }, + { + "epoch": 1.728643216080402, + "grad_norm": 0.35504111647605896, + "learning_rate": 0.0002, + "loss": 1.6427, + "step": 2580 + }, + { + "epoch": 1.7353433835845897, + "grad_norm": 0.36909720301628113, + "learning_rate": 0.0002, + "loss": 1.7296, + "step": 2590 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.4149979054927826, + "learning_rate": 0.0002, + "loss": 1.7022, + "step": 2600 + }, + { + "epoch": 1.7487437185929648, + "grad_norm": 0.38859328627586365, + "learning_rate": 0.0002, + "loss": 1.77, + "step": 2610 + }, + { + "epoch": 1.7554438860971524, + "grad_norm": 0.36738792061805725, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2620 + }, + { + "epoch": 1.76214405360134, + "grad_norm": 0.3968178927898407, + "learning_rate": 0.0002, + "loss": 1.764, + "step": 2630 + }, + { + "epoch": 1.7688442211055277, + "grad_norm": 0.3972901999950409, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 2640 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3949959874153137, + "learning_rate": 0.0002, + "loss": 1.6774, + "step": 2650 + }, + { + "epoch": 1.7822445561139029, + "grad_norm": 0.44074657559394836, + "learning_rate": 0.0002, + "loss": 1.7247, + "step": 2660 + }, + { + "epoch": 1.7889447236180904, + "grad_norm": 0.39743664860725403, + "learning_rate": 0.0002, + "loss": 1.7188, + "step": 2670 + }, + { + "epoch": 1.795644891122278, + "grad_norm": 0.3950406610965729, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2680 + }, + { + "epoch": 1.8023450586264658, + "grad_norm": 0.3568263649940491, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2690 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.3819476366043091, + "learning_rate": 0.0002, + "loss": 1.6735, + "step": 2700 + }, + { + "epoch": 1.8157453936348409, + "grad_norm": 0.3480634391307831, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 2710 + }, + { + "epoch": 1.8224455611390284, + "grad_norm": 0.3875853419303894, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2720 + }, + { + "epoch": 1.829145728643216, + "grad_norm": 0.3441337049007416, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2730 + }, + { + "epoch": 1.8358458961474038, + "grad_norm": 0.35692882537841797, + "learning_rate": 0.0002, + "loss": 1.7647, + "step": 2740 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.36959215998649597, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2750 + }, + { + "epoch": 1.849246231155779, + "grad_norm": 0.3893393278121948, + "learning_rate": 0.0002, + "loss": 1.7657, + "step": 2760 + }, + { + "epoch": 1.8559463986599665, + "grad_norm": 0.37817293405532837, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2770 + }, + { + "epoch": 1.862646566164154, + "grad_norm": 0.36071285605430603, + "learning_rate": 0.0002, + "loss": 1.761, + "step": 2780 + }, + { + "epoch": 1.8693467336683418, + "grad_norm": 0.3758420944213867, + "learning_rate": 0.0002, + "loss": 1.7623, + "step": 2790 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3889938294887543, + "learning_rate": 0.0002, + "loss": 1.6743, + "step": 2800 + }, + { + "epoch": 1.882747068676717, + "grad_norm": 0.34361857175827026, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 2810 + }, + { + "epoch": 1.8894472361809045, + "grad_norm": 0.39283323287963867, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2820 + }, + { + "epoch": 1.896147403685092, + "grad_norm": 0.3919452726840973, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 2830 + }, + { + "epoch": 1.9028475711892798, + "grad_norm": 0.38215070962905884, + "learning_rate": 0.0002, + "loss": 1.673, + "step": 2840 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.4235064387321472, + "learning_rate": 0.0002, + "loss": 1.7044, + "step": 2850 + }, + { + "epoch": 1.916247906197655, + "grad_norm": 0.35694634914398193, + "learning_rate": 0.0002, + "loss": 1.7123, + "step": 2860 + }, + { + "epoch": 1.9229480737018425, + "grad_norm": 0.383492112159729, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 2870 + }, + { + "epoch": 1.92964824120603, + "grad_norm": 0.5945147275924683, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 2880 + }, + { + "epoch": 1.9363484087102178, + "grad_norm": 0.3367522358894348, + "learning_rate": 0.0002, + "loss": 1.7421, + "step": 2890 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.35300394892692566, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2900 + }, + { + "epoch": 1.949748743718593, + "grad_norm": 0.38084495067596436, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 2910 + }, + { + "epoch": 1.9564489112227805, + "grad_norm": 0.37559160590171814, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 2920 + }, + { + "epoch": 1.963149078726968, + "grad_norm": 0.3661738336086273, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 2930 + }, + { + "epoch": 1.9698492462311559, + "grad_norm": 0.4073849320411682, + "learning_rate": 0.0002, + "loss": 1.7643, + "step": 2940 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3723304271697998, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 2950 + }, + { + "epoch": 1.983249581239531, + "grad_norm": 0.3991098999977112, + "learning_rate": 0.0002, + "loss": 1.7611, + "step": 2960 + }, + { + "epoch": 1.9899497487437185, + "grad_norm": 0.3947085440158844, + "learning_rate": 0.0002, + "loss": 1.7263, + "step": 2970 + }, + { + "epoch": 1.996649916247906, + "grad_norm": 0.3786258399486542, + "learning_rate": 0.0002, + "loss": 1.7217, + "step": 2980 + }, + { + "epoch": 2.0, + "eval_loss": 1.8028968572616577, + "eval_runtime": 37.8985, + "eval_samples_per_second": 13.589, + "eval_steps_per_second": 1.715, + "step": 2985 + }, + { + "epoch": 2.003350083752094, + "grad_norm": 0.34824079275131226, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2990 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.3394894003868103, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3000 + }, + { + "epoch": 2.016750418760469, + "grad_norm": 0.36910977959632874, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3010 + }, + { + "epoch": 2.023450586264657, + "grad_norm": 0.45000967383384705, + "learning_rate": 0.0002, + "loss": 1.6105, + "step": 3020 + }, + { + "epoch": 2.030150753768844, + "grad_norm": 0.3791407346725464, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3030 + }, + { + "epoch": 2.036850921273032, + "grad_norm": 0.387321799993515, + "learning_rate": 0.0002, + "loss": 1.5832, + "step": 3040 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.4185757040977478, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3050 + }, + { + "epoch": 2.050251256281407, + "grad_norm": 0.45110777020454407, + "learning_rate": 0.0002, + "loss": 1.5696, + "step": 3060 + }, + { + "epoch": 2.056951423785595, + "grad_norm": 0.42663660645484924, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 3070 + }, + { + "epoch": 2.063651591289782, + "grad_norm": 0.4546292722225189, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 3080 + }, + { + "epoch": 2.07035175879397, + "grad_norm": 0.3979759216308594, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3090 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.43596673011779785, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3100 + }, + { + "epoch": 2.083752093802345, + "grad_norm": 0.40120232105255127, + "learning_rate": 0.0002, + "loss": 1.5441, + "step": 3110 + }, + { + "epoch": 2.090452261306533, + "grad_norm": 0.44449281692504883, + "learning_rate": 0.0002, + "loss": 1.6309, + "step": 3120 + }, + { + "epoch": 2.09715242881072, + "grad_norm": 0.42672568559646606, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 3130 + }, + { + "epoch": 2.103852596314908, + "grad_norm": 0.4232690930366516, + "learning_rate": 0.0002, + "loss": 1.682, + "step": 3140 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.4299317002296448, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 3150 + }, + { + "epoch": 2.117252931323283, + "grad_norm": 0.4067758023738861, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 3160 + }, + { + "epoch": 2.123953098827471, + "grad_norm": 0.4918815791606903, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 3170 + }, + { + "epoch": 2.130653266331658, + "grad_norm": 0.4140559732913971, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3180 + }, + { + "epoch": 2.137353433835846, + "grad_norm": 0.4555995464324951, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 3190 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.42943915724754333, + "learning_rate": 0.0002, + "loss": 1.5888, + "step": 3200 + }, + { + "epoch": 2.150753768844221, + "grad_norm": 0.4730435013771057, + "learning_rate": 0.0002, + "loss": 1.5886, + "step": 3210 + }, + { + "epoch": 2.157453936348409, + "grad_norm": 0.43310216069221497, + "learning_rate": 0.0002, + "loss": 1.6022, + "step": 3220 + }, + { + "epoch": 2.164154103852596, + "grad_norm": 0.42054110765457153, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 3230 + }, + { + "epoch": 2.170854271356784, + "grad_norm": 0.4897233247756958, + "learning_rate": 0.0002, + "loss": 1.6749, + "step": 3240 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.42194533348083496, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 3250 + }, + { + "epoch": 2.184254606365159, + "grad_norm": 0.44494450092315674, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3260 + }, + { + "epoch": 2.190954773869347, + "grad_norm": 0.43524879217147827, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 3270 + }, + { + "epoch": 2.1976549413735342, + "grad_norm": 0.4621117413043976, + "learning_rate": 0.0002, + "loss": 1.528, + "step": 3280 + }, + { + "epoch": 2.204355108877722, + "grad_norm": 0.4073285460472107, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 3290 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.47868335247039795, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 3300 + }, + { + "epoch": 2.217755443886097, + "grad_norm": 0.4264970123767853, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 3310 + }, + { + "epoch": 2.224455611390285, + "grad_norm": 0.4491245150566101, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3320 + }, + { + "epoch": 2.2311557788944723, + "grad_norm": 0.4010344445705414, + "learning_rate": 0.0002, + "loss": 1.5881, + "step": 3330 + }, + { + "epoch": 2.23785594639866, + "grad_norm": 0.4232759177684784, + "learning_rate": 0.0002, + "loss": 1.6684, + "step": 3340 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5099776983261108, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3350 + }, + { + "epoch": 2.251256281407035, + "grad_norm": 0.5223407745361328, + "learning_rate": 0.0002, + "loss": 1.6764, + "step": 3360 + }, + { + "epoch": 2.257956448911223, + "grad_norm": 0.47818470001220703, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3370 + }, + { + "epoch": 2.2646566164154103, + "grad_norm": 0.4721255898475647, + "learning_rate": 0.0002, + "loss": 1.5946, + "step": 3380 + }, + { + "epoch": 2.271356783919598, + "grad_norm": 0.4113229513168335, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3390 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.507080078125, + "learning_rate": 0.0002, + "loss": 1.6494, + "step": 3400 + }, + { + "epoch": 2.284757118927973, + "grad_norm": 0.4852292239665985, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 3410 + }, + { + "epoch": 2.291457286432161, + "grad_norm": 0.4503684341907501, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 3420 + }, + { + "epoch": 2.2981574539363483, + "grad_norm": 0.8359600305557251, + "learning_rate": 0.0002, + "loss": 1.6649, + "step": 3430 + }, + { + "epoch": 2.304857621440536, + "grad_norm": 0.44604045152664185, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 3440 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.45667049288749695, + "learning_rate": 0.0002, + "loss": 1.5972, + "step": 3450 + }, + { + "epoch": 2.318257956448911, + "grad_norm": 0.4879349172115326, + "learning_rate": 0.0002, + "loss": 1.6667, + "step": 3460 + }, + { + "epoch": 2.324958123953099, + "grad_norm": 0.4033963084220886, + "learning_rate": 0.0002, + "loss": 1.5804, + "step": 3470 + }, + { + "epoch": 2.3316582914572863, + "grad_norm": 0.44494301080703735, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 3480 + }, + { + "epoch": 2.338358458961474, + "grad_norm": 0.4794621765613556, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.41404327750205994, + "learning_rate": 0.0002, + "loss": 1.6807, + "step": 3500 + }, + { + "epoch": 2.351758793969849, + "grad_norm": 0.4664851725101471, + "learning_rate": 0.0002, + "loss": 1.714, + "step": 3510 + }, + { + "epoch": 2.358458961474037, + "grad_norm": 0.4263697564601898, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 3520 + }, + { + "epoch": 2.3651591289782243, + "grad_norm": 0.5035167336463928, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 3530 + }, + { + "epoch": 2.371859296482412, + "grad_norm": 0.4380664527416229, + "learning_rate": 0.0002, + "loss": 1.6208, + "step": 3540 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.5227681994438171, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 3550 + }, + { + "epoch": 2.3852596314907872, + "grad_norm": 0.4382302761077881, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3560 + }, + { + "epoch": 2.391959798994975, + "grad_norm": 0.4392451047897339, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3570 + }, + { + "epoch": 2.3986599664991624, + "grad_norm": 0.4372786581516266, + "learning_rate": 0.0002, + "loss": 1.6626, + "step": 3580 + }, + { + "epoch": 2.40536013400335, + "grad_norm": 0.5015502572059631, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 3590 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.5653210878372192, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 3600 + }, + { + "epoch": 2.4187604690117253, + "grad_norm": 0.53007972240448, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 3610 + }, + { + "epoch": 2.425460636515913, + "grad_norm": 0.4659176766872406, + "learning_rate": 0.0002, + "loss": 1.6421, + "step": 3620 + }, + { + "epoch": 2.4321608040201004, + "grad_norm": 0.5637837052345276, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 3630 + }, + { + "epoch": 2.438860971524288, + "grad_norm": 0.4248391389846802, + "learning_rate": 0.0002, + "loss": 1.6168, + "step": 3640 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.44668248295783997, + "learning_rate": 0.0002, + "loss": 1.6822, + "step": 3650 + }, + { + "epoch": 2.4522613065326633, + "grad_norm": 0.43990179896354675, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 3660 + }, + { + "epoch": 2.458961474036851, + "grad_norm": 0.4532523453235626, + "learning_rate": 0.0002, + "loss": 1.6723, + "step": 3670 + }, + { + "epoch": 2.4656616415410384, + "grad_norm": 0.6605591773986816, + "learning_rate": 0.0002, + "loss": 1.6957, + "step": 3680 + }, + { + "epoch": 2.472361809045226, + "grad_norm": 0.4694533348083496, + "learning_rate": 0.0002, + "loss": 1.6159, + "step": 3690 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.4485011100769043, + "learning_rate": 0.0002, + "loss": 1.6239, + "step": 3700 + }, + { + "epoch": 2.4857621440536013, + "grad_norm": 0.4761785864830017, + "learning_rate": 0.0002, + "loss": 1.6834, + "step": 3710 + }, + { + "epoch": 2.492462311557789, + "grad_norm": 0.5116432309150696, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 3720 + }, + { + "epoch": 2.4991624790619764, + "grad_norm": 0.49523618817329407, + "learning_rate": 0.0002, + "loss": 1.5054, + "step": 3730 + }, + { + "epoch": 2.505862646566164, + "grad_norm": 0.43826380372047424, + "learning_rate": 0.0002, + "loss": 1.6249, + "step": 3740 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.4916154146194458, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 3750 + }, + { + "epoch": 2.5192629815745393, + "grad_norm": 0.5381299257278442, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 3760 + }, + { + "epoch": 2.525963149078727, + "grad_norm": 0.44947415590286255, + "learning_rate": 0.0002, + "loss": 1.6467, + "step": 3770 + }, + { + "epoch": 2.5326633165829144, + "grad_norm": 0.49979084730148315, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3780 + }, + { + "epoch": 2.539363484087102, + "grad_norm": 0.43046900629997253, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 3790 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.4513470530509949, + "learning_rate": 0.0002, + "loss": 1.6789, + "step": 3800 + }, + { + "epoch": 2.5527638190954773, + "grad_norm": 0.49900051951408386, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3810 + }, + { + "epoch": 2.559463986599665, + "grad_norm": 0.4348420202732086, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 3820 + }, + { + "epoch": 2.5661641541038525, + "grad_norm": 0.4684867560863495, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3830 + }, + { + "epoch": 2.5728643216080402, + "grad_norm": 0.44430989027023315, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 3840 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.47375255823135376, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 3850 + }, + { + "epoch": 2.5862646566164154, + "grad_norm": 0.45493075251579285, + "learning_rate": 0.0002, + "loss": 1.6269, + "step": 3860 + }, + { + "epoch": 2.592964824120603, + "grad_norm": 0.4563275873661041, + "learning_rate": 0.0002, + "loss": 1.604, + "step": 3870 + }, + { + "epoch": 2.5996649916247905, + "grad_norm": 0.46060335636138916, + "learning_rate": 0.0002, + "loss": 1.642, + "step": 3880 + }, + { + "epoch": 2.6063651591289783, + "grad_norm": 0.4718867540359497, + "learning_rate": 0.0002, + "loss": 1.6302, + "step": 3890 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.41570305824279785, + "learning_rate": 0.0002, + "loss": 1.6242, + "step": 3900 + }, + { + "epoch": 2.6197654941373534, + "grad_norm": 0.4603121876716614, + "learning_rate": 0.0002, + "loss": 1.6401, + "step": 3910 + }, + { + "epoch": 2.626465661641541, + "grad_norm": 0.4734652638435364, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 3920 + }, + { + "epoch": 2.6331658291457285, + "grad_norm": 0.45348483324050903, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 3930 + }, + { + "epoch": 2.6398659966499163, + "grad_norm": 0.46559447050094604, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3940 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.44113144278526306, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 3950 + }, + { + "epoch": 2.6532663316582914, + "grad_norm": 0.41415104269981384, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3960 + }, + { + "epoch": 2.659966499162479, + "grad_norm": 0.48868080973625183, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 3970 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.49610549211502075, + "learning_rate": 0.0002, + "loss": 1.6211, + "step": 3980 + }, + { + "epoch": 2.6733668341708543, + "grad_norm": 0.4309130907058716, + "learning_rate": 0.0002, + "loss": 1.6235, + "step": 3990 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.4489327669143677, + "learning_rate": 0.0002, + "loss": 1.6452, + "step": 4000 + }, + { + "epoch": 2.6867671691792294, + "grad_norm": 0.5380139946937561, + "learning_rate": 0.0002, + "loss": 1.5744, + "step": 4010 + }, + { + "epoch": 2.693467336683417, + "grad_norm": 0.5076672434806824, + "learning_rate": 0.0002, + "loss": 1.6524, + "step": 4020 + }, + { + "epoch": 2.7001675041876045, + "grad_norm": 0.47620031237602234, + "learning_rate": 0.0002, + "loss": 1.636, + "step": 4030 + }, + { + "epoch": 2.7068676716917923, + "grad_norm": 0.48089155554771423, + "learning_rate": 0.0002, + "loss": 1.5543, + "step": 4040 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.5108814239501953, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 4050 + }, + { + "epoch": 2.7202680067001674, + "grad_norm": 0.4196513295173645, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 4060 + }, + { + "epoch": 2.726968174204355, + "grad_norm": 0.4574664831161499, + "learning_rate": 0.0002, + "loss": 1.686, + "step": 4070 + }, + { + "epoch": 2.7336683417085426, + "grad_norm": 0.4671640992164612, + "learning_rate": 0.0002, + "loss": 1.6234, + "step": 4080 + }, + { + "epoch": 2.7403685092127303, + "grad_norm": 0.49355530738830566, + "learning_rate": 0.0002, + "loss": 1.6827, + "step": 4090 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.46716663241386414, + "learning_rate": 0.0002, + "loss": 1.6999, + "step": 4100 + }, + { + "epoch": 2.7537688442211055, + "grad_norm": 0.45420581102371216, + "learning_rate": 0.0002, + "loss": 1.6463, + "step": 4110 + }, + { + "epoch": 2.7604690117252932, + "grad_norm": 0.4680487811565399, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4120 + }, + { + "epoch": 2.7671691792294806, + "grad_norm": 0.5375032424926758, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 4130 + }, + { + "epoch": 2.7738693467336684, + "grad_norm": 0.46026280522346497, + "learning_rate": 0.0002, + "loss": 1.5254, + "step": 4140 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.43658447265625, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 4150 + }, + { + "epoch": 2.7872696817420435, + "grad_norm": 0.4935547113418579, + "learning_rate": 0.0002, + "loss": 1.6546, + "step": 4160 + }, + { + "epoch": 2.7939698492462313, + "grad_norm": 0.8167962431907654, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 4170 + }, + { + "epoch": 2.8006700167504186, + "grad_norm": 0.4289683997631073, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 4180 + }, + { + "epoch": 2.8073701842546064, + "grad_norm": 0.4569324254989624, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 4190 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.474795937538147, + "learning_rate": 0.0002, + "loss": 1.6077, + "step": 4200 + }, + { + "epoch": 2.8207705192629815, + "grad_norm": 0.44272229075431824, + "learning_rate": 0.0002, + "loss": 1.6223, + "step": 4210 + }, + { + "epoch": 2.8274706867671693, + "grad_norm": 0.525240957736969, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 4220 + }, + { + "epoch": 2.8341708542713566, + "grad_norm": 0.4802303910255432, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 4230 + }, + { + "epoch": 2.8408710217755444, + "grad_norm": 0.46400442719459534, + "learning_rate": 0.0002, + "loss": 1.6002, + "step": 4240 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.49884888529777527, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 4250 + }, + { + "epoch": 2.8542713567839195, + "grad_norm": 0.5015072226524353, + "learning_rate": 0.0002, + "loss": 1.6919, + "step": 4260 + }, + { + "epoch": 2.8609715242881073, + "grad_norm": 0.4335440695285797, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 4270 + }, + { + "epoch": 2.8676716917922946, + "grad_norm": 0.5131644606590271, + "learning_rate": 0.0002, + "loss": 1.5664, + "step": 4280 + }, + { + "epoch": 2.8743718592964824, + "grad_norm": 0.6977195739746094, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 4290 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5133762955665588, + "learning_rate": 0.0002, + "loss": 1.7192, + "step": 4300 + }, + { + "epoch": 2.8877721943048575, + "grad_norm": 0.4737614393234253, + "learning_rate": 0.0002, + "loss": 1.6257, + "step": 4310 + }, + { + "epoch": 2.8944723618090453, + "grad_norm": 0.4580535590648651, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 4320 + }, + { + "epoch": 2.901172529313233, + "grad_norm": 0.43863341212272644, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 4330 + }, + { + "epoch": 2.9078726968174204, + "grad_norm": 0.4103737473487854, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4340 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.438014417886734, + "learning_rate": 0.0002, + "loss": 1.7106, + "step": 4350 + }, + { + "epoch": 2.9212730318257956, + "grad_norm": 0.5068213939666748, + "learning_rate": 0.0002, + "loss": 1.6025, + "step": 4360 + }, + { + "epoch": 2.9279731993299833, + "grad_norm": 0.45305484533309937, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 4370 + }, + { + "epoch": 2.934673366834171, + "grad_norm": 0.4612090289592743, + "learning_rate": 0.0002, + "loss": 1.5726, + "step": 4380 + }, + { + "epoch": 2.9413735343383585, + "grad_norm": 0.508736789226532, + "learning_rate": 0.0002, + "loss": 1.6536, + "step": 4390 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4924427270889282, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 4400 + }, + { + "epoch": 2.9547738693467336, + "grad_norm": 0.5707460641860962, + "learning_rate": 0.0002, + "loss": 1.7007, + "step": 4410 + }, + { + "epoch": 2.9614740368509214, + "grad_norm": 0.42270299792289734, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 4420 + }, + { + "epoch": 2.968174204355109, + "grad_norm": 0.4429931044578552, + "learning_rate": 0.0002, + "loss": 1.6644, + "step": 4430 + }, + { + "epoch": 2.9748743718592965, + "grad_norm": 0.49760574102401733, + "learning_rate": 0.0002, + "loss": 1.6251, + "step": 4440 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4558229148387909, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 4450 + }, + { + "epoch": 2.9882747068676716, + "grad_norm": 0.39848530292510986, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 4460 + }, + { + "epoch": 2.9949748743718594, + "grad_norm": 0.5224862098693848, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 4470 + }, + { + "epoch": 2.9996649916247904, + "eval_loss": 1.8228833675384521, + "eval_runtime": 37.9049, + "eval_samples_per_second": 13.587, + "eval_steps_per_second": 1.715, + "step": 4477 + }, + { + "epoch": 3.0016750418760467, + "grad_norm": 0.41169142723083496, + "learning_rate": 0.0002, + "loss": 1.6637, + "step": 4480 + }, + { + "epoch": 3.0083752093802345, + "grad_norm": 0.4865207374095917, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 4490 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5462028384208679, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4500 + }, + { + "epoch": 3.0217755443886096, + "grad_norm": 0.6169732809066772, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 4510 + }, + { + "epoch": 3.0284757118927974, + "grad_norm": 0.5667954087257385, + "learning_rate": 0.0002, + "loss": 1.5559, + "step": 4520 + }, + { + "epoch": 3.0351758793969847, + "grad_norm": 0.5758325457572937, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 4530 + }, + { + "epoch": 3.0418760469011725, + "grad_norm": 0.5220064520835876, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 4540 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.5469558835029602, + "learning_rate": 0.0002, + "loss": 1.5126, + "step": 4550 + }, + { + "epoch": 3.0552763819095476, + "grad_norm": 0.5680848956108093, + "learning_rate": 0.0002, + "loss": 1.4275, + "step": 4560 + }, + { + "epoch": 3.0619765494137354, + "grad_norm": 0.5906574726104736, + "learning_rate": 0.0002, + "loss": 1.5187, + "step": 4570 + }, + { + "epoch": 3.0686767169179228, + "grad_norm": 0.4725631773471832, + "learning_rate": 0.0002, + "loss": 1.4551, + "step": 4580 + }, + { + "epoch": 3.0753768844221105, + "grad_norm": 0.5273477435112, + "learning_rate": 0.0002, + "loss": 1.5083, + "step": 4590 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.5861203074455261, + "learning_rate": 0.0002, + "loss": 1.5154, + "step": 4600 + }, + { + "epoch": 3.0887772194304857, + "grad_norm": 0.5343965291976929, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 4610 + }, + { + "epoch": 3.0954773869346734, + "grad_norm": 0.5348150730133057, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 4620 + }, + { + "epoch": 3.102177554438861, + "grad_norm": 0.5971846580505371, + "learning_rate": 0.0002, + "loss": 1.5399, + "step": 4630 + }, + { + "epoch": 3.1088777219430486, + "grad_norm": 0.5203177332878113, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4640 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.55289226770401, + "learning_rate": 0.0002, + "loss": 1.5805, + "step": 4650 + }, + { + "epoch": 3.1222780569514237, + "grad_norm": 0.6878530979156494, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4660 + }, + { + "epoch": 3.1289782244556115, + "grad_norm": 0.6173256635665894, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 4670 + }, + { + "epoch": 3.135678391959799, + "grad_norm": 0.536796510219574, + "learning_rate": 0.0002, + "loss": 1.51, + "step": 4680 + }, + { + "epoch": 3.1423785594639866, + "grad_norm": 0.58846116065979, + "learning_rate": 0.0002, + "loss": 1.4713, + "step": 4690 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.645889401435852, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 4700 + }, + { + "epoch": 3.1557788944723617, + "grad_norm": 0.6118691563606262, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 4710 + }, + { + "epoch": 3.1624790619765495, + "grad_norm": 0.5189669132232666, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 4720 + }, + { + "epoch": 3.169179229480737, + "grad_norm": 0.5794713497161865, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 4730 + }, + { + "epoch": 3.1758793969849246, + "grad_norm": 0.6579326391220093, + "learning_rate": 0.0002, + "loss": 1.4849, + "step": 4740 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.5822742581367493, + "learning_rate": 0.0002, + "loss": 1.545, + "step": 4750 + }, + { + "epoch": 3.1892797319932997, + "grad_norm": 0.5475956201553345, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 4760 + }, + { + "epoch": 3.1959798994974875, + "grad_norm": 0.6743834018707275, + "learning_rate": 0.0002, + "loss": 1.4723, + "step": 4770 + }, + { + "epoch": 3.202680067001675, + "grad_norm": 0.6110585927963257, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4780 + }, + { + "epoch": 3.2093802345058626, + "grad_norm": 0.5426181554794312, + "learning_rate": 0.0002, + "loss": 1.5455, + "step": 4790 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6077824234962463, + "learning_rate": 0.0002, + "loss": 1.5315, + "step": 4800 + }, + { + "epoch": 3.2227805695142377, + "grad_norm": 0.5785858631134033, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 4810 + }, + { + "epoch": 3.2294807370184255, + "grad_norm": 0.6425958275794983, + "learning_rate": 0.0002, + "loss": 1.4041, + "step": 4820 + }, + { + "epoch": 3.236180904522613, + "grad_norm": 0.6607080698013306, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 4830 + }, + { + "epoch": 3.2428810720268006, + "grad_norm": 0.5385788679122925, + "learning_rate": 0.0002, + "loss": 1.5267, + "step": 4840 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.5630403757095337, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 4850 + }, + { + "epoch": 3.2562814070351758, + "grad_norm": 0.6340779662132263, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 4860 + }, + { + "epoch": 3.2629815745393635, + "grad_norm": 0.5305342674255371, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4870 + }, + { + "epoch": 3.2696817420435513, + "grad_norm": 0.597670316696167, + "learning_rate": 0.0002, + "loss": 1.5162, + "step": 4880 + }, + { + "epoch": 3.2763819095477387, + "grad_norm": 0.665553867816925, + "learning_rate": 0.0002, + "loss": 1.5429, + "step": 4890 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.579767644405365, + "learning_rate": 0.0002, + "loss": 1.4607, + "step": 4900 + }, + { + "epoch": 3.289782244556114, + "grad_norm": 0.5512481331825256, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 4910 + }, + { + "epoch": 3.2964824120603016, + "grad_norm": 0.5916532278060913, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 4920 + }, + { + "epoch": 3.3031825795644894, + "grad_norm": 0.7521726489067078, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 4930 + }, + { + "epoch": 3.3098827470686767, + "grad_norm": 0.5352797508239746, + "learning_rate": 0.0002, + "loss": 1.4223, + "step": 4940 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.5950371623039246, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 4950 + }, + { + "epoch": 3.323283082077052, + "grad_norm": 0.8020477890968323, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 4960 + }, + { + "epoch": 3.3299832495812396, + "grad_norm": 0.6790024638175964, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 4970 + }, + { + "epoch": 3.3366834170854274, + "grad_norm": 0.687627375125885, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4980 + }, + { + "epoch": 3.3433835845896147, + "grad_norm": 0.6094385385513306, + "learning_rate": 0.0002, + "loss": 1.5276, + "step": 4990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.6541242003440857, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 5000 + }, + { + "epoch": 3.35678391959799, + "grad_norm": 0.5560880303382874, + "learning_rate": 0.0002, + "loss": 1.6067, + "step": 5010 + }, + { + "epoch": 3.3634840871021776, + "grad_norm": 0.5440094470977783, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 5020 + }, + { + "epoch": 3.3701842546063654, + "grad_norm": 0.5749301314353943, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 5030 + }, + { + "epoch": 3.3768844221105527, + "grad_norm": 0.5919716954231262, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 5040 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.6331481337547302, + "learning_rate": 0.0002, + "loss": 1.5957, + "step": 5050 + }, + { + "epoch": 3.390284757118928, + "grad_norm": 0.5687161684036255, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 5060 + }, + { + "epoch": 3.3969849246231156, + "grad_norm": 0.6718577742576599, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 5070 + }, + { + "epoch": 3.4036850921273034, + "grad_norm": 0.5089324116706848, + "learning_rate": 0.0002, + "loss": 1.5577, + "step": 5080 + }, + { + "epoch": 3.4103852596314908, + "grad_norm": 0.5710174441337585, + "learning_rate": 0.0002, + "loss": 1.512, + "step": 5090 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6670721173286438, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 5100 + }, + { + "epoch": 3.423785594639866, + "grad_norm": 0.6875665187835693, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 5110 + }, + { + "epoch": 3.4304857621440537, + "grad_norm": 0.5375880599021912, + "learning_rate": 0.0002, + "loss": 1.4496, + "step": 5120 + }, + { + "epoch": 3.4371859296482414, + "grad_norm": 0.6550399661064148, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 5130 + }, + { + "epoch": 3.4438860971524288, + "grad_norm": 0.5948067903518677, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 5140 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.6134477257728577, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5150 + }, + { + "epoch": 3.457286432160804, + "grad_norm": 0.6506398320198059, + "learning_rate": 0.0002, + "loss": 1.5069, + "step": 5160 + }, + { + "epoch": 3.4639865996649917, + "grad_norm": 0.6060147881507874, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 5170 + }, + { + "epoch": 3.4706867671691795, + "grad_norm": 0.6173806190490723, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 5180 + }, + { + "epoch": 3.477386934673367, + "grad_norm": 0.6032607555389404, + "learning_rate": 0.0002, + "loss": 1.4975, + "step": 5190 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5652492046356201, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 5200 + }, + { + "epoch": 3.490787269681742, + "grad_norm": 0.6168607473373413, + "learning_rate": 0.0002, + "loss": 1.4883, + "step": 5210 + }, + { + "epoch": 3.4974874371859297, + "grad_norm": 0.6170629262924194, + "learning_rate": 0.0002, + "loss": 1.5164, + "step": 5220 + }, + { + "epoch": 3.5041876046901175, + "grad_norm": 0.6926297545433044, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 5230 + }, + { + "epoch": 3.510887772194305, + "grad_norm": 0.6702437996864319, + "learning_rate": 0.0002, + "loss": 1.4982, + "step": 5240 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.5421436429023743, + "learning_rate": 0.0002, + "loss": 1.4986, + "step": 5250 + }, + { + "epoch": 3.52428810720268, + "grad_norm": 0.5726765990257263, + "learning_rate": 0.0002, + "loss": 1.4673, + "step": 5260 + }, + { + "epoch": 3.5309882747068677, + "grad_norm": 0.5685455203056335, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 5270 + }, + { + "epoch": 3.5376884422110555, + "grad_norm": 0.6018396019935608, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 5280 + }, + { + "epoch": 3.544388609715243, + "grad_norm": 0.5731932520866394, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 5290 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.6601519584655762, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5300 + }, + { + "epoch": 3.557788944723618, + "grad_norm": 0.5545530319213867, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 5310 + }, + { + "epoch": 3.5644891122278057, + "grad_norm": 0.5998541116714478, + "learning_rate": 0.0002, + "loss": 1.5438, + "step": 5320 + }, + { + "epoch": 3.5711892797319935, + "grad_norm": 0.5651767253875732, + "learning_rate": 0.0002, + "loss": 1.56, + "step": 5330 + }, + { + "epoch": 3.577889447236181, + "grad_norm": 0.7425084114074707, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 5340 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5770602226257324, + "learning_rate": 0.0002, + "loss": 1.5571, + "step": 5350 + }, + { + "epoch": 3.591289782244556, + "grad_norm": 0.54723060131073, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 5360 + }, + { + "epoch": 3.5979899497487438, + "grad_norm": 0.6658238172531128, + "learning_rate": 0.0002, + "loss": 1.497, + "step": 5370 + }, + { + "epoch": 3.6046901172529315, + "grad_norm": 0.5787645578384399, + "learning_rate": 0.0002, + "loss": 1.5456, + "step": 5380 + }, + { + "epoch": 3.611390284757119, + "grad_norm": 0.594913125038147, + "learning_rate": 0.0002, + "loss": 1.5343, + "step": 5390 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.4964977502822876, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5400 + }, + { + "epoch": 3.624790619765494, + "grad_norm": 0.6087527275085449, + "learning_rate": 0.0002, + "loss": 1.5062, + "step": 5410 + }, + { + "epoch": 3.6314907872696818, + "grad_norm": 0.6315323710441589, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 5420 + }, + { + "epoch": 3.6381909547738696, + "grad_norm": 0.574799120426178, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 5430 + }, + { + "epoch": 3.644891122278057, + "grad_norm": 0.5949277877807617, + "learning_rate": 0.0002, + "loss": 1.4595, + "step": 5440 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.5640677213668823, + "learning_rate": 0.0002, + "loss": 1.4816, + "step": 5450 + }, + { + "epoch": 3.658291457286432, + "grad_norm": 0.6198237538337708, + "learning_rate": 0.0002, + "loss": 1.525, + "step": 5460 + }, + { + "epoch": 3.66499162479062, + "grad_norm": 0.6902034878730774, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 5470 + }, + { + "epoch": 3.6716917922948076, + "grad_norm": 0.5686674118041992, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5480 + }, + { + "epoch": 3.678391959798995, + "grad_norm": 0.6532107591629028, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 5490 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.5790849924087524, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 5500 + }, + { + "epoch": 3.69179229480737, + "grad_norm": 0.6055065393447876, + "learning_rate": 0.0002, + "loss": 1.5507, + "step": 5510 + }, + { + "epoch": 3.698492462311558, + "grad_norm": 0.5630605816841125, + "learning_rate": 0.0002, + "loss": 1.4656, + "step": 5520 + }, + { + "epoch": 3.7051926298157456, + "grad_norm": 0.6005825996398926, + "learning_rate": 0.0002, + "loss": 1.537, + "step": 5530 + }, + { + "epoch": 3.711892797319933, + "grad_norm": 0.6553038954734802, + "learning_rate": 0.0002, + "loss": 1.5313, + "step": 5540 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5601094961166382, + "learning_rate": 0.0002, + "loss": 1.4943, + "step": 5550 + }, + { + "epoch": 3.725293132328308, + "grad_norm": 0.6598808169364929, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 5560 + }, + { + "epoch": 3.731993299832496, + "grad_norm": 0.5506255626678467, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 5570 + }, + { + "epoch": 3.7386934673366836, + "grad_norm": 0.6001223921775818, + "learning_rate": 0.0002, + "loss": 1.4805, + "step": 5580 + }, + { + "epoch": 3.745393634840871, + "grad_norm": 0.6287297606468201, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 5590 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.6253238916397095, + "learning_rate": 0.0002, + "loss": 1.5246, + "step": 5600 + }, + { + "epoch": 3.758793969849246, + "grad_norm": 0.5713174939155579, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 5610 + }, + { + "epoch": 3.765494137353434, + "grad_norm": 0.6198310852050781, + "learning_rate": 0.0002, + "loss": 1.5661, + "step": 5620 + }, + { + "epoch": 3.7721943048576216, + "grad_norm": 0.5941224098205566, + "learning_rate": 0.0002, + "loss": 1.5448, + "step": 5630 + }, + { + "epoch": 3.778894472361809, + "grad_norm": 0.606002151966095, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 5640 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.6540704965591431, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 5650 + }, + { + "epoch": 3.792294807370184, + "grad_norm": 0.6147415041923523, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 5660 + }, + { + "epoch": 3.798994974874372, + "grad_norm": 0.5649605393409729, + "learning_rate": 0.0002, + "loss": 1.5329, + "step": 5670 + }, + { + "epoch": 3.8056951423785597, + "grad_norm": 0.6788773536682129, + "learning_rate": 0.0002, + "loss": 1.5747, + "step": 5680 + }, + { + "epoch": 3.812395309882747, + "grad_norm": 0.6581860780715942, + "learning_rate": 0.0002, + "loss": 1.535, + "step": 5690 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.5529348850250244, + "learning_rate": 0.0002, + "loss": 1.4587, + "step": 5700 + }, + { + "epoch": 3.825795644891122, + "grad_norm": 0.6320232152938843, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 5710 + }, + { + "epoch": 3.83249581239531, + "grad_norm": 0.6529698371887207, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 5720 + }, + { + "epoch": 3.8391959798994977, + "grad_norm": 0.5983362793922424, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 5730 + }, + { + "epoch": 3.845896147403685, + "grad_norm": 0.6335684061050415, + "learning_rate": 0.0002, + "loss": 1.465, + "step": 5740 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.700446605682373, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 5750 + }, + { + "epoch": 3.85929648241206, + "grad_norm": 0.6092597842216492, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 5760 + }, + { + "epoch": 3.865996649916248, + "grad_norm": 0.564146101474762, + "learning_rate": 0.0002, + "loss": 1.5729, + "step": 5770 + }, + { + "epoch": 3.8726968174204357, + "grad_norm": 0.615275502204895, + "learning_rate": 0.0002, + "loss": 1.5872, + "step": 5780 + }, + { + "epoch": 3.879396984924623, + "grad_norm": 0.6685376763343811, + "learning_rate": 0.0002, + "loss": 1.5142, + "step": 5790 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6116922497749329, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 5800 + }, + { + "epoch": 3.892797319932998, + "grad_norm": 0.5486813187599182, + "learning_rate": 0.0002, + "loss": 1.5179, + "step": 5810 + }, + { + "epoch": 3.899497487437186, + "grad_norm": 0.6208204030990601, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 5820 + }, + { + "epoch": 3.9061976549413737, + "grad_norm": 0.6500625014305115, + "learning_rate": 0.0002, + "loss": 1.5334, + "step": 5830 + }, + { + "epoch": 3.912897822445561, + "grad_norm": 0.5948089361190796, + "learning_rate": 0.0002, + "loss": 1.4716, + "step": 5840 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.7210732698440552, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 5850 + }, + { + "epoch": 3.926298157453936, + "grad_norm": 0.6662322878837585, + "learning_rate": 0.0002, + "loss": 1.5519, + "step": 5860 + }, + { + "epoch": 3.932998324958124, + "grad_norm": 0.5613839626312256, + "learning_rate": 0.0002, + "loss": 1.5656, + "step": 5870 + }, + { + "epoch": 3.9396984924623117, + "grad_norm": 0.6069002151489258, + "learning_rate": 0.0002, + "loss": 1.544, + "step": 5880 + }, + { + "epoch": 3.946398659966499, + "grad_norm": 0.7075562477111816, + "learning_rate": 0.0002, + "loss": 1.6745, + "step": 5890 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.6316173076629639, + "learning_rate": 0.0002, + "loss": 1.5391, + "step": 5900 + }, + { + "epoch": 3.959798994974874, + "grad_norm": 0.5716308355331421, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 5910 + }, + { + "epoch": 3.966499162479062, + "grad_norm": 0.6800096035003662, + "learning_rate": 0.0002, + "loss": 1.5947, + "step": 5920 + }, + { + "epoch": 3.9731993299832498, + "grad_norm": 0.6057983040809631, + "learning_rate": 0.0002, + "loss": 1.5189, + "step": 5930 + }, + { + "epoch": 3.979899497487437, + "grad_norm": 0.5938987731933594, + "learning_rate": 0.0002, + "loss": 1.5431, + "step": 5940 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.6963576674461365, + "learning_rate": 0.0002, + "loss": 1.5111, + "step": 5950 + }, + { + "epoch": 3.993299832495812, + "grad_norm": 0.6279940009117126, + "learning_rate": 0.0002, + "loss": 1.5521, + "step": 5960 + }, + { + "epoch": 4.0, + "grad_norm": 0.7161159515380859, + "learning_rate": 0.0002, + "loss": 1.5974, + "step": 5970 + }, + { + "epoch": 4.0, + "eval_loss": 1.8655421733856201, + "eval_runtime": 37.9276, + "eval_samples_per_second": 13.579, + "eval_steps_per_second": 1.714, + "step": 5970 + }, + { + "epoch": 4.006700167504188, + "grad_norm": 0.7380476593971252, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 5980 + }, + { + "epoch": 4.013400335008376, + "grad_norm": 0.7148947715759277, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 5990 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.6177082657814026, + "learning_rate": 0.0002, + "loss": 1.4204, + "step": 6000 + }, + { + "epoch": 4.02680067001675, + "grad_norm": 0.8552946448326111, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 6010 + }, + { + "epoch": 4.033500837520938, + "grad_norm": 0.8033416271209717, + "learning_rate": 0.0002, + "loss": 1.4342, + "step": 6020 + }, + { + "epoch": 4.040201005025126, + "grad_norm": 0.8501318097114563, + "learning_rate": 0.0002, + "loss": 1.4092, + "step": 6030 + }, + { + "epoch": 4.046901172529314, + "grad_norm": 0.6981393098831177, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 6040 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.7227180600166321, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 6050 + }, + { + "epoch": 4.060301507537688, + "grad_norm": 0.6923989653587341, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 6060 + }, + { + "epoch": 4.067001675041876, + "grad_norm": 0.879779040813446, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 6070 + }, + { + "epoch": 4.073701842546064, + "grad_norm": 0.8184754848480225, + "learning_rate": 0.0002, + "loss": 1.4383, + "step": 6080 + }, + { + "epoch": 4.080402010050252, + "grad_norm": 0.8211342692375183, + "learning_rate": 0.0002, + "loss": 1.3128, + "step": 6090 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.7542396783828735, + "learning_rate": 0.0002, + "loss": 1.3892, + "step": 6100 + }, + { + "epoch": 4.093802345058626, + "grad_norm": 0.6631066799163818, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 6110 + }, + { + "epoch": 4.100502512562814, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 6120 + }, + { + "epoch": 4.107202680067002, + "grad_norm": 0.681851863861084, + "learning_rate": 0.0002, + "loss": 1.3443, + "step": 6130 + }, + { + "epoch": 4.11390284757119, + "grad_norm": 0.8757794499397278, + "learning_rate": 0.0002, + "loss": 1.3486, + "step": 6140 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.6567301750183105, + "learning_rate": 0.0002, + "loss": 1.351, + "step": 6150 + }, + { + "epoch": 4.127303182579564, + "grad_norm": 0.7950329184532166, + "learning_rate": 0.0002, + "loss": 1.3824, + "step": 6160 + }, + { + "epoch": 4.134003350083752, + "grad_norm": 0.7545644044876099, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6170 + }, + { + "epoch": 4.14070351758794, + "grad_norm": 0.7172710299491882, + "learning_rate": 0.0002, + "loss": 1.4214, + "step": 6180 + }, + { + "epoch": 4.147403685092128, + "grad_norm": 0.7040584087371826, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 6190 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7482913732528687, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 6200 + }, + { + "epoch": 4.160804020100502, + "grad_norm": 0.8523276448249817, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 6210 + }, + { + "epoch": 4.16750418760469, + "grad_norm": 0.6672041416168213, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 6220 + }, + { + "epoch": 4.174204355108878, + "grad_norm": 0.7523500919342041, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 6230 + }, + { + "epoch": 4.180904522613066, + "grad_norm": 0.8085253834724426, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 6240 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.789450466632843, + "learning_rate": 0.0002, + "loss": 1.3293, + "step": 6250 + }, + { + "epoch": 4.19430485762144, + "grad_norm": 0.7502310872077942, + "learning_rate": 0.0002, + "loss": 1.3539, + "step": 6260 + }, + { + "epoch": 4.201005025125628, + "grad_norm": 0.7397456765174866, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 6270 + }, + { + "epoch": 4.207705192629816, + "grad_norm": 0.6921947002410889, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6280 + }, + { + "epoch": 4.214405360134004, + "grad_norm": 0.9334571957588196, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6290 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.725799024105072, + "learning_rate": 0.0002, + "loss": 1.3612, + "step": 6300 + }, + { + "epoch": 4.227805695142378, + "grad_norm": 0.8290495872497559, + "learning_rate": 0.0002, + "loss": 1.4217, + "step": 6310 + }, + { + "epoch": 4.234505862646566, + "grad_norm": 0.688983678817749, + "learning_rate": 0.0002, + "loss": 1.4135, + "step": 6320 + }, + { + "epoch": 4.241206030150754, + "grad_norm": 0.8620913028717041, + "learning_rate": 0.0002, + "loss": 1.3807, + "step": 6330 + }, + { + "epoch": 4.247906197654942, + "grad_norm": 0.8008657693862915, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 6340 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7379199266433716, + "learning_rate": 0.0002, + "loss": 1.4005, + "step": 6350 + }, + { + "epoch": 4.261306532663316, + "grad_norm": 0.7842815518379211, + "learning_rate": 0.0002, + "loss": 1.426, + "step": 6360 + }, + { + "epoch": 4.268006700167504, + "grad_norm": 0.812600314617157, + "learning_rate": 0.0002, + "loss": 1.4262, + "step": 6370 + }, + { + "epoch": 4.274706867671692, + "grad_norm": 0.7852841019630432, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 6380 + }, + { + "epoch": 4.28140703517588, + "grad_norm": 1.0377534627914429, + "learning_rate": 0.0002, + "loss": 1.3722, + "step": 6390 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 1.03935706615448, + "learning_rate": 0.0002, + "loss": 1.3755, + "step": 6400 + }, + { + "epoch": 4.294807370184254, + "grad_norm": 0.7244732975959778, + "learning_rate": 0.0002, + "loss": 1.3961, + "step": 6410 + }, + { + "epoch": 4.301507537688442, + "grad_norm": 0.7137406468391418, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 6420 + }, + { + "epoch": 4.30820770519263, + "grad_norm": 0.7492543458938599, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 6430 + }, + { + "epoch": 4.314907872696818, + "grad_norm": 0.7065439224243164, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 6440 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.7786989808082581, + "learning_rate": 0.0002, + "loss": 1.4246, + "step": 6450 + }, + { + "epoch": 4.328308207705192, + "grad_norm": 0.7369208335876465, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 6460 + }, + { + "epoch": 4.33500837520938, + "grad_norm": 0.7412346005439758, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 6470 + }, + { + "epoch": 4.341708542713568, + "grad_norm": 0.780927300453186, + "learning_rate": 0.0002, + "loss": 1.4087, + "step": 6480 + }, + { + "epoch": 4.348408710217756, + "grad_norm": 0.8320930600166321, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 6490 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.6871094703674316, + "learning_rate": 0.0002, + "loss": 1.3715, + "step": 6500 + }, + { + "epoch": 4.36180904522613, + "grad_norm": 0.6751559972763062, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6510 + }, + { + "epoch": 4.368509212730318, + "grad_norm": 0.7723976969718933, + "learning_rate": 0.0002, + "loss": 1.4311, + "step": 6520 + }, + { + "epoch": 4.375209380234506, + "grad_norm": 0.7915401458740234, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 6530 + }, + { + "epoch": 4.381909547738694, + "grad_norm": 0.7329102754592896, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 6540 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.7388760447502136, + "learning_rate": 0.0002, + "loss": 1.447, + "step": 6550 + }, + { + "epoch": 4.3953098827470685, + "grad_norm": 0.8282579183578491, + "learning_rate": 0.0002, + "loss": 1.4378, + "step": 6560 + }, + { + "epoch": 4.402010050251256, + "grad_norm": 0.7192724347114563, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6570 + }, + { + "epoch": 4.408710217755444, + "grad_norm": 0.746526837348938, + "learning_rate": 0.0002, + "loss": 1.4141, + "step": 6580 + }, + { + "epoch": 4.415410385259632, + "grad_norm": 0.8738046288490295, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 6590 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.8408458828926086, + "learning_rate": 0.0002, + "loss": 1.3995, + "step": 6600 + }, + { + "epoch": 4.4288107202680065, + "grad_norm": 0.8110666275024414, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 6610 + }, + { + "epoch": 4.435510887772194, + "grad_norm": 0.8602406978607178, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 6620 + }, + { + "epoch": 4.442211055276382, + "grad_norm": 0.7549102902412415, + "learning_rate": 0.0002, + "loss": 1.4319, + "step": 6630 + }, + { + "epoch": 4.44891122278057, + "grad_norm": 0.7831804156303406, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 6640 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.7269673943519592, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6650 + }, + { + "epoch": 4.4623115577889445, + "grad_norm": 0.7397838830947876, + "learning_rate": 0.0002, + "loss": 1.4132, + "step": 6660 + }, + { + "epoch": 4.469011725293132, + "grad_norm": 0.713707447052002, + "learning_rate": 0.0002, + "loss": 1.3174, + "step": 6670 + }, + { + "epoch": 4.47571189279732, + "grad_norm": 0.7525581121444702, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 6680 + }, + { + "epoch": 4.482412060301508, + "grad_norm": 0.8030191659927368, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 6690 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.7469439506530762, + "learning_rate": 0.0002, + "loss": 1.4586, + "step": 6700 + }, + { + "epoch": 4.4958123953098825, + "grad_norm": 0.7743868231773376, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 6710 + }, + { + "epoch": 4.50251256281407, + "grad_norm": 0.6539737582206726, + "learning_rate": 0.0002, + "loss": 1.3439, + "step": 6720 + }, + { + "epoch": 4.509212730318258, + "grad_norm": 0.825818657875061, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 6730 + }, + { + "epoch": 4.515912897822446, + "grad_norm": 0.8048575520515442, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 6740 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.7828766107559204, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 6750 + }, + { + "epoch": 4.5293132328308205, + "grad_norm": 0.7406010031700134, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 6760 + }, + { + "epoch": 4.536013400335008, + "grad_norm": 0.840345561504364, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 6770 + }, + { + "epoch": 4.542713567839196, + "grad_norm": 0.8492622971534729, + "learning_rate": 0.0002, + "loss": 1.4808, + "step": 6780 + }, + { + "epoch": 4.549413735343384, + "grad_norm": 0.7130163908004761, + "learning_rate": 0.0002, + "loss": 1.4384, + "step": 6790 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 0.8454728126525879, + "learning_rate": 0.0002, + "loss": 1.4531, + "step": 6800 + }, + { + "epoch": 4.562814070351759, + "grad_norm": 0.7847645282745361, + "learning_rate": 0.0002, + "loss": 1.3239, + "step": 6810 + }, + { + "epoch": 4.569514237855946, + "grad_norm": 0.7245864272117615, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 6820 + }, + { + "epoch": 4.576214405360134, + "grad_norm": 0.768893301486969, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 6830 + }, + { + "epoch": 4.582914572864322, + "grad_norm": 0.8028400540351868, + "learning_rate": 0.0002, + "loss": 1.3932, + "step": 6840 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.763945460319519, + "learning_rate": 0.0002, + "loss": 1.3745, + "step": 6850 + }, + { + "epoch": 4.596314907872697, + "grad_norm": 0.7417685389518738, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 6860 + }, + { + "epoch": 4.603015075376884, + "grad_norm": 0.7603038549423218, + "learning_rate": 0.0002, + "loss": 1.4468, + "step": 6870 + }, + { + "epoch": 4.609715242881072, + "grad_norm": 0.7981528043746948, + "learning_rate": 0.0002, + "loss": 1.4095, + "step": 6880 + }, + { + "epoch": 4.61641541038526, + "grad_norm": 0.8077111840248108, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 6890 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.8778454065322876, + "learning_rate": 0.0002, + "loss": 1.4721, + "step": 6900 + }, + { + "epoch": 4.629815745393635, + "grad_norm": 0.8620710372924805, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 6910 + }, + { + "epoch": 4.636515912897822, + "grad_norm": 0.7486072778701782, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 6920 + }, + { + "epoch": 4.64321608040201, + "grad_norm": 0.7493042945861816, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 6930 + }, + { + "epoch": 4.649916247906198, + "grad_norm": 0.7388978600502014, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 6940 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.798530638217926, + "learning_rate": 0.0002, + "loss": 1.3593, + "step": 6950 + }, + { + "epoch": 4.663316582914573, + "grad_norm": 0.7929500937461853, + "learning_rate": 0.0002, + "loss": 1.3982, + "step": 6960 + }, + { + "epoch": 4.67001675041876, + "grad_norm": 0.9186785221099854, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 6970 + }, + { + "epoch": 4.676716917922948, + "grad_norm": 1.1103485822677612, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 6980 + }, + { + "epoch": 4.683417085427136, + "grad_norm": 0.8000466823577881, + "learning_rate": 0.0002, + "loss": 1.3941, + "step": 6990 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7520599961280823, + "learning_rate": 0.0002, + "loss": 1.371, + "step": 7000 + }, + { + "epoch": 4.696817420435511, + "grad_norm": 0.7971973419189453, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 7010 + }, + { + "epoch": 4.703517587939698, + "grad_norm": 0.7363343834877014, + "learning_rate": 0.0002, + "loss": 1.3682, + "step": 7020 + }, + { + "epoch": 4.710217755443886, + "grad_norm": 0.8268865942955017, + "learning_rate": 0.0002, + "loss": 1.3889, + "step": 7030 + }, + { + "epoch": 4.716917922948074, + "grad_norm": 0.7054963111877441, + "learning_rate": 0.0002, + "loss": 1.4382, + "step": 7040 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.8196262121200562, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7050 + }, + { + "epoch": 4.730318257956449, + "grad_norm": 0.8276031017303467, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 7060 + }, + { + "epoch": 4.7370184254606365, + "grad_norm": 0.8248157501220703, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 7070 + }, + { + "epoch": 4.743718592964824, + "grad_norm": 0.8937979936599731, + "learning_rate": 0.0002, + "loss": 1.4193, + "step": 7080 + }, + { + "epoch": 4.750418760469012, + "grad_norm": 0.7984238862991333, + "learning_rate": 0.0002, + "loss": 1.4334, + "step": 7090 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.9495313763618469, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 7100 + }, + { + "epoch": 4.763819095477387, + "grad_norm": 0.8598204255104065, + "learning_rate": 0.0002, + "loss": 1.4504, + "step": 7110 + }, + { + "epoch": 4.7705192629815745, + "grad_norm": 0.8951472640037537, + "learning_rate": 0.0002, + "loss": 1.3969, + "step": 7120 + }, + { + "epoch": 4.777219430485762, + "grad_norm": 0.9110309481620789, + "learning_rate": 0.0002, + "loss": 1.4339, + "step": 7130 + }, + { + "epoch": 4.78391959798995, + "grad_norm": 0.7929584980010986, + "learning_rate": 0.0002, + "loss": 1.4001, + "step": 7140 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7415322661399841, + "learning_rate": 0.0002, + "loss": 1.467, + "step": 7150 + }, + { + "epoch": 4.797319932998325, + "grad_norm": 0.7504757046699524, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 7160 + }, + { + "epoch": 4.8040201005025125, + "grad_norm": 0.7166924476623535, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 7170 + }, + { + "epoch": 4.8107202680067, + "grad_norm": 0.7728400826454163, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 7180 + }, + { + "epoch": 4.817420435510888, + "grad_norm": 0.7992154955863953, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 7190 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.8655321002006531, + "learning_rate": 0.0002, + "loss": 1.3958, + "step": 7200 + }, + { + "epoch": 4.830820770519263, + "grad_norm": 0.7672632336616516, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 7210 + }, + { + "epoch": 4.8375209380234505, + "grad_norm": 0.708416223526001, + "learning_rate": 0.0002, + "loss": 1.4578, + "step": 7220 + }, + { + "epoch": 4.844221105527638, + "grad_norm": 0.8914081454277039, + "learning_rate": 0.0002, + "loss": 1.5413, + "step": 7230 + }, + { + "epoch": 4.850921273031826, + "grad_norm": 0.7141931653022766, + "learning_rate": 0.0002, + "loss": 1.3569, + "step": 7240 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.6913040280342102, + "learning_rate": 0.0002, + "loss": 1.4532, + "step": 7250 + }, + { + "epoch": 4.864321608040201, + "grad_norm": 0.7871233820915222, + "learning_rate": 0.0002, + "loss": 1.3912, + "step": 7260 + }, + { + "epoch": 4.8710217755443885, + "grad_norm": 0.8466277122497559, + "learning_rate": 0.0002, + "loss": 1.3688, + "step": 7270 + }, + { + "epoch": 4.877721943048576, + "grad_norm": 0.8492183685302734, + "learning_rate": 0.0002, + "loss": 1.33, + "step": 7280 + }, + { + "epoch": 4.884422110552764, + "grad_norm": 0.8339574933052063, + "learning_rate": 0.0002, + "loss": 1.3744, + "step": 7290 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.787022590637207, + "learning_rate": 0.0002, + "loss": 1.4157, + "step": 7300 + }, + { + "epoch": 4.897822445561139, + "grad_norm": 0.8877332806587219, + "learning_rate": 0.0002, + "loss": 1.3725, + "step": 7310 + }, + { + "epoch": 4.9045226130653266, + "grad_norm": 0.744989812374115, + "learning_rate": 0.0002, + "loss": 1.3968, + "step": 7320 + }, + { + "epoch": 4.911222780569514, + "grad_norm": 0.8027268648147583, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 7330 + }, + { + "epoch": 4.917922948073702, + "grad_norm": 0.6437455415725708, + "learning_rate": 0.0002, + "loss": 1.425, + "step": 7340 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.685999870300293, + "learning_rate": 0.0002, + "loss": 1.4829, + "step": 7350 + }, + { + "epoch": 4.931323283082077, + "grad_norm": 0.9086187481880188, + "learning_rate": 0.0002, + "loss": 1.4352, + "step": 7360 + }, + { + "epoch": 4.938023450586265, + "grad_norm": 0.8272411227226257, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 7370 + }, + { + "epoch": 4.944723618090452, + "grad_norm": 0.9227852821350098, + "learning_rate": 0.0002, + "loss": 1.4226, + "step": 7380 + }, + { + "epoch": 4.95142378559464, + "grad_norm": 0.7688441276550293, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 7390 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8662643432617188, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 7400 + }, + { + "epoch": 4.964824120603015, + "grad_norm": 0.9234127998352051, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 7410 + }, + { + "epoch": 4.971524288107203, + "grad_norm": 0.9131470918655396, + "learning_rate": 0.0002, + "loss": 1.4009, + "step": 7420 + }, + { + "epoch": 4.97822445561139, + "grad_norm": 0.7377504110336304, + "learning_rate": 0.0002, + "loss": 1.4544, + "step": 7430 + }, + { + "epoch": 4.984924623115578, + "grad_norm": 0.8762801289558411, + "learning_rate": 0.0002, + "loss": 1.4008, + "step": 7440 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7919872999191284, + "learning_rate": 0.0002, + "loss": 1.4304, + "step": 7450 + }, + { + "epoch": 4.998324958123953, + "grad_norm": 0.7144299149513245, + "learning_rate": 0.0002, + "loss": 1.3817, + "step": 7460 + }, + { + "epoch": 4.99966499162479, + "eval_loss": 1.9291157722473145, + "eval_runtime": 37.9831, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 7462 + }, + { + "epoch": 5.005025125628141, + "grad_norm": 0.7860151529312134, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 7470 + }, + { + "epoch": 5.011725293132328, + "grad_norm": 0.9418314695358276, + "learning_rate": 0.0002, + "loss": 1.2149, + "step": 7480 + }, + { + "epoch": 5.018425460636516, + "grad_norm": 0.8474572896957397, + "learning_rate": 0.0002, + "loss": 1.1966, + "step": 7490 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 1.0724040269851685, + "learning_rate": 0.0002, + "loss": 1.2111, + "step": 7500 + }, + { + "epoch": 5.031825795644891, + "grad_norm": 0.9109148979187012, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 7510 + }, + { + "epoch": 5.038525963149079, + "grad_norm": 1.0088659524917603, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 7520 + }, + { + "epoch": 5.045226130653266, + "grad_norm": 1.1421623229980469, + "learning_rate": 0.0002, + "loss": 1.2156, + "step": 7530 + }, + { + "epoch": 5.051926298157454, + "grad_norm": 0.9219902157783508, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 7540 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9150987863540649, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 7550 + }, + { + "epoch": 5.065326633165829, + "grad_norm": 0.8889328241348267, + "learning_rate": 0.0002, + "loss": 1.2068, + "step": 7560 + }, + { + "epoch": 5.072026800670017, + "grad_norm": 0.9751363396644592, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 7570 + }, + { + "epoch": 5.078726968174204, + "grad_norm": 0.8603123426437378, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 7580 + }, + { + "epoch": 5.085427135678392, + "grad_norm": 0.8910616636276245, + "learning_rate": 0.0002, + "loss": 1.2175, + "step": 7590 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.1128392219543457, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 7600 + }, + { + "epoch": 5.098827470686767, + "grad_norm": 0.9480258822441101, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 7610 + }, + { + "epoch": 5.105527638190955, + "grad_norm": 0.906958818435669, + "learning_rate": 0.0002, + "loss": 1.193, + "step": 7620 + }, + { + "epoch": 5.1122278056951425, + "grad_norm": 0.8741167187690735, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 7630 + }, + { + "epoch": 5.11892797319933, + "grad_norm": 0.966268002986908, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 7640 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 0.9124358892440796, + "learning_rate": 0.0002, + "loss": 1.2782, + "step": 7650 + }, + { + "epoch": 5.132328308207705, + "grad_norm": 1.0436606407165527, + "learning_rate": 0.0002, + "loss": 1.3004, + "step": 7660 + }, + { + "epoch": 5.139028475711893, + "grad_norm": 0.9217309355735779, + "learning_rate": 0.0002, + "loss": 1.2675, + "step": 7670 + }, + { + "epoch": 5.1457286432160805, + "grad_norm": 1.344765543937683, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7680 + }, + { + "epoch": 5.152428810720268, + "grad_norm": 1.0730723142623901, + "learning_rate": 0.0002, + "loss": 1.2416, + "step": 7690 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9321247339248657, + "learning_rate": 0.0002, + "loss": 1.1888, + "step": 7700 + }, + { + "epoch": 5.165829145728643, + "grad_norm": 0.8482614755630493, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 7710 + }, + { + "epoch": 5.172529313232831, + "grad_norm": 0.8274452686309814, + "learning_rate": 0.0002, + "loss": 1.2668, + "step": 7720 + }, + { + "epoch": 5.1792294807370185, + "grad_norm": 0.9120376706123352, + "learning_rate": 0.0002, + "loss": 1.1972, + "step": 7730 + }, + { + "epoch": 5.185929648241206, + "grad_norm": 1.0062892436981201, + "learning_rate": 0.0002, + "loss": 1.1648, + "step": 7740 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 0.9521504640579224, + "learning_rate": 0.0002, + "loss": 1.2199, + "step": 7750 + }, + { + "epoch": 5.199329983249581, + "grad_norm": 0.8800198435783386, + "learning_rate": 0.0002, + "loss": 1.2855, + "step": 7760 + }, + { + "epoch": 5.206030150753769, + "grad_norm": 0.9749179482460022, + "learning_rate": 0.0002, + "loss": 1.2535, + "step": 7770 + }, + { + "epoch": 5.2127303182579565, + "grad_norm": 0.9441686868667603, + "learning_rate": 0.0002, + "loss": 1.2975, + "step": 7780 + }, + { + "epoch": 5.219430485762144, + "grad_norm": 0.9114066362380981, + "learning_rate": 0.0002, + "loss": 1.256, + "step": 7790 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 0.9851446151733398, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 7800 + }, + { + "epoch": 5.232830820770519, + "grad_norm": 0.9526297450065613, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 7810 + }, + { + "epoch": 5.239530988274707, + "grad_norm": 1.05986487865448, + "learning_rate": 0.0002, + "loss": 1.1502, + "step": 7820 + }, + { + "epoch": 5.2462311557788945, + "grad_norm": 0.8956538438796997, + "learning_rate": 0.0002, + "loss": 1.2517, + "step": 7830 + }, + { + "epoch": 5.252931323283082, + "grad_norm": 0.9568153619766235, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 7840 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.0035018920898438, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 7850 + }, + { + "epoch": 5.266331658291457, + "grad_norm": 0.8554368615150452, + "learning_rate": 0.0002, + "loss": 1.2605, + "step": 7860 + }, + { + "epoch": 5.273031825795645, + "grad_norm": 0.9677708148956299, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 7870 + }, + { + "epoch": 5.279731993299833, + "grad_norm": 0.943606436252594, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 7880 + }, + { + "epoch": 5.28643216080402, + "grad_norm": 1.0029335021972656, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 7890 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 1.0164015293121338, + "learning_rate": 0.0002, + "loss": 1.2494, + "step": 7900 + }, + { + "epoch": 5.299832495812395, + "grad_norm": 0.8908365368843079, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 7910 + }, + { + "epoch": 5.306532663316583, + "grad_norm": 0.9307826161384583, + "learning_rate": 0.0002, + "loss": 1.2832, + "step": 7920 + }, + { + "epoch": 5.313232830820771, + "grad_norm": 1.0730371475219727, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 7930 + }, + { + "epoch": 5.319932998324958, + "grad_norm": 0.844739556312561, + "learning_rate": 0.0002, + "loss": 1.2003, + "step": 7940 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 1.275833010673523, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 7950 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.9042661190032959, + "learning_rate": 0.0002, + "loss": 1.2957, + "step": 7960 + }, + { + "epoch": 5.340033500837521, + "grad_norm": 0.9374269247055054, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 7970 + }, + { + "epoch": 5.346733668341709, + "grad_norm": 1.033098578453064, + "learning_rate": 0.0002, + "loss": 1.2721, + "step": 7980 + }, + { + "epoch": 5.353433835845896, + "grad_norm": 1.062775731086731, + "learning_rate": 0.0002, + "loss": 1.3208, + "step": 7990 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.1064317226409912, + "learning_rate": 0.0002, + "loss": 1.3065, + "step": 8000 + }, + { + "epoch": 5.366834170854271, + "grad_norm": 1.1114039421081543, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 8010 + }, + { + "epoch": 5.373534338358459, + "grad_norm": 1.0198014974594116, + "learning_rate": 0.0002, + "loss": 1.2255, + "step": 8020 + }, + { + "epoch": 5.380234505862647, + "grad_norm": 0.8443173170089722, + "learning_rate": 0.0002, + "loss": 1.2433, + "step": 8030 + }, + { + "epoch": 5.386934673366834, + "grad_norm": 1.000881314277649, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 8040 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.9874443411827087, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 8050 + }, + { + "epoch": 5.400335008375209, + "grad_norm": 0.9895344972610474, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 8060 + }, + { + "epoch": 5.407035175879397, + "grad_norm": 0.8595236539840698, + "learning_rate": 0.0002, + "loss": 1.249, + "step": 8070 + }, + { + "epoch": 5.413735343383585, + "grad_norm": 0.9523849487304688, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 8080 + }, + { + "epoch": 5.420435510887772, + "grad_norm": 1.0560476779937744, + "learning_rate": 0.0002, + "loss": 1.2343, + "step": 8090 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0893689393997192, + "learning_rate": 0.0002, + "loss": 1.2956, + "step": 8100 + }, + { + "epoch": 5.433835845896147, + "grad_norm": 0.9395513534545898, + "learning_rate": 0.0002, + "loss": 1.2846, + "step": 8110 + }, + { + "epoch": 5.440536013400335, + "grad_norm": 0.9364215135574341, + "learning_rate": 0.0002, + "loss": 1.3444, + "step": 8120 + }, + { + "epoch": 5.447236180904523, + "grad_norm": 0.9502208232879639, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 8130 + }, + { + "epoch": 5.45393634840871, + "grad_norm": 0.9559208154678345, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8140 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9261730313301086, + "learning_rate": 0.0002, + "loss": 1.2495, + "step": 8150 + }, + { + "epoch": 5.467336683417085, + "grad_norm": 0.9832326173782349, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 8160 + }, + { + "epoch": 5.474036850921273, + "grad_norm": 1.065953016281128, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 8170 + }, + { + "epoch": 5.480737018425461, + "grad_norm": 0.9139469861984253, + "learning_rate": 0.0002, + "loss": 1.3617, + "step": 8180 + }, + { + "epoch": 5.4874371859296485, + "grad_norm": 1.2322484254837036, + "learning_rate": 0.0002, + "loss": 1.2197, + "step": 8190 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.9722974896430969, + "learning_rate": 0.0002, + "loss": 1.2879, + "step": 8200 + }, + { + "epoch": 5.500837520938023, + "grad_norm": 0.9338926076889038, + "learning_rate": 0.0002, + "loss": 1.2664, + "step": 8210 + }, + { + "epoch": 5.507537688442211, + "grad_norm": 0.9283728003501892, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 8220 + }, + { + "epoch": 5.514237855946399, + "grad_norm": 1.0489585399627686, + "learning_rate": 0.0002, + "loss": 1.2141, + "step": 8230 + }, + { + "epoch": 5.5209380234505865, + "grad_norm": 0.9881814122200012, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 8240 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9274460077285767, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 8250 + }, + { + "epoch": 5.534338358458961, + "grad_norm": 0.8650718331336975, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 8260 + }, + { + "epoch": 5.541038525963149, + "grad_norm": 1.014069676399231, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 8270 + }, + { + "epoch": 5.547738693467337, + "grad_norm": 0.9212974905967712, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 8280 + }, + { + "epoch": 5.5544388609715245, + "grad_norm": 1.1235398054122925, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 8290 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.961954653263092, + "learning_rate": 0.0002, + "loss": 1.306, + "step": 8300 + }, + { + "epoch": 5.567839195979899, + "grad_norm": 0.9386700391769409, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 8310 + }, + { + "epoch": 5.574539363484087, + "grad_norm": 1.01912522315979, + "learning_rate": 0.0002, + "loss": 1.313, + "step": 8320 + }, + { + "epoch": 5.581239530988275, + "grad_norm": 0.9851216077804565, + "learning_rate": 0.0002, + "loss": 1.3121, + "step": 8330 + }, + { + "epoch": 5.5879396984924625, + "grad_norm": 1.0138001441955566, + "learning_rate": 0.0002, + "loss": 1.3071, + "step": 8340 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9262447357177734, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 8350 + }, + { + "epoch": 5.601340033500837, + "grad_norm": 1.1322970390319824, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 8360 + }, + { + "epoch": 5.608040201005025, + "grad_norm": 1.1429349184036255, + "learning_rate": 0.0002, + "loss": 1.3098, + "step": 8370 + }, + { + "epoch": 5.614740368509213, + "grad_norm": 0.9130118489265442, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 8380 + }, + { + "epoch": 5.6214405360134005, + "grad_norm": 0.9651545882225037, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 8390 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.9595398306846619, + "learning_rate": 0.0002, + "loss": 1.2799, + "step": 8400 + }, + { + "epoch": 5.634840871021775, + "grad_norm": 1.0049372911453247, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 8410 + }, + { + "epoch": 5.641541038525963, + "grad_norm": 1.082804560661316, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 8420 + }, + { + "epoch": 5.648241206030151, + "grad_norm": 0.9489204287528992, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8430 + }, + { + "epoch": 5.654941373534339, + "grad_norm": 0.9470235109329224, + "learning_rate": 0.0002, + "loss": 1.3424, + "step": 8440 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 1.0662927627563477, + "learning_rate": 0.0002, + "loss": 1.3358, + "step": 8450 + }, + { + "epoch": 5.668341708542713, + "grad_norm": 0.9097877740859985, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 8460 + }, + { + "epoch": 5.675041876046901, + "grad_norm": 0.9740368127822876, + "learning_rate": 0.0002, + "loss": 1.3072, + "step": 8470 + }, + { + "epoch": 5.681742043551089, + "grad_norm": 0.9878810048103333, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 8480 + }, + { + "epoch": 5.688442211055277, + "grad_norm": 1.148260474205017, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 8490 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9632558822631836, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 8500 + }, + { + "epoch": 5.701842546063651, + "grad_norm": 0.876812756061554, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 8510 + }, + { + "epoch": 5.708542713567839, + "grad_norm": 1.0730829238891602, + "learning_rate": 0.0002, + "loss": 1.3186, + "step": 8520 + }, + { + "epoch": 5.715242881072027, + "grad_norm": 1.2239218950271606, + "learning_rate": 0.0002, + "loss": 1.2856, + "step": 8530 + }, + { + "epoch": 5.721943048576215, + "grad_norm": 0.9460835456848145, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 8540 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9086270928382874, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 8550 + }, + { + "epoch": 5.735343383584589, + "grad_norm": 1.0258867740631104, + "learning_rate": 0.0002, + "loss": 1.2971, + "step": 8560 + }, + { + "epoch": 5.742043551088777, + "grad_norm": 1.0543923377990723, + "learning_rate": 0.0002, + "loss": 1.3581, + "step": 8570 + }, + { + "epoch": 5.748743718592965, + "grad_norm": 0.9063900113105774, + "learning_rate": 0.0002, + "loss": 1.2988, + "step": 8580 + }, + { + "epoch": 5.755443886097153, + "grad_norm": 1.1838830709457397, + "learning_rate": 0.0002, + "loss": 1.3535, + "step": 8590 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9631859064102173, + "learning_rate": 0.0002, + "loss": 1.2655, + "step": 8600 + }, + { + "epoch": 5.768844221105527, + "grad_norm": 0.9702655673027039, + "learning_rate": 0.0002, + "loss": 1.276, + "step": 8610 + }, + { + "epoch": 5.775544388609715, + "grad_norm": 1.0591435432434082, + "learning_rate": 0.0002, + "loss": 1.3196, + "step": 8620 + }, + { + "epoch": 5.782244556113903, + "grad_norm": 0.9989570379257202, + "learning_rate": 0.0002, + "loss": 1.267, + "step": 8630 + }, + { + "epoch": 5.788944723618091, + "grad_norm": 1.0836435556411743, + "learning_rate": 0.0002, + "loss": 1.3227, + "step": 8640 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 0.8832896947860718, + "learning_rate": 0.0002, + "loss": 1.3334, + "step": 8650 + }, + { + "epoch": 5.802345058626465, + "grad_norm": 1.0104607343673706, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 8660 + }, + { + "epoch": 5.809045226130653, + "grad_norm": 0.8375084400177002, + "learning_rate": 0.0002, + "loss": 1.2703, + "step": 8670 + }, + { + "epoch": 5.815745393634841, + "grad_norm": 1.1300716400146484, + "learning_rate": 0.0002, + "loss": 1.3554, + "step": 8680 + }, + { + "epoch": 5.822445561139029, + "grad_norm": 0.9311910271644592, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 8690 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9488391876220703, + "learning_rate": 0.0002, + "loss": 1.2749, + "step": 8700 + }, + { + "epoch": 5.835845896147403, + "grad_norm": 0.9747629761695862, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 8710 + }, + { + "epoch": 5.842546063651591, + "grad_norm": 1.1029598712921143, + "learning_rate": 0.0002, + "loss": 1.2923, + "step": 8720 + }, + { + "epoch": 5.849246231155779, + "grad_norm": 1.0396875143051147, + "learning_rate": 0.0002, + "loss": 1.3613, + "step": 8730 + }, + { + "epoch": 5.855946398659967, + "grad_norm": 0.9259780645370483, + "learning_rate": 0.0002, + "loss": 1.3272, + "step": 8740 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.020033597946167, + "learning_rate": 0.0002, + "loss": 1.3236, + "step": 8750 + }, + { + "epoch": 5.869346733668341, + "grad_norm": 0.9191218614578247, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 8760 + }, + { + "epoch": 5.876046901172529, + "grad_norm": 1.1093107461929321, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 8770 + }, + { + "epoch": 5.882747068676717, + "grad_norm": 1.1626793146133423, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 8780 + }, + { + "epoch": 5.889447236180905, + "grad_norm": 0.9542945027351379, + "learning_rate": 0.0002, + "loss": 1.2969, + "step": 8790 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.9086058139801025, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 8800 + }, + { + "epoch": 5.902847571189279, + "grad_norm": 0.9249639511108398, + "learning_rate": 0.0002, + "loss": 1.2731, + "step": 8810 + }, + { + "epoch": 5.909547738693467, + "grad_norm": 0.9414396286010742, + "learning_rate": 0.0002, + "loss": 1.337, + "step": 8820 + }, + { + "epoch": 5.916247906197655, + "grad_norm": 0.9086037874221802, + "learning_rate": 0.0002, + "loss": 1.2865, + "step": 8830 + }, + { + "epoch": 5.922948073701843, + "grad_norm": 0.8685907125473022, + "learning_rate": 0.0002, + "loss": 1.2756, + "step": 8840 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.036419153213501, + "learning_rate": 0.0002, + "loss": 1.297, + "step": 8850 + }, + { + "epoch": 5.936348408710217, + "grad_norm": 1.0183674097061157, + "learning_rate": 0.0002, + "loss": 1.3207, + "step": 8860 + }, + { + "epoch": 5.943048576214405, + "grad_norm": 0.966444194316864, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 8870 + }, + { + "epoch": 5.949748743718593, + "grad_norm": 1.125693917274475, + "learning_rate": 0.0002, + "loss": 1.333, + "step": 8880 + }, + { + "epoch": 5.956448911222781, + "grad_norm": 0.9857436418533325, + "learning_rate": 0.0002, + "loss": 1.3116, + "step": 8890 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 0.9377069473266602, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 8900 + }, + { + "epoch": 5.969849246231155, + "grad_norm": 0.9493814706802368, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 8910 + }, + { + "epoch": 5.976549413735343, + "grad_norm": 0.8806208372116089, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 8920 + }, + { + "epoch": 5.983249581239531, + "grad_norm": 0.8727600574493408, + "learning_rate": 0.0002, + "loss": 1.2558, + "step": 8930 + }, + { + "epoch": 5.989949748743719, + "grad_norm": 0.9799810647964478, + "learning_rate": 0.0002, + "loss": 1.3538, + "step": 8940 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.9866513609886169, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 8950 + }, + { + "epoch": 6.0, + "eval_loss": 2.0282373428344727, + "eval_runtime": 38.0375, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 8955 + } + ], + "logging_steps": 10, + "max_steps": 11936, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.1441734894288896e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-8955/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..064f299b0f3d2a28f5b1f5c68ef32caab3e2dd49 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa7b586fc440d1e22156593e2dd4267d2bdcb8920a02fdf352ea29a9bec3dd94 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3d158f7569c1d1db0d5a47a0d18863cef728106f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9996649916247906, "step": 1492, "epoch_duration": 1574.8106648921967, "total_accumulated_duration": 1574.8106648921967, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}]} +{"epoch": 2.0, "step": 2985, "epoch_duration": 1577.000766992569, "total_accumulated_duration": 3151.8114318847656, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}]} +{"epoch": 2.9996649916247904, "step": 4477, "epoch_duration": 1576.7374460697174, "total_accumulated_duration": 4728.548877954483, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}]} +{"epoch": 4.0, "step": 5970, "epoch_duration": 1613.8745076656342, "total_accumulated_duration": 6342.423385620117, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}, {"eval_loss": 1.8228833675384521, "eval_runtime": 37.9049, "eval_samples_per_second": 13.587, "eval_steps_per_second": 1.715, "epoch": 2.9996649916247904, "step": 4477}, {"loss": 1.6637, "grad_norm": 0.41169142723083496, "learning_rate": 0.0002, "epoch": 3.0016750418760467, "step": 4480}, {"loss": 1.5974, "grad_norm": 0.4865207374095917, "learning_rate": 0.0002, "epoch": 3.0083752093802345, "step": 4490}, {"loss": 1.5297, "grad_norm": 0.5462028384208679, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 4500}, {"loss": 1.5251, "grad_norm": 0.6169732809066772, "learning_rate": 0.0002, "epoch": 3.0217755443886096, "step": 4510}, {"loss": 1.5559, "grad_norm": 0.5667954087257385, "learning_rate": 0.0002, "epoch": 3.0284757118927974, "step": 4520}, {"loss": 1.5037, "grad_norm": 0.5758325457572937, "learning_rate": 0.0002, "epoch": 3.0351758793969847, "step": 4530}, {"loss": 1.4873, "grad_norm": 0.5220064520835876, "learning_rate": 0.0002, "epoch": 3.0418760469011725, "step": 4540}, {"loss": 1.5126, "grad_norm": 0.5469558835029602, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 4550}, {"loss": 1.4275, "grad_norm": 0.5680848956108093, "learning_rate": 0.0002, "epoch": 3.0552763819095476, "step": 4560}, {"loss": 1.5187, "grad_norm": 0.5906574726104736, "learning_rate": 0.0002, "epoch": 3.0619765494137354, "step": 4570}, {"loss": 1.4551, "grad_norm": 0.4725631773471832, "learning_rate": 0.0002, "epoch": 3.0686767169179228, "step": 4580}, {"loss": 1.5083, "grad_norm": 0.5273477435112, "learning_rate": 0.0002, "epoch": 3.0753768844221105, "step": 4590}, {"loss": 1.5154, "grad_norm": 0.5861203074455261, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 4600}, {"loss": 1.4924, "grad_norm": 0.5343965291976929, "learning_rate": 0.0002, "epoch": 3.0887772194304857, "step": 4610}, {"loss": 1.5608, "grad_norm": 0.5348150730133057, "learning_rate": 0.0002, "epoch": 3.0954773869346734, "step": 4620}, {"loss": 1.5399, "grad_norm": 0.5971846580505371, "learning_rate": 0.0002, "epoch": 3.102177554438861, "step": 4630}, {"loss": 1.4662, "grad_norm": 0.5203177332878113, "learning_rate": 0.0002, "epoch": 3.1088777219430486, "step": 4640}, {"loss": 1.5805, "grad_norm": 0.55289226770401, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 4650}, {"loss": 1.4745, "grad_norm": 0.6878530979156494, "learning_rate": 0.0002, "epoch": 3.1222780569514237, "step": 4660}, {"loss": 1.5335, "grad_norm": 0.6173256635665894, "learning_rate": 0.0002, "epoch": 3.1289782244556115, "step": 4670}, {"loss": 1.51, "grad_norm": 0.536796510219574, "learning_rate": 0.0002, "epoch": 3.135678391959799, "step": 4680}, {"loss": 1.4713, "grad_norm": 0.58846116065979, "learning_rate": 0.0002, "epoch": 3.1423785594639866, "step": 4690}, {"loss": 1.5114, "grad_norm": 0.645889401435852, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 4700}, {"loss": 1.4705, "grad_norm": 0.6118691563606262, "learning_rate": 0.0002, "epoch": 3.1557788944723617, "step": 4710}, {"loss": 1.5533, "grad_norm": 0.5189669132232666, "learning_rate": 0.0002, "epoch": 3.1624790619765495, "step": 4720}, {"loss": 1.4769, "grad_norm": 0.5794713497161865, "learning_rate": 0.0002, "epoch": 3.169179229480737, "step": 4730}, {"loss": 1.4849, "grad_norm": 0.6579326391220093, "learning_rate": 0.0002, "epoch": 3.1758793969849246, "step": 4740}, {"loss": 1.545, "grad_norm": 0.5822742581367493, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 4750}, {"loss": 1.4358, "grad_norm": 0.5475956201553345, "learning_rate": 0.0002, "epoch": 3.1892797319932997, "step": 4760}, {"loss": 1.4723, "grad_norm": 0.6743834018707275, "learning_rate": 0.0002, "epoch": 3.1959798994974875, "step": 4770}, {"loss": 1.5161, "grad_norm": 0.6110585927963257, "learning_rate": 0.0002, "epoch": 3.202680067001675, "step": 4780}, {"loss": 1.5455, "grad_norm": 0.5426181554794312, "learning_rate": 0.0002, "epoch": 3.2093802345058626, "step": 4790}, {"loss": 1.5315, "grad_norm": 0.6077824234962463, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 4800}, {"loss": 1.5314, "grad_norm": 0.5785858631134033, "learning_rate": 0.0002, "epoch": 3.2227805695142377, "step": 4810}, {"loss": 1.4041, "grad_norm": 0.6425958275794983, "learning_rate": 0.0002, "epoch": 3.2294807370184255, "step": 4820}, {"loss": 1.4751, "grad_norm": 0.6607080698013306, "learning_rate": 0.0002, "epoch": 3.236180904522613, "step": 4830}, {"loss": 1.5267, "grad_norm": 0.5385788679122925, "learning_rate": 0.0002, "epoch": 3.2428810720268006, "step": 4840}, {"loss": 1.4673, "grad_norm": 0.5630403757095337, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 4850}, {"loss": 1.5257, "grad_norm": 0.6340779662132263, "learning_rate": 0.0002, "epoch": 3.2562814070351758, "step": 4860}, {"loss": 1.5148, "grad_norm": 0.5305342674255371, "learning_rate": 0.0002, "epoch": 3.2629815745393635, "step": 4870}, {"loss": 1.5162, "grad_norm": 0.597670316696167, "learning_rate": 0.0002, "epoch": 3.2696817420435513, "step": 4880}, {"loss": 1.5429, "grad_norm": 0.665553867816925, "learning_rate": 0.0002, "epoch": 3.2763819095477387, "step": 4890}, {"loss": 1.4607, "grad_norm": 0.579767644405365, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 4900}, {"loss": 1.4999, "grad_norm": 0.5512481331825256, "learning_rate": 0.0002, "epoch": 3.289782244556114, "step": 4910}, {"loss": 1.5022, "grad_norm": 0.5916532278060913, "learning_rate": 0.0002, "epoch": 3.2964824120603016, "step": 4920}, {"loss": 1.4889, "grad_norm": 0.7521726489067078, "learning_rate": 0.0002, "epoch": 3.3031825795644894, "step": 4930}, {"loss": 1.4223, "grad_norm": 0.5352797508239746, "learning_rate": 0.0002, "epoch": 3.3098827470686767, "step": 4940}, {"loss": 1.5122, "grad_norm": 0.5950371623039246, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 4950}, {"loss": 1.5072, "grad_norm": 0.8020477890968323, "learning_rate": 0.0002, "epoch": 3.323283082077052, "step": 4960}, {"loss": 1.5422, "grad_norm": 0.6790024638175964, "learning_rate": 0.0002, "epoch": 3.3299832495812396, "step": 4970}, {"loss": 1.5363, "grad_norm": 0.687627375125885, "learning_rate": 0.0002, "epoch": 3.3366834170854274, "step": 4980}, {"loss": 1.5276, "grad_norm": 0.6094385385513306, "learning_rate": 0.0002, "epoch": 3.3433835845896147, "step": 4990}, {"loss": 1.549, "grad_norm": 0.6541242003440857, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 5000}, {"loss": 1.6067, "grad_norm": 0.5560880303382874, "learning_rate": 0.0002, "epoch": 3.35678391959799, "step": 5010}, {"loss": 1.5769, "grad_norm": 0.5440094470977783, "learning_rate": 0.0002, "epoch": 3.3634840871021776, "step": 5020}, {"loss": 1.6183, "grad_norm": 0.5749301314353943, "learning_rate": 0.0002, "epoch": 3.3701842546063654, "step": 5030}, {"loss": 1.4801, "grad_norm": 0.5919716954231262, "learning_rate": 0.0002, "epoch": 3.3768844221105527, "step": 5040}, {"loss": 1.5957, "grad_norm": 0.6331481337547302, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 5050}, {"loss": 1.5188, "grad_norm": 0.5687161684036255, "learning_rate": 0.0002, "epoch": 3.390284757118928, "step": 5060}, {"loss": 1.5702, "grad_norm": 0.6718577742576599, "learning_rate": 0.0002, "epoch": 3.3969849246231156, "step": 5070}, {"loss": 1.5577, "grad_norm": 0.5089324116706848, "learning_rate": 0.0002, "epoch": 3.4036850921273034, "step": 5080}, {"loss": 1.512, "grad_norm": 0.5710174441337585, "learning_rate": 0.0002, "epoch": 3.4103852596314908, "step": 5090}, {"loss": 1.5492, "grad_norm": 0.6670721173286438, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 5100}, {"loss": 1.5227, "grad_norm": 0.6875665187835693, "learning_rate": 0.0002, "epoch": 3.423785594639866, "step": 5110}, {"loss": 1.4496, "grad_norm": 0.5375880599021912, "learning_rate": 0.0002, "epoch": 3.4304857621440537, "step": 5120}, {"loss": 1.5527, "grad_norm": 0.6550399661064148, "learning_rate": 0.0002, "epoch": 3.4371859296482414, "step": 5130}, {"loss": 1.5687, "grad_norm": 0.5948067903518677, "learning_rate": 0.0002, "epoch": 3.4438860971524288, "step": 5140}, {"loss": 1.4813, "grad_norm": 0.6134477257728577, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 5150}, {"loss": 1.5069, "grad_norm": 0.6506398320198059, "learning_rate": 0.0002, "epoch": 3.457286432160804, "step": 5160}, {"loss": 1.4422, "grad_norm": 0.6060147881507874, "learning_rate": 0.0002, "epoch": 3.4639865996649917, "step": 5170}, {"loss": 1.5093, "grad_norm": 0.6173806190490723, "learning_rate": 0.0002, "epoch": 3.4706867671691795, "step": 5180}, {"loss": 1.4975, "grad_norm": 0.6032607555389404, "learning_rate": 0.0002, "epoch": 3.477386934673367, "step": 5190}, {"loss": 1.4979, "grad_norm": 0.5652492046356201, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 5200}, {"loss": 1.4883, "grad_norm": 0.6168607473373413, "learning_rate": 0.0002, "epoch": 3.490787269681742, "step": 5210}, {"loss": 1.5164, "grad_norm": 0.6170629262924194, "learning_rate": 0.0002, "epoch": 3.4974874371859297, "step": 5220}, {"loss": 1.4879, "grad_norm": 0.6926297545433044, "learning_rate": 0.0002, "epoch": 3.5041876046901175, "step": 5230}, {"loss": 1.4982, "grad_norm": 0.6702437996864319, "learning_rate": 0.0002, "epoch": 3.510887772194305, "step": 5240}, {"loss": 1.4986, "grad_norm": 0.5421436429023743, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 5250}, {"loss": 1.4673, "grad_norm": 0.5726765990257263, "learning_rate": 0.0002, "epoch": 3.52428810720268, "step": 5260}, {"loss": 1.5423, "grad_norm": 0.5685455203056335, "learning_rate": 0.0002, "epoch": 3.5309882747068677, "step": 5270}, {"loss": 1.4715, "grad_norm": 0.6018396019935608, "learning_rate": 0.0002, "epoch": 3.5376884422110555, "step": 5280}, {"loss": 1.5451, "grad_norm": 0.5731932520866394, "learning_rate": 0.0002, "epoch": 3.544388609715243, "step": 5290}, {"loss": 1.4752, "grad_norm": 0.6601519584655762, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 5300}, {"loss": 1.5434, "grad_norm": 0.5545530319213867, "learning_rate": 0.0002, "epoch": 3.557788944723618, "step": 5310}, {"loss": 1.5438, "grad_norm": 0.5998541116714478, "learning_rate": 0.0002, "epoch": 3.5644891122278057, "step": 5320}, {"loss": 1.56, "grad_norm": 0.5651767253875732, "learning_rate": 0.0002, "epoch": 3.5711892797319935, "step": 5330}, {"loss": 1.4829, "grad_norm": 0.7425084114074707, "learning_rate": 0.0002, "epoch": 3.577889447236181, "step": 5340}, {"loss": 1.5571, "grad_norm": 0.5770602226257324, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 5350}, {"loss": 1.458, "grad_norm": 0.54723060131073, "learning_rate": 0.0002, "epoch": 3.591289782244556, "step": 5360}, {"loss": 1.497, "grad_norm": 0.6658238172531128, "learning_rate": 0.0002, "epoch": 3.5979899497487438, "step": 5370}, {"loss": 1.5456, "grad_norm": 0.5787645578384399, "learning_rate": 0.0002, "epoch": 3.6046901172529315, "step": 5380}, {"loss": 1.5343, "grad_norm": 0.594913125038147, "learning_rate": 0.0002, "epoch": 3.611390284757119, "step": 5390}, {"loss": 1.4727, "grad_norm": 0.4964977502822876, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 5400}, {"loss": 1.5062, "grad_norm": 0.6087527275085449, "learning_rate": 0.0002, "epoch": 3.624790619765494, "step": 5410}, {"loss": 1.5098, "grad_norm": 0.6315323710441589, "learning_rate": 0.0002, "epoch": 3.6314907872696818, "step": 5420}, {"loss": 1.4855, "grad_norm": 0.574799120426178, "learning_rate": 0.0002, "epoch": 3.6381909547738696, "step": 5430}, {"loss": 1.4595, "grad_norm": 0.5949277877807617, "learning_rate": 0.0002, "epoch": 3.644891122278057, "step": 5440}, {"loss": 1.4816, "grad_norm": 0.5640677213668823, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 5450}, {"loss": 1.525, "grad_norm": 0.6198237538337708, "learning_rate": 0.0002, "epoch": 3.658291457286432, "step": 5460}, {"loss": 1.5676, "grad_norm": 0.6902034878730774, "learning_rate": 0.0002, "epoch": 3.66499162479062, "step": 5470}, {"loss": 1.544, "grad_norm": 0.5686674118041992, "learning_rate": 0.0002, "epoch": 3.6716917922948076, "step": 5480}, {"loss": 1.5255, "grad_norm": 0.6532107591629028, "learning_rate": 0.0002, "epoch": 3.678391959798995, "step": 5490}, {"loss": 1.5767, "grad_norm": 0.5790849924087524, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 5500}, {"loss": 1.5507, "grad_norm": 0.6055065393447876, "learning_rate": 0.0002, "epoch": 3.69179229480737, "step": 5510}, {"loss": 1.4656, "grad_norm": 0.5630605816841125, "learning_rate": 0.0002, "epoch": 3.698492462311558, "step": 5520}, {"loss": 1.537, "grad_norm": 0.6005825996398926, "learning_rate": 0.0002, "epoch": 3.7051926298157456, "step": 5530}, {"loss": 1.5313, "grad_norm": 0.6553038954734802, "learning_rate": 0.0002, "epoch": 3.711892797319933, "step": 5540}, {"loss": 1.4943, "grad_norm": 0.5601094961166382, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 5550}, {"loss": 1.5161, "grad_norm": 0.6598808169364929, "learning_rate": 0.0002, "epoch": 3.725293132328308, "step": 5560}, {"loss": 1.5345, "grad_norm": 0.5506255626678467, "learning_rate": 0.0002, "epoch": 3.731993299832496, "step": 5570}, {"loss": 1.4805, "grad_norm": 0.6001223921775818, "learning_rate": 0.0002, "epoch": 3.7386934673366836, "step": 5580}, {"loss": 1.4652, "grad_norm": 0.6287297606468201, "learning_rate": 0.0002, "epoch": 3.745393634840871, "step": 5590}, {"loss": 1.5246, "grad_norm": 0.6253238916397095, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 5600}, {"loss": 1.5691, "grad_norm": 0.5713174939155579, "learning_rate": 0.0002, "epoch": 3.758793969849246, "step": 5610}, {"loss": 1.5661, "grad_norm": 0.6198310852050781, "learning_rate": 0.0002, "epoch": 3.765494137353434, "step": 5620}, {"loss": 1.5448, "grad_norm": 0.5941224098205566, "learning_rate": 0.0002, "epoch": 3.7721943048576216, "step": 5630}, {"loss": 1.4925, "grad_norm": 0.606002151966095, "learning_rate": 0.0002, "epoch": 3.778894472361809, "step": 5640}, {"loss": 1.5182, "grad_norm": 0.6540704965591431, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 5650}, {"loss": 1.5903, "grad_norm": 0.6147415041923523, "learning_rate": 0.0002, "epoch": 3.792294807370184, "step": 5660}, {"loss": 1.5329, "grad_norm": 0.5649605393409729, "learning_rate": 0.0002, "epoch": 3.798994974874372, "step": 5670}, {"loss": 1.5747, "grad_norm": 0.6788773536682129, "learning_rate": 0.0002, "epoch": 3.8056951423785597, "step": 5680}, {"loss": 1.535, "grad_norm": 0.6581860780715942, "learning_rate": 0.0002, "epoch": 3.812395309882747, "step": 5690}, {"loss": 1.4587, "grad_norm": 0.5529348850250244, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 5700}, {"loss": 1.5594, "grad_norm": 0.6320232152938843, "learning_rate": 0.0002, "epoch": 3.825795644891122, "step": 5710}, {"loss": 1.4696, "grad_norm": 0.6529698371887207, "learning_rate": 0.0002, "epoch": 3.83249581239531, "step": 5720}, {"loss": 1.5854, "grad_norm": 0.5983362793922424, "learning_rate": 0.0002, "epoch": 3.8391959798994977, "step": 5730}, {"loss": 1.465, "grad_norm": 0.6335684061050415, "learning_rate": 0.0002, "epoch": 3.845896147403685, "step": 5740}, {"loss": 1.5545, "grad_norm": 0.700446605682373, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 5750}, {"loss": 1.5707, "grad_norm": 0.6092597842216492, "learning_rate": 0.0002, "epoch": 3.85929648241206, "step": 5760}, {"loss": 1.5729, "grad_norm": 0.564146101474762, "learning_rate": 0.0002, "epoch": 3.865996649916248, "step": 5770}, {"loss": 1.5872, "grad_norm": 0.615275502204895, "learning_rate": 0.0002, "epoch": 3.8726968174204357, "step": 5780}, {"loss": 1.5142, "grad_norm": 0.6685376763343811, "learning_rate": 0.0002, "epoch": 3.879396984924623, "step": 5790}, {"loss": 1.4752, "grad_norm": 0.6116922497749329, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 5800}, {"loss": 1.5179, "grad_norm": 0.5486813187599182, "learning_rate": 0.0002, "epoch": 3.892797319932998, "step": 5810}, {"loss": 1.5167, "grad_norm": 0.6208204030990601, "learning_rate": 0.0002, "epoch": 3.899497487437186, "step": 5820}, {"loss": 1.5334, "grad_norm": 0.6500625014305115, "learning_rate": 0.0002, "epoch": 3.9061976549413737, "step": 5830}, {"loss": 1.4716, "grad_norm": 0.5948089361190796, "learning_rate": 0.0002, "epoch": 3.912897822445561, "step": 5840}, {"loss": 1.6011, "grad_norm": 0.7210732698440552, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 5850}, {"loss": 1.5519, "grad_norm": 0.6662322878837585, "learning_rate": 0.0002, "epoch": 3.926298157453936, "step": 5860}, {"loss": 1.5656, "grad_norm": 0.5613839626312256, "learning_rate": 0.0002, "epoch": 3.932998324958124, "step": 5870}, {"loss": 1.544, "grad_norm": 0.6069002151489258, "learning_rate": 0.0002, "epoch": 3.9396984924623117, "step": 5880}, {"loss": 1.6745, "grad_norm": 0.7075562477111816, "learning_rate": 0.0002, "epoch": 3.946398659966499, "step": 5890}, {"loss": 1.5391, "grad_norm": 0.6316173076629639, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 5900}, {"loss": 1.6314, "grad_norm": 0.5716308355331421, "learning_rate": 0.0002, "epoch": 3.959798994974874, "step": 5910}, {"loss": 1.5947, "grad_norm": 0.6800096035003662, "learning_rate": 0.0002, "epoch": 3.966499162479062, "step": 5920}, {"loss": 1.5189, "grad_norm": 0.6057983040809631, "learning_rate": 0.0002, "epoch": 3.9731993299832498, "step": 5930}, {"loss": 1.5431, "grad_norm": 0.5938987731933594, "learning_rate": 0.0002, "epoch": 3.979899497487437, "step": 5940}, {"loss": 1.5111, "grad_norm": 0.6963576674461365, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 5950}, {"loss": 1.5521, "grad_norm": 0.6279940009117126, "learning_rate": 0.0002, "epoch": 3.993299832495812, "step": 5960}, {"loss": 1.5974, "grad_norm": 0.7161159515380859, "learning_rate": 0.0002, "epoch": 4.0, "step": 5970}]} +{"epoch": 4.99966499162479, "step": 7462, "epoch_duration": 1615.0195803642273, "total_accumulated_duration": 7957.4429659843445, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}, {"eval_loss": 1.8228833675384521, "eval_runtime": 37.9049, "eval_samples_per_second": 13.587, "eval_steps_per_second": 1.715, "epoch": 2.9996649916247904, "step": 4477}, {"loss": 1.6637, "grad_norm": 0.41169142723083496, "learning_rate": 0.0002, "epoch": 3.0016750418760467, "step": 4480}, {"loss": 1.5974, "grad_norm": 0.4865207374095917, "learning_rate": 0.0002, "epoch": 3.0083752093802345, "step": 4490}, {"loss": 1.5297, "grad_norm": 0.5462028384208679, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 4500}, {"loss": 1.5251, "grad_norm": 0.6169732809066772, "learning_rate": 0.0002, "epoch": 3.0217755443886096, "step": 4510}, {"loss": 1.5559, "grad_norm": 0.5667954087257385, "learning_rate": 0.0002, "epoch": 3.0284757118927974, "step": 4520}, {"loss": 1.5037, "grad_norm": 0.5758325457572937, "learning_rate": 0.0002, "epoch": 3.0351758793969847, "step": 4530}, {"loss": 1.4873, "grad_norm": 0.5220064520835876, "learning_rate": 0.0002, "epoch": 3.0418760469011725, "step": 4540}, {"loss": 1.5126, "grad_norm": 0.5469558835029602, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 4550}, {"loss": 1.4275, "grad_norm": 0.5680848956108093, "learning_rate": 0.0002, "epoch": 3.0552763819095476, "step": 4560}, {"loss": 1.5187, "grad_norm": 0.5906574726104736, "learning_rate": 0.0002, "epoch": 3.0619765494137354, "step": 4570}, {"loss": 1.4551, "grad_norm": 0.4725631773471832, "learning_rate": 0.0002, "epoch": 3.0686767169179228, "step": 4580}, {"loss": 1.5083, "grad_norm": 0.5273477435112, "learning_rate": 0.0002, "epoch": 3.0753768844221105, "step": 4590}, {"loss": 1.5154, "grad_norm": 0.5861203074455261, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 4600}, {"loss": 1.4924, "grad_norm": 0.5343965291976929, "learning_rate": 0.0002, "epoch": 3.0887772194304857, "step": 4610}, {"loss": 1.5608, "grad_norm": 0.5348150730133057, "learning_rate": 0.0002, "epoch": 3.0954773869346734, "step": 4620}, {"loss": 1.5399, "grad_norm": 0.5971846580505371, "learning_rate": 0.0002, "epoch": 3.102177554438861, "step": 4630}, {"loss": 1.4662, "grad_norm": 0.5203177332878113, "learning_rate": 0.0002, "epoch": 3.1088777219430486, "step": 4640}, {"loss": 1.5805, "grad_norm": 0.55289226770401, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 4650}, {"loss": 1.4745, "grad_norm": 0.6878530979156494, "learning_rate": 0.0002, "epoch": 3.1222780569514237, "step": 4660}, {"loss": 1.5335, "grad_norm": 0.6173256635665894, "learning_rate": 0.0002, "epoch": 3.1289782244556115, "step": 4670}, {"loss": 1.51, "grad_norm": 0.536796510219574, "learning_rate": 0.0002, "epoch": 3.135678391959799, "step": 4680}, {"loss": 1.4713, "grad_norm": 0.58846116065979, "learning_rate": 0.0002, "epoch": 3.1423785594639866, "step": 4690}, {"loss": 1.5114, "grad_norm": 0.645889401435852, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 4700}, {"loss": 1.4705, "grad_norm": 0.6118691563606262, "learning_rate": 0.0002, "epoch": 3.1557788944723617, "step": 4710}, {"loss": 1.5533, "grad_norm": 0.5189669132232666, "learning_rate": 0.0002, "epoch": 3.1624790619765495, "step": 4720}, {"loss": 1.4769, "grad_norm": 0.5794713497161865, "learning_rate": 0.0002, "epoch": 3.169179229480737, "step": 4730}, {"loss": 1.4849, "grad_norm": 0.6579326391220093, "learning_rate": 0.0002, "epoch": 3.1758793969849246, "step": 4740}, {"loss": 1.545, "grad_norm": 0.5822742581367493, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 4750}, {"loss": 1.4358, "grad_norm": 0.5475956201553345, "learning_rate": 0.0002, "epoch": 3.1892797319932997, "step": 4760}, {"loss": 1.4723, "grad_norm": 0.6743834018707275, "learning_rate": 0.0002, "epoch": 3.1959798994974875, "step": 4770}, {"loss": 1.5161, "grad_norm": 0.6110585927963257, "learning_rate": 0.0002, "epoch": 3.202680067001675, "step": 4780}, {"loss": 1.5455, "grad_norm": 0.5426181554794312, "learning_rate": 0.0002, "epoch": 3.2093802345058626, "step": 4790}, {"loss": 1.5315, "grad_norm": 0.6077824234962463, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 4800}, {"loss": 1.5314, "grad_norm": 0.5785858631134033, "learning_rate": 0.0002, "epoch": 3.2227805695142377, "step": 4810}, {"loss": 1.4041, "grad_norm": 0.6425958275794983, "learning_rate": 0.0002, "epoch": 3.2294807370184255, "step": 4820}, {"loss": 1.4751, "grad_norm": 0.6607080698013306, "learning_rate": 0.0002, "epoch": 3.236180904522613, "step": 4830}, {"loss": 1.5267, "grad_norm": 0.5385788679122925, "learning_rate": 0.0002, "epoch": 3.2428810720268006, "step": 4840}, {"loss": 1.4673, "grad_norm": 0.5630403757095337, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 4850}, {"loss": 1.5257, "grad_norm": 0.6340779662132263, "learning_rate": 0.0002, "epoch": 3.2562814070351758, "step": 4860}, {"loss": 1.5148, "grad_norm": 0.5305342674255371, "learning_rate": 0.0002, "epoch": 3.2629815745393635, "step": 4870}, {"loss": 1.5162, "grad_norm": 0.597670316696167, "learning_rate": 0.0002, "epoch": 3.2696817420435513, "step": 4880}, {"loss": 1.5429, "grad_norm": 0.665553867816925, "learning_rate": 0.0002, "epoch": 3.2763819095477387, "step": 4890}, {"loss": 1.4607, "grad_norm": 0.579767644405365, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 4900}, {"loss": 1.4999, "grad_norm": 0.5512481331825256, "learning_rate": 0.0002, "epoch": 3.289782244556114, "step": 4910}, {"loss": 1.5022, "grad_norm": 0.5916532278060913, "learning_rate": 0.0002, "epoch": 3.2964824120603016, "step": 4920}, {"loss": 1.4889, "grad_norm": 0.7521726489067078, "learning_rate": 0.0002, "epoch": 3.3031825795644894, "step": 4930}, {"loss": 1.4223, "grad_norm": 0.5352797508239746, "learning_rate": 0.0002, "epoch": 3.3098827470686767, "step": 4940}, {"loss": 1.5122, "grad_norm": 0.5950371623039246, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 4950}, {"loss": 1.5072, "grad_norm": 0.8020477890968323, "learning_rate": 0.0002, "epoch": 3.323283082077052, "step": 4960}, {"loss": 1.5422, "grad_norm": 0.6790024638175964, "learning_rate": 0.0002, "epoch": 3.3299832495812396, "step": 4970}, {"loss": 1.5363, "grad_norm": 0.687627375125885, "learning_rate": 0.0002, "epoch": 3.3366834170854274, "step": 4980}, {"loss": 1.5276, "grad_norm": 0.6094385385513306, "learning_rate": 0.0002, "epoch": 3.3433835845896147, "step": 4990}, {"loss": 1.549, "grad_norm": 0.6541242003440857, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 5000}, {"loss": 1.6067, "grad_norm": 0.5560880303382874, "learning_rate": 0.0002, "epoch": 3.35678391959799, "step": 5010}, {"loss": 1.5769, "grad_norm": 0.5440094470977783, "learning_rate": 0.0002, "epoch": 3.3634840871021776, "step": 5020}, {"loss": 1.6183, "grad_norm": 0.5749301314353943, "learning_rate": 0.0002, "epoch": 3.3701842546063654, "step": 5030}, {"loss": 1.4801, "grad_norm": 0.5919716954231262, "learning_rate": 0.0002, "epoch": 3.3768844221105527, "step": 5040}, {"loss": 1.5957, "grad_norm": 0.6331481337547302, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 5050}, {"loss": 1.5188, "grad_norm": 0.5687161684036255, "learning_rate": 0.0002, "epoch": 3.390284757118928, "step": 5060}, {"loss": 1.5702, "grad_norm": 0.6718577742576599, "learning_rate": 0.0002, "epoch": 3.3969849246231156, "step": 5070}, {"loss": 1.5577, "grad_norm": 0.5089324116706848, "learning_rate": 0.0002, "epoch": 3.4036850921273034, "step": 5080}, {"loss": 1.512, "grad_norm": 0.5710174441337585, "learning_rate": 0.0002, "epoch": 3.4103852596314908, "step": 5090}, {"loss": 1.5492, "grad_norm": 0.6670721173286438, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 5100}, {"loss": 1.5227, "grad_norm": 0.6875665187835693, "learning_rate": 0.0002, "epoch": 3.423785594639866, "step": 5110}, {"loss": 1.4496, "grad_norm": 0.5375880599021912, "learning_rate": 0.0002, "epoch": 3.4304857621440537, "step": 5120}, {"loss": 1.5527, "grad_norm": 0.6550399661064148, "learning_rate": 0.0002, "epoch": 3.4371859296482414, "step": 5130}, {"loss": 1.5687, "grad_norm": 0.5948067903518677, "learning_rate": 0.0002, "epoch": 3.4438860971524288, "step": 5140}, {"loss": 1.4813, "grad_norm": 0.6134477257728577, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 5150}, {"loss": 1.5069, "grad_norm": 0.6506398320198059, "learning_rate": 0.0002, "epoch": 3.457286432160804, "step": 5160}, {"loss": 1.4422, "grad_norm": 0.6060147881507874, "learning_rate": 0.0002, "epoch": 3.4639865996649917, "step": 5170}, {"loss": 1.5093, "grad_norm": 0.6173806190490723, "learning_rate": 0.0002, "epoch": 3.4706867671691795, "step": 5180}, {"loss": 1.4975, "grad_norm": 0.6032607555389404, "learning_rate": 0.0002, "epoch": 3.477386934673367, "step": 5190}, {"loss": 1.4979, "grad_norm": 0.5652492046356201, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 5200}, {"loss": 1.4883, "grad_norm": 0.6168607473373413, "learning_rate": 0.0002, "epoch": 3.490787269681742, "step": 5210}, {"loss": 1.5164, "grad_norm": 0.6170629262924194, "learning_rate": 0.0002, "epoch": 3.4974874371859297, "step": 5220}, {"loss": 1.4879, "grad_norm": 0.6926297545433044, "learning_rate": 0.0002, "epoch": 3.5041876046901175, "step": 5230}, {"loss": 1.4982, "grad_norm": 0.6702437996864319, "learning_rate": 0.0002, "epoch": 3.510887772194305, "step": 5240}, {"loss": 1.4986, "grad_norm": 0.5421436429023743, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 5250}, {"loss": 1.4673, "grad_norm": 0.5726765990257263, "learning_rate": 0.0002, "epoch": 3.52428810720268, "step": 5260}, {"loss": 1.5423, "grad_norm": 0.5685455203056335, "learning_rate": 0.0002, "epoch": 3.5309882747068677, "step": 5270}, {"loss": 1.4715, "grad_norm": 0.6018396019935608, "learning_rate": 0.0002, "epoch": 3.5376884422110555, "step": 5280}, {"loss": 1.5451, "grad_norm": 0.5731932520866394, "learning_rate": 0.0002, "epoch": 3.544388609715243, "step": 5290}, {"loss": 1.4752, "grad_norm": 0.6601519584655762, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 5300}, {"loss": 1.5434, "grad_norm": 0.5545530319213867, "learning_rate": 0.0002, "epoch": 3.557788944723618, "step": 5310}, {"loss": 1.5438, "grad_norm": 0.5998541116714478, "learning_rate": 0.0002, "epoch": 3.5644891122278057, "step": 5320}, {"loss": 1.56, "grad_norm": 0.5651767253875732, "learning_rate": 0.0002, "epoch": 3.5711892797319935, "step": 5330}, {"loss": 1.4829, "grad_norm": 0.7425084114074707, "learning_rate": 0.0002, "epoch": 3.577889447236181, "step": 5340}, {"loss": 1.5571, "grad_norm": 0.5770602226257324, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 5350}, {"loss": 1.458, "grad_norm": 0.54723060131073, "learning_rate": 0.0002, "epoch": 3.591289782244556, "step": 5360}, {"loss": 1.497, "grad_norm": 0.6658238172531128, "learning_rate": 0.0002, "epoch": 3.5979899497487438, "step": 5370}, {"loss": 1.5456, "grad_norm": 0.5787645578384399, "learning_rate": 0.0002, "epoch": 3.6046901172529315, "step": 5380}, {"loss": 1.5343, "grad_norm": 0.594913125038147, "learning_rate": 0.0002, "epoch": 3.611390284757119, "step": 5390}, {"loss": 1.4727, "grad_norm": 0.4964977502822876, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 5400}, {"loss": 1.5062, "grad_norm": 0.6087527275085449, "learning_rate": 0.0002, "epoch": 3.624790619765494, "step": 5410}, {"loss": 1.5098, "grad_norm": 0.6315323710441589, "learning_rate": 0.0002, "epoch": 3.6314907872696818, "step": 5420}, {"loss": 1.4855, "grad_norm": 0.574799120426178, "learning_rate": 0.0002, "epoch": 3.6381909547738696, "step": 5430}, {"loss": 1.4595, "grad_norm": 0.5949277877807617, "learning_rate": 0.0002, "epoch": 3.644891122278057, "step": 5440}, {"loss": 1.4816, "grad_norm": 0.5640677213668823, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 5450}, {"loss": 1.525, "grad_norm": 0.6198237538337708, "learning_rate": 0.0002, "epoch": 3.658291457286432, "step": 5460}, {"loss": 1.5676, "grad_norm": 0.6902034878730774, "learning_rate": 0.0002, "epoch": 3.66499162479062, "step": 5470}, {"loss": 1.544, "grad_norm": 0.5686674118041992, "learning_rate": 0.0002, "epoch": 3.6716917922948076, "step": 5480}, {"loss": 1.5255, "grad_norm": 0.6532107591629028, "learning_rate": 0.0002, "epoch": 3.678391959798995, "step": 5490}, {"loss": 1.5767, "grad_norm": 0.5790849924087524, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 5500}, {"loss": 1.5507, "grad_norm": 0.6055065393447876, "learning_rate": 0.0002, "epoch": 3.69179229480737, "step": 5510}, {"loss": 1.4656, "grad_norm": 0.5630605816841125, "learning_rate": 0.0002, "epoch": 3.698492462311558, "step": 5520}, {"loss": 1.537, "grad_norm": 0.6005825996398926, "learning_rate": 0.0002, "epoch": 3.7051926298157456, "step": 5530}, {"loss": 1.5313, "grad_norm": 0.6553038954734802, "learning_rate": 0.0002, "epoch": 3.711892797319933, "step": 5540}, {"loss": 1.4943, "grad_norm": 0.5601094961166382, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 5550}, {"loss": 1.5161, "grad_norm": 0.6598808169364929, "learning_rate": 0.0002, "epoch": 3.725293132328308, "step": 5560}, {"loss": 1.5345, "grad_norm": 0.5506255626678467, "learning_rate": 0.0002, "epoch": 3.731993299832496, "step": 5570}, {"loss": 1.4805, "grad_norm": 0.6001223921775818, "learning_rate": 0.0002, "epoch": 3.7386934673366836, "step": 5580}, {"loss": 1.4652, "grad_norm": 0.6287297606468201, "learning_rate": 0.0002, "epoch": 3.745393634840871, "step": 5590}, {"loss": 1.5246, "grad_norm": 0.6253238916397095, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 5600}, {"loss": 1.5691, "grad_norm": 0.5713174939155579, "learning_rate": 0.0002, "epoch": 3.758793969849246, "step": 5610}, {"loss": 1.5661, "grad_norm": 0.6198310852050781, "learning_rate": 0.0002, "epoch": 3.765494137353434, "step": 5620}, {"loss": 1.5448, "grad_norm": 0.5941224098205566, "learning_rate": 0.0002, "epoch": 3.7721943048576216, "step": 5630}, {"loss": 1.4925, "grad_norm": 0.606002151966095, "learning_rate": 0.0002, "epoch": 3.778894472361809, "step": 5640}, {"loss": 1.5182, "grad_norm": 0.6540704965591431, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 5650}, {"loss": 1.5903, "grad_norm": 0.6147415041923523, "learning_rate": 0.0002, "epoch": 3.792294807370184, "step": 5660}, {"loss": 1.5329, "grad_norm": 0.5649605393409729, "learning_rate": 0.0002, "epoch": 3.798994974874372, "step": 5670}, {"loss": 1.5747, "grad_norm": 0.6788773536682129, "learning_rate": 0.0002, "epoch": 3.8056951423785597, "step": 5680}, {"loss": 1.535, "grad_norm": 0.6581860780715942, "learning_rate": 0.0002, "epoch": 3.812395309882747, "step": 5690}, {"loss": 1.4587, "grad_norm": 0.5529348850250244, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 5700}, {"loss": 1.5594, "grad_norm": 0.6320232152938843, "learning_rate": 0.0002, "epoch": 3.825795644891122, "step": 5710}, {"loss": 1.4696, "grad_norm": 0.6529698371887207, "learning_rate": 0.0002, "epoch": 3.83249581239531, "step": 5720}, {"loss": 1.5854, "grad_norm": 0.5983362793922424, "learning_rate": 0.0002, "epoch": 3.8391959798994977, "step": 5730}, {"loss": 1.465, "grad_norm": 0.6335684061050415, "learning_rate": 0.0002, "epoch": 3.845896147403685, "step": 5740}, {"loss": 1.5545, "grad_norm": 0.700446605682373, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 5750}, {"loss": 1.5707, "grad_norm": 0.6092597842216492, "learning_rate": 0.0002, "epoch": 3.85929648241206, "step": 5760}, {"loss": 1.5729, "grad_norm": 0.564146101474762, "learning_rate": 0.0002, "epoch": 3.865996649916248, "step": 5770}, {"loss": 1.5872, "grad_norm": 0.615275502204895, "learning_rate": 0.0002, "epoch": 3.8726968174204357, "step": 5780}, {"loss": 1.5142, "grad_norm": 0.6685376763343811, "learning_rate": 0.0002, "epoch": 3.879396984924623, "step": 5790}, {"loss": 1.4752, "grad_norm": 0.6116922497749329, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 5800}, {"loss": 1.5179, "grad_norm": 0.5486813187599182, "learning_rate": 0.0002, "epoch": 3.892797319932998, "step": 5810}, {"loss": 1.5167, "grad_norm": 0.6208204030990601, "learning_rate": 0.0002, "epoch": 3.899497487437186, "step": 5820}, {"loss": 1.5334, "grad_norm": 0.6500625014305115, "learning_rate": 0.0002, "epoch": 3.9061976549413737, "step": 5830}, {"loss": 1.4716, "grad_norm": 0.5948089361190796, "learning_rate": 0.0002, "epoch": 3.912897822445561, "step": 5840}, {"loss": 1.6011, "grad_norm": 0.7210732698440552, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 5850}, {"loss": 1.5519, "grad_norm": 0.6662322878837585, "learning_rate": 0.0002, "epoch": 3.926298157453936, "step": 5860}, {"loss": 1.5656, "grad_norm": 0.5613839626312256, "learning_rate": 0.0002, "epoch": 3.932998324958124, "step": 5870}, {"loss": 1.544, "grad_norm": 0.6069002151489258, "learning_rate": 0.0002, "epoch": 3.9396984924623117, "step": 5880}, {"loss": 1.6745, "grad_norm": 0.7075562477111816, "learning_rate": 0.0002, "epoch": 3.946398659966499, "step": 5890}, {"loss": 1.5391, "grad_norm": 0.6316173076629639, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 5900}, {"loss": 1.6314, "grad_norm": 0.5716308355331421, "learning_rate": 0.0002, "epoch": 3.959798994974874, "step": 5910}, {"loss": 1.5947, "grad_norm": 0.6800096035003662, "learning_rate": 0.0002, "epoch": 3.966499162479062, "step": 5920}, {"loss": 1.5189, "grad_norm": 0.6057983040809631, "learning_rate": 0.0002, "epoch": 3.9731993299832498, "step": 5930}, {"loss": 1.5431, "grad_norm": 0.5938987731933594, "learning_rate": 0.0002, "epoch": 3.979899497487437, "step": 5940}, {"loss": 1.5111, "grad_norm": 0.6963576674461365, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 5950}, {"loss": 1.5521, "grad_norm": 0.6279940009117126, "learning_rate": 0.0002, "epoch": 3.993299832495812, "step": 5960}, {"loss": 1.5974, "grad_norm": 0.7161159515380859, "learning_rate": 0.0002, "epoch": 4.0, "step": 5970}, {"eval_loss": 1.8655421733856201, "eval_runtime": 37.9276, "eval_samples_per_second": 13.579, "eval_steps_per_second": 1.714, "epoch": 4.0, "step": 5970}, {"loss": 1.3666, "grad_norm": 0.7380476593971252, "learning_rate": 0.0002, "epoch": 4.006700167504188, "step": 5980}, {"loss": 1.3913, "grad_norm": 0.7148947715759277, "learning_rate": 0.0002, "epoch": 4.013400335008376, "step": 5990}, {"loss": 1.4204, "grad_norm": 0.6177082657814026, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 6000}, {"loss": 1.4421, "grad_norm": 0.8552946448326111, "learning_rate": 0.0002, "epoch": 4.02680067001675, "step": 6010}, {"loss": 1.4342, "grad_norm": 0.8033416271209717, "learning_rate": 0.0002, "epoch": 4.033500837520938, "step": 6020}, {"loss": 1.4092, "grad_norm": 0.8501318097114563, "learning_rate": 0.0002, "epoch": 4.040201005025126, "step": 6030}, {"loss": 1.3367, "grad_norm": 0.6981393098831177, "learning_rate": 0.0002, "epoch": 4.046901172529314, "step": 6040}, {"loss": 1.3925, "grad_norm": 0.7227180600166321, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 6050}, {"loss": 1.4007, "grad_norm": 0.6923989653587341, "learning_rate": 0.0002, "epoch": 4.060301507537688, "step": 6060}, {"loss": 1.3837, "grad_norm": 0.879779040813446, "learning_rate": 0.0002, "epoch": 4.067001675041876, "step": 6070}, {"loss": 1.4383, "grad_norm": 0.8184754848480225, "learning_rate": 0.0002, "epoch": 4.073701842546064, "step": 6080}, {"loss": 1.3128, "grad_norm": 0.8211342692375183, "learning_rate": 0.0002, "epoch": 4.080402010050252, "step": 6090}, {"loss": 1.3892, "grad_norm": 0.7542396783828735, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 6100}, {"loss": 1.3607, "grad_norm": 0.6631066799163818, "learning_rate": 0.0002, "epoch": 4.093802345058626, "step": 6110}, {"loss": 1.3275, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.100502512562814, "step": 6120}, {"loss": 1.3443, "grad_norm": 0.681851863861084, "learning_rate": 0.0002, "epoch": 4.107202680067002, "step": 6130}, {"loss": 1.3486, "grad_norm": 0.8757794499397278, "learning_rate": 0.0002, "epoch": 4.11390284757119, "step": 6140}, {"loss": 1.351, "grad_norm": 0.6567301750183105, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 6150}, {"loss": 1.3824, "grad_norm": 0.7950329184532166, "learning_rate": 0.0002, "epoch": 4.127303182579564, "step": 6160}, {"loss": 1.3738, "grad_norm": 0.7545644044876099, "learning_rate": 0.0002, "epoch": 4.134003350083752, "step": 6170}, {"loss": 1.4214, "grad_norm": 0.7172710299491882, "learning_rate": 0.0002, "epoch": 4.14070351758794, "step": 6180}, {"loss": 1.4091, "grad_norm": 0.7040584087371826, "learning_rate": 0.0002, "epoch": 4.147403685092128, "step": 6190}, {"loss": 1.4149, "grad_norm": 0.7482913732528687, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 6200}, {"loss": 1.3227, "grad_norm": 0.8523276448249817, "learning_rate": 0.0002, "epoch": 4.160804020100502, "step": 6210}, {"loss": 1.4194, "grad_norm": 0.6672041416168213, "learning_rate": 0.0002, "epoch": 4.16750418760469, "step": 6220}, {"loss": 1.3953, "grad_norm": 0.7523500919342041, "learning_rate": 0.0002, "epoch": 4.174204355108878, "step": 6230}, {"loss": 1.371, "grad_norm": 0.8085253834724426, "learning_rate": 0.0002, "epoch": 4.180904522613066, "step": 6240}, {"loss": 1.3293, "grad_norm": 0.789450466632843, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 6250}, {"loss": 1.3539, "grad_norm": 0.7502310872077942, "learning_rate": 0.0002, "epoch": 4.19430485762144, "step": 6260}, {"loss": 1.3415, "grad_norm": 0.7397456765174866, "learning_rate": 0.0002, "epoch": 4.201005025125628, "step": 6270}, {"loss": 1.3963, "grad_norm": 0.6921947002410889, "learning_rate": 0.0002, "epoch": 4.207705192629816, "step": 6280}, {"loss": 1.3125, "grad_norm": 0.9334571957588196, "learning_rate": 0.0002, "epoch": 4.214405360134004, "step": 6290}, {"loss": 1.3612, "grad_norm": 0.725799024105072, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 6300}, {"loss": 1.4217, "grad_norm": 0.8290495872497559, "learning_rate": 0.0002, "epoch": 4.227805695142378, "step": 6310}, {"loss": 1.4135, "grad_norm": 0.688983678817749, "learning_rate": 0.0002, "epoch": 4.234505862646566, "step": 6320}, {"loss": 1.3807, "grad_norm": 0.8620913028717041, "learning_rate": 0.0002, "epoch": 4.241206030150754, "step": 6330}, {"loss": 1.3738, "grad_norm": 0.8008657693862915, "learning_rate": 0.0002, "epoch": 4.247906197654942, "step": 6340}, {"loss": 1.4005, "grad_norm": 0.7379199266433716, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 6350}, {"loss": 1.426, "grad_norm": 0.7842815518379211, "learning_rate": 0.0002, "epoch": 4.261306532663316, "step": 6360}, {"loss": 1.4262, "grad_norm": 0.812600314617157, "learning_rate": 0.0002, "epoch": 4.268006700167504, "step": 6370}, {"loss": 1.4028, "grad_norm": 0.7852841019630432, "learning_rate": 0.0002, "epoch": 4.274706867671692, "step": 6380}, {"loss": 1.3722, "grad_norm": 1.0377534627914429, "learning_rate": 0.0002, "epoch": 4.28140703517588, "step": 6390}, {"loss": 1.3755, "grad_norm": 1.03935706615448, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 6400}, {"loss": 1.3961, "grad_norm": 0.7244732975959778, "learning_rate": 0.0002, "epoch": 4.294807370184254, "step": 6410}, {"loss": 1.4608, "grad_norm": 0.7137406468391418, "learning_rate": 0.0002, "epoch": 4.301507537688442, "step": 6420}, {"loss": 1.4461, "grad_norm": 0.7492543458938599, "learning_rate": 0.0002, "epoch": 4.30820770519263, "step": 6430}, {"loss": 1.4562, "grad_norm": 0.7065439224243164, "learning_rate": 0.0002, "epoch": 4.314907872696818, "step": 6440}, {"loss": 1.4246, "grad_norm": 0.7786989808082581, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 6450}, {"loss": 1.3098, "grad_norm": 0.7369208335876465, "learning_rate": 0.0002, "epoch": 4.328308207705192, "step": 6460}, {"loss": 1.3686, "grad_norm": 0.7412346005439758, "learning_rate": 0.0002, "epoch": 4.33500837520938, "step": 6470}, {"loss": 1.4087, "grad_norm": 0.780927300453186, "learning_rate": 0.0002, "epoch": 4.341708542713568, "step": 6480}, {"loss": 1.3628, "grad_norm": 0.8320930600166321, "learning_rate": 0.0002, "epoch": 4.348408710217756, "step": 6490}, {"loss": 1.3715, "grad_norm": 0.6871094703674316, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 6500}, {"loss": 1.3257, "grad_norm": 0.6751559972763062, "learning_rate": 0.0002, "epoch": 4.36180904522613, "step": 6510}, {"loss": 1.4311, "grad_norm": 0.7723976969718933, "learning_rate": 0.0002, "epoch": 4.368509212730318, "step": 6520}, {"loss": 1.4086, "grad_norm": 0.7915401458740234, "learning_rate": 0.0002, "epoch": 4.375209380234506, "step": 6530}, {"loss": 1.3973, "grad_norm": 0.7329102754592896, "learning_rate": 0.0002, "epoch": 4.381909547738694, "step": 6540}, {"loss": 1.447, "grad_norm": 0.7388760447502136, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 6550}, {"loss": 1.4378, "grad_norm": 0.8282579183578491, "learning_rate": 0.0002, "epoch": 4.3953098827470685, "step": 6560}, {"loss": 1.3923, "grad_norm": 0.7192724347114563, "learning_rate": 0.0002, "epoch": 4.402010050251256, "step": 6570}, {"loss": 1.4141, "grad_norm": 0.746526837348938, "learning_rate": 0.0002, "epoch": 4.408710217755444, "step": 6580}, {"loss": 1.33, "grad_norm": 0.8738046288490295, "learning_rate": 0.0002, "epoch": 4.415410385259632, "step": 6590}, {"loss": 1.3995, "grad_norm": 0.8408458828926086, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 6600}, {"loss": 1.4148, "grad_norm": 0.8110666275024414, "learning_rate": 0.0002, "epoch": 4.4288107202680065, "step": 6610}, {"loss": 1.441, "grad_norm": 0.8602406978607178, "learning_rate": 0.0002, "epoch": 4.435510887772194, "step": 6620}, {"loss": 1.4319, "grad_norm": 0.7549102902412415, "learning_rate": 0.0002, "epoch": 4.442211055276382, "step": 6630}, {"loss": 1.388, "grad_norm": 0.7831804156303406, "learning_rate": 0.0002, "epoch": 4.44891122278057, "step": 6640}, {"loss": 1.4283, "grad_norm": 0.7269673943519592, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 6650}, {"loss": 1.4132, "grad_norm": 0.7397838830947876, "learning_rate": 0.0002, "epoch": 4.4623115577889445, "step": 6660}, {"loss": 1.3174, "grad_norm": 0.713707447052002, "learning_rate": 0.0002, "epoch": 4.469011725293132, "step": 6670}, {"loss": 1.3406, "grad_norm": 0.7525581121444702, "learning_rate": 0.0002, "epoch": 4.47571189279732, "step": 6680}, {"loss": 1.4283, "grad_norm": 0.8030191659927368, "learning_rate": 0.0002, "epoch": 4.482412060301508, "step": 6690}, {"loss": 1.4586, "grad_norm": 0.7469439506530762, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 6700}, {"loss": 1.367, "grad_norm": 0.7743868231773376, "learning_rate": 0.0002, "epoch": 4.4958123953098825, "step": 6710}, {"loss": 1.3439, "grad_norm": 0.6539737582206726, "learning_rate": 0.0002, "epoch": 4.50251256281407, "step": 6720}, {"loss": 1.4513, "grad_norm": 0.825818657875061, "learning_rate": 0.0002, "epoch": 4.509212730318258, "step": 6730}, {"loss": 1.3984, "grad_norm": 0.8048575520515442, "learning_rate": 0.0002, "epoch": 4.515912897822446, "step": 6740}, {"loss": 1.3923, "grad_norm": 0.7828766107559204, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 6750}, {"loss": 1.3886, "grad_norm": 0.7406010031700134, "learning_rate": 0.0002, "epoch": 4.5293132328308205, "step": 6760}, {"loss": 1.3109, "grad_norm": 0.840345561504364, "learning_rate": 0.0002, "epoch": 4.536013400335008, "step": 6770}, {"loss": 1.4808, "grad_norm": 0.8492622971534729, "learning_rate": 0.0002, "epoch": 4.542713567839196, "step": 6780}, {"loss": 1.4384, "grad_norm": 0.7130163908004761, "learning_rate": 0.0002, "epoch": 4.549413735343384, "step": 6790}, {"loss": 1.4531, "grad_norm": 0.8454728126525879, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 6800}, {"loss": 1.3239, "grad_norm": 0.7847645282745361, "learning_rate": 0.0002, "epoch": 4.562814070351759, "step": 6810}, {"loss": 1.4181, "grad_norm": 0.7245864272117615, "learning_rate": 0.0002, "epoch": 4.569514237855946, "step": 6820}, {"loss": 1.3233, "grad_norm": 0.768893301486969, "learning_rate": 0.0002, "epoch": 4.576214405360134, "step": 6830}, {"loss": 1.3932, "grad_norm": 0.8028400540351868, "learning_rate": 0.0002, "epoch": 4.582914572864322, "step": 6840}, {"loss": 1.3745, "grad_norm": 0.763945460319519, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 6850}, {"loss": 1.4797, "grad_norm": 0.7417685389518738, "learning_rate": 0.0002, "epoch": 4.596314907872697, "step": 6860}, {"loss": 1.4468, "grad_norm": 0.7603038549423218, "learning_rate": 0.0002, "epoch": 4.603015075376884, "step": 6870}, {"loss": 1.4095, "grad_norm": 0.7981528043746948, "learning_rate": 0.0002, "epoch": 4.609715242881072, "step": 6880}, {"loss": 1.3963, "grad_norm": 0.8077111840248108, "learning_rate": 0.0002, "epoch": 4.61641541038526, "step": 6890}, {"loss": 1.4721, "grad_norm": 0.8778454065322876, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 6900}, {"loss": 1.3758, "grad_norm": 0.8620710372924805, "learning_rate": 0.0002, "epoch": 4.629815745393635, "step": 6910}, {"loss": 1.344, "grad_norm": 0.7486072778701782, "learning_rate": 0.0002, "epoch": 4.636515912897822, "step": 6920}, {"loss": 1.3913, "grad_norm": 0.7493042945861816, "learning_rate": 0.0002, "epoch": 4.64321608040201, "step": 6930}, {"loss": 1.397, "grad_norm": 0.7388978600502014, "learning_rate": 0.0002, "epoch": 4.649916247906198, "step": 6940}, {"loss": 1.3593, "grad_norm": 0.798530638217926, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 6950}, {"loss": 1.3982, "grad_norm": 0.7929500937461853, "learning_rate": 0.0002, "epoch": 4.663316582914573, "step": 6960}, {"loss": 1.4183, "grad_norm": 0.9186785221099854, "learning_rate": 0.0002, "epoch": 4.67001675041876, "step": 6970}, {"loss": 1.3955, "grad_norm": 1.1103485822677612, "learning_rate": 0.0002, "epoch": 4.676716917922948, "step": 6980}, {"loss": 1.3941, "grad_norm": 0.8000466823577881, "learning_rate": 0.0002, "epoch": 4.683417085427136, "step": 6990}, {"loss": 1.371, "grad_norm": 0.7520599961280823, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 7000}, {"loss": 1.4582, "grad_norm": 0.7971973419189453, "learning_rate": 0.0002, "epoch": 4.696817420435511, "step": 7010}, {"loss": 1.3682, "grad_norm": 0.7363343834877014, "learning_rate": 0.0002, "epoch": 4.703517587939698, "step": 7020}, {"loss": 1.3889, "grad_norm": 0.8268865942955017, "learning_rate": 0.0002, "epoch": 4.710217755443886, "step": 7030}, {"loss": 1.4382, "grad_norm": 0.7054963111877441, "learning_rate": 0.0002, "epoch": 4.716917922948074, "step": 7040}, {"loss": 1.4578, "grad_norm": 0.8196262121200562, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 7050}, {"loss": 1.365, "grad_norm": 0.8276031017303467, "learning_rate": 0.0002, "epoch": 4.730318257956449, "step": 7060}, {"loss": 1.3887, "grad_norm": 0.8248157501220703, "learning_rate": 0.0002, "epoch": 4.7370184254606365, "step": 7070}, {"loss": 1.4193, "grad_norm": 0.8937979936599731, "learning_rate": 0.0002, "epoch": 4.743718592964824, "step": 7080}, {"loss": 1.4334, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 4.750418760469012, "step": 7090}, {"loss": 1.4385, "grad_norm": 0.9495313763618469, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 7100}, {"loss": 1.4504, "grad_norm": 0.8598204255104065, "learning_rate": 0.0002, "epoch": 4.763819095477387, "step": 7110}, {"loss": 1.3969, "grad_norm": 0.8951472640037537, "learning_rate": 0.0002, "epoch": 4.7705192629815745, "step": 7120}, {"loss": 1.4339, "grad_norm": 0.9110309481620789, "learning_rate": 0.0002, "epoch": 4.777219430485762, "step": 7130}, {"loss": 1.4001, "grad_norm": 0.7929584980010986, "learning_rate": 0.0002, "epoch": 4.78391959798995, "step": 7140}, {"loss": 1.467, "grad_norm": 0.7415322661399841, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 7150}, {"loss": 1.5107, "grad_norm": 0.7504757046699524, "learning_rate": 0.0002, "epoch": 4.797319932998325, "step": 7160}, {"loss": 1.3736, "grad_norm": 0.7166924476623535, "learning_rate": 0.0002, "epoch": 4.8040201005025125, "step": 7170}, {"loss": 1.4088, "grad_norm": 0.7728400826454163, "learning_rate": 0.0002, "epoch": 4.8107202680067, "step": 7180}, {"loss": 1.3814, "grad_norm": 0.7992154955863953, "learning_rate": 0.0002, "epoch": 4.817420435510888, "step": 7190}, {"loss": 1.3958, "grad_norm": 0.8655321002006531, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 7200}, {"loss": 1.3837, "grad_norm": 0.7672632336616516, "learning_rate": 0.0002, "epoch": 4.830820770519263, "step": 7210}, {"loss": 1.4578, "grad_norm": 0.708416223526001, "learning_rate": 0.0002, "epoch": 4.8375209380234505, "step": 7220}, {"loss": 1.5413, "grad_norm": 0.8914081454277039, "learning_rate": 0.0002, "epoch": 4.844221105527638, "step": 7230}, {"loss": 1.3569, "grad_norm": 0.7141931653022766, "learning_rate": 0.0002, "epoch": 4.850921273031826, "step": 7240}, {"loss": 1.4532, "grad_norm": 0.6913040280342102, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 7250}, {"loss": 1.3912, "grad_norm": 0.7871233820915222, "learning_rate": 0.0002, "epoch": 4.864321608040201, "step": 7260}, {"loss": 1.3688, "grad_norm": 0.8466277122497559, "learning_rate": 0.0002, "epoch": 4.8710217755443885, "step": 7270}, {"loss": 1.33, "grad_norm": 0.8492183685302734, "learning_rate": 0.0002, "epoch": 4.877721943048576, "step": 7280}, {"loss": 1.3744, "grad_norm": 0.8339574933052063, "learning_rate": 0.0002, "epoch": 4.884422110552764, "step": 7290}, {"loss": 1.4157, "grad_norm": 0.787022590637207, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 7300}, {"loss": 1.3725, "grad_norm": 0.8877332806587219, "learning_rate": 0.0002, "epoch": 4.897822445561139, "step": 7310}, {"loss": 1.3968, "grad_norm": 0.744989812374115, "learning_rate": 0.0002, "epoch": 4.9045226130653266, "step": 7320}, {"loss": 1.4421, "grad_norm": 0.8027268648147583, "learning_rate": 0.0002, "epoch": 4.911222780569514, "step": 7330}, {"loss": 1.425, "grad_norm": 0.6437455415725708, "learning_rate": 0.0002, "epoch": 4.917922948073702, "step": 7340}, {"loss": 1.4829, "grad_norm": 0.685999870300293, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 7350}, {"loss": 1.4352, "grad_norm": 0.9086187481880188, "learning_rate": 0.0002, "epoch": 4.931323283082077, "step": 7360}, {"loss": 1.4245, "grad_norm": 0.8272411227226257, "learning_rate": 0.0002, "epoch": 4.938023450586265, "step": 7370}, {"loss": 1.4226, "grad_norm": 0.9227852821350098, "learning_rate": 0.0002, "epoch": 4.944723618090452, "step": 7380}, {"loss": 1.3643, "grad_norm": 0.7688441276550293, "learning_rate": 0.0002, "epoch": 4.95142378559464, "step": 7390}, {"loss": 1.4491, "grad_norm": 0.8662643432617188, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 7400}, {"loss": 1.4194, "grad_norm": 0.9234127998352051, "learning_rate": 0.0002, "epoch": 4.964824120603015, "step": 7410}, {"loss": 1.4009, "grad_norm": 0.9131470918655396, "learning_rate": 0.0002, "epoch": 4.971524288107203, "step": 7420}, {"loss": 1.4544, "grad_norm": 0.7377504110336304, "learning_rate": 0.0002, "epoch": 4.97822445561139, "step": 7430}, {"loss": 1.4008, "grad_norm": 0.8762801289558411, "learning_rate": 0.0002, "epoch": 4.984924623115578, "step": 7440}, {"loss": 1.4304, "grad_norm": 0.7919872999191284, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 7450}, {"loss": 1.3817, "grad_norm": 0.7144299149513245, "learning_rate": 0.0002, "epoch": 4.998324958123953, "step": 7460}]} +{"epoch": 6.0, "step": 8955, "epoch_duration": 1643.898383140564, "total_accumulated_duration": 9601.341349124908, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}, {"eval_loss": 1.8228833675384521, "eval_runtime": 37.9049, "eval_samples_per_second": 13.587, "eval_steps_per_second": 1.715, "epoch": 2.9996649916247904, "step": 4477}, {"loss": 1.6637, "grad_norm": 0.41169142723083496, "learning_rate": 0.0002, "epoch": 3.0016750418760467, "step": 4480}, {"loss": 1.5974, "grad_norm": 0.4865207374095917, "learning_rate": 0.0002, "epoch": 3.0083752093802345, "step": 4490}, {"loss": 1.5297, "grad_norm": 0.5462028384208679, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 4500}, {"loss": 1.5251, "grad_norm": 0.6169732809066772, "learning_rate": 0.0002, "epoch": 3.0217755443886096, "step": 4510}, {"loss": 1.5559, "grad_norm": 0.5667954087257385, "learning_rate": 0.0002, "epoch": 3.0284757118927974, "step": 4520}, {"loss": 1.5037, "grad_norm": 0.5758325457572937, "learning_rate": 0.0002, "epoch": 3.0351758793969847, "step": 4530}, {"loss": 1.4873, "grad_norm": 0.5220064520835876, "learning_rate": 0.0002, "epoch": 3.0418760469011725, "step": 4540}, {"loss": 1.5126, "grad_norm": 0.5469558835029602, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 4550}, {"loss": 1.4275, "grad_norm": 0.5680848956108093, "learning_rate": 0.0002, "epoch": 3.0552763819095476, "step": 4560}, {"loss": 1.5187, "grad_norm": 0.5906574726104736, "learning_rate": 0.0002, "epoch": 3.0619765494137354, "step": 4570}, {"loss": 1.4551, "grad_norm": 0.4725631773471832, "learning_rate": 0.0002, "epoch": 3.0686767169179228, "step": 4580}, {"loss": 1.5083, "grad_norm": 0.5273477435112, "learning_rate": 0.0002, "epoch": 3.0753768844221105, "step": 4590}, {"loss": 1.5154, "grad_norm": 0.5861203074455261, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 4600}, {"loss": 1.4924, "grad_norm": 0.5343965291976929, "learning_rate": 0.0002, "epoch": 3.0887772194304857, "step": 4610}, {"loss": 1.5608, "grad_norm": 0.5348150730133057, "learning_rate": 0.0002, "epoch": 3.0954773869346734, "step": 4620}, {"loss": 1.5399, "grad_norm": 0.5971846580505371, "learning_rate": 0.0002, "epoch": 3.102177554438861, "step": 4630}, {"loss": 1.4662, "grad_norm": 0.5203177332878113, "learning_rate": 0.0002, "epoch": 3.1088777219430486, "step": 4640}, {"loss": 1.5805, "grad_norm": 0.55289226770401, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 4650}, {"loss": 1.4745, "grad_norm": 0.6878530979156494, "learning_rate": 0.0002, "epoch": 3.1222780569514237, "step": 4660}, {"loss": 1.5335, "grad_norm": 0.6173256635665894, "learning_rate": 0.0002, "epoch": 3.1289782244556115, "step": 4670}, {"loss": 1.51, "grad_norm": 0.536796510219574, "learning_rate": 0.0002, "epoch": 3.135678391959799, "step": 4680}, {"loss": 1.4713, "grad_norm": 0.58846116065979, "learning_rate": 0.0002, "epoch": 3.1423785594639866, "step": 4690}, {"loss": 1.5114, "grad_norm": 0.645889401435852, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 4700}, {"loss": 1.4705, "grad_norm": 0.6118691563606262, "learning_rate": 0.0002, "epoch": 3.1557788944723617, "step": 4710}, {"loss": 1.5533, "grad_norm": 0.5189669132232666, "learning_rate": 0.0002, "epoch": 3.1624790619765495, "step": 4720}, {"loss": 1.4769, "grad_norm": 0.5794713497161865, "learning_rate": 0.0002, "epoch": 3.169179229480737, "step": 4730}, {"loss": 1.4849, "grad_norm": 0.6579326391220093, "learning_rate": 0.0002, "epoch": 3.1758793969849246, "step": 4740}, {"loss": 1.545, "grad_norm": 0.5822742581367493, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 4750}, {"loss": 1.4358, "grad_norm": 0.5475956201553345, "learning_rate": 0.0002, "epoch": 3.1892797319932997, "step": 4760}, {"loss": 1.4723, "grad_norm": 0.6743834018707275, "learning_rate": 0.0002, "epoch": 3.1959798994974875, "step": 4770}, {"loss": 1.5161, "grad_norm": 0.6110585927963257, "learning_rate": 0.0002, "epoch": 3.202680067001675, "step": 4780}, {"loss": 1.5455, "grad_norm": 0.5426181554794312, "learning_rate": 0.0002, "epoch": 3.2093802345058626, "step": 4790}, {"loss": 1.5315, "grad_norm": 0.6077824234962463, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 4800}, {"loss": 1.5314, "grad_norm": 0.5785858631134033, "learning_rate": 0.0002, "epoch": 3.2227805695142377, "step": 4810}, {"loss": 1.4041, "grad_norm": 0.6425958275794983, "learning_rate": 0.0002, "epoch": 3.2294807370184255, "step": 4820}, {"loss": 1.4751, "grad_norm": 0.6607080698013306, "learning_rate": 0.0002, "epoch": 3.236180904522613, "step": 4830}, {"loss": 1.5267, "grad_norm": 0.5385788679122925, "learning_rate": 0.0002, "epoch": 3.2428810720268006, "step": 4840}, {"loss": 1.4673, "grad_norm": 0.5630403757095337, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 4850}, {"loss": 1.5257, "grad_norm": 0.6340779662132263, "learning_rate": 0.0002, "epoch": 3.2562814070351758, "step": 4860}, {"loss": 1.5148, "grad_norm": 0.5305342674255371, "learning_rate": 0.0002, "epoch": 3.2629815745393635, "step": 4870}, {"loss": 1.5162, "grad_norm": 0.597670316696167, "learning_rate": 0.0002, "epoch": 3.2696817420435513, "step": 4880}, {"loss": 1.5429, "grad_norm": 0.665553867816925, "learning_rate": 0.0002, "epoch": 3.2763819095477387, "step": 4890}, {"loss": 1.4607, "grad_norm": 0.579767644405365, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 4900}, {"loss": 1.4999, "grad_norm": 0.5512481331825256, "learning_rate": 0.0002, "epoch": 3.289782244556114, "step": 4910}, {"loss": 1.5022, "grad_norm": 0.5916532278060913, "learning_rate": 0.0002, "epoch": 3.2964824120603016, "step": 4920}, {"loss": 1.4889, "grad_norm": 0.7521726489067078, "learning_rate": 0.0002, "epoch": 3.3031825795644894, "step": 4930}, {"loss": 1.4223, "grad_norm": 0.5352797508239746, "learning_rate": 0.0002, "epoch": 3.3098827470686767, "step": 4940}, {"loss": 1.5122, "grad_norm": 0.5950371623039246, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 4950}, {"loss": 1.5072, "grad_norm": 0.8020477890968323, "learning_rate": 0.0002, "epoch": 3.323283082077052, "step": 4960}, {"loss": 1.5422, "grad_norm": 0.6790024638175964, "learning_rate": 0.0002, "epoch": 3.3299832495812396, "step": 4970}, {"loss": 1.5363, "grad_norm": 0.687627375125885, "learning_rate": 0.0002, "epoch": 3.3366834170854274, "step": 4980}, {"loss": 1.5276, "grad_norm": 0.6094385385513306, "learning_rate": 0.0002, "epoch": 3.3433835845896147, "step": 4990}, {"loss": 1.549, "grad_norm": 0.6541242003440857, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 5000}, {"loss": 1.6067, "grad_norm": 0.5560880303382874, "learning_rate": 0.0002, "epoch": 3.35678391959799, "step": 5010}, {"loss": 1.5769, "grad_norm": 0.5440094470977783, "learning_rate": 0.0002, "epoch": 3.3634840871021776, "step": 5020}, {"loss": 1.6183, "grad_norm": 0.5749301314353943, "learning_rate": 0.0002, "epoch": 3.3701842546063654, "step": 5030}, {"loss": 1.4801, "grad_norm": 0.5919716954231262, "learning_rate": 0.0002, "epoch": 3.3768844221105527, "step": 5040}, {"loss": 1.5957, "grad_norm": 0.6331481337547302, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 5050}, {"loss": 1.5188, "grad_norm": 0.5687161684036255, "learning_rate": 0.0002, "epoch": 3.390284757118928, "step": 5060}, {"loss": 1.5702, "grad_norm": 0.6718577742576599, "learning_rate": 0.0002, "epoch": 3.3969849246231156, "step": 5070}, {"loss": 1.5577, "grad_norm": 0.5089324116706848, "learning_rate": 0.0002, "epoch": 3.4036850921273034, "step": 5080}, {"loss": 1.512, "grad_norm": 0.5710174441337585, "learning_rate": 0.0002, "epoch": 3.4103852596314908, "step": 5090}, {"loss": 1.5492, "grad_norm": 0.6670721173286438, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 5100}, {"loss": 1.5227, "grad_norm": 0.6875665187835693, "learning_rate": 0.0002, "epoch": 3.423785594639866, "step": 5110}, {"loss": 1.4496, "grad_norm": 0.5375880599021912, "learning_rate": 0.0002, "epoch": 3.4304857621440537, "step": 5120}, {"loss": 1.5527, "grad_norm": 0.6550399661064148, "learning_rate": 0.0002, "epoch": 3.4371859296482414, "step": 5130}, {"loss": 1.5687, "grad_norm": 0.5948067903518677, "learning_rate": 0.0002, "epoch": 3.4438860971524288, "step": 5140}, {"loss": 1.4813, "grad_norm": 0.6134477257728577, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 5150}, {"loss": 1.5069, "grad_norm": 0.6506398320198059, "learning_rate": 0.0002, "epoch": 3.457286432160804, "step": 5160}, {"loss": 1.4422, "grad_norm": 0.6060147881507874, "learning_rate": 0.0002, "epoch": 3.4639865996649917, "step": 5170}, {"loss": 1.5093, "grad_norm": 0.6173806190490723, "learning_rate": 0.0002, "epoch": 3.4706867671691795, "step": 5180}, {"loss": 1.4975, "grad_norm": 0.6032607555389404, "learning_rate": 0.0002, "epoch": 3.477386934673367, "step": 5190}, {"loss": 1.4979, "grad_norm": 0.5652492046356201, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 5200}, {"loss": 1.4883, "grad_norm": 0.6168607473373413, "learning_rate": 0.0002, "epoch": 3.490787269681742, "step": 5210}, {"loss": 1.5164, "grad_norm": 0.6170629262924194, "learning_rate": 0.0002, "epoch": 3.4974874371859297, "step": 5220}, {"loss": 1.4879, "grad_norm": 0.6926297545433044, "learning_rate": 0.0002, "epoch": 3.5041876046901175, "step": 5230}, {"loss": 1.4982, "grad_norm": 0.6702437996864319, "learning_rate": 0.0002, "epoch": 3.510887772194305, "step": 5240}, {"loss": 1.4986, "grad_norm": 0.5421436429023743, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 5250}, {"loss": 1.4673, "grad_norm": 0.5726765990257263, "learning_rate": 0.0002, "epoch": 3.52428810720268, "step": 5260}, {"loss": 1.5423, "grad_norm": 0.5685455203056335, "learning_rate": 0.0002, "epoch": 3.5309882747068677, "step": 5270}, {"loss": 1.4715, "grad_norm": 0.6018396019935608, "learning_rate": 0.0002, "epoch": 3.5376884422110555, "step": 5280}, {"loss": 1.5451, "grad_norm": 0.5731932520866394, "learning_rate": 0.0002, "epoch": 3.544388609715243, "step": 5290}, {"loss": 1.4752, "grad_norm": 0.6601519584655762, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 5300}, {"loss": 1.5434, "grad_norm": 0.5545530319213867, "learning_rate": 0.0002, "epoch": 3.557788944723618, "step": 5310}, {"loss": 1.5438, "grad_norm": 0.5998541116714478, "learning_rate": 0.0002, "epoch": 3.5644891122278057, "step": 5320}, {"loss": 1.56, "grad_norm": 0.5651767253875732, "learning_rate": 0.0002, "epoch": 3.5711892797319935, "step": 5330}, {"loss": 1.4829, "grad_norm": 0.7425084114074707, "learning_rate": 0.0002, "epoch": 3.577889447236181, "step": 5340}, {"loss": 1.5571, "grad_norm": 0.5770602226257324, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 5350}, {"loss": 1.458, "grad_norm": 0.54723060131073, "learning_rate": 0.0002, "epoch": 3.591289782244556, "step": 5360}, {"loss": 1.497, "grad_norm": 0.6658238172531128, "learning_rate": 0.0002, "epoch": 3.5979899497487438, "step": 5370}, {"loss": 1.5456, "grad_norm": 0.5787645578384399, "learning_rate": 0.0002, "epoch": 3.6046901172529315, "step": 5380}, {"loss": 1.5343, "grad_norm": 0.594913125038147, "learning_rate": 0.0002, "epoch": 3.611390284757119, "step": 5390}, {"loss": 1.4727, "grad_norm": 0.4964977502822876, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 5400}, {"loss": 1.5062, "grad_norm": 0.6087527275085449, "learning_rate": 0.0002, "epoch": 3.624790619765494, "step": 5410}, {"loss": 1.5098, "grad_norm": 0.6315323710441589, "learning_rate": 0.0002, "epoch": 3.6314907872696818, "step": 5420}, {"loss": 1.4855, "grad_norm": 0.574799120426178, "learning_rate": 0.0002, "epoch": 3.6381909547738696, "step": 5430}, {"loss": 1.4595, "grad_norm": 0.5949277877807617, "learning_rate": 0.0002, "epoch": 3.644891122278057, "step": 5440}, {"loss": 1.4816, "grad_norm": 0.5640677213668823, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 5450}, {"loss": 1.525, "grad_norm": 0.6198237538337708, "learning_rate": 0.0002, "epoch": 3.658291457286432, "step": 5460}, {"loss": 1.5676, "grad_norm": 0.6902034878730774, "learning_rate": 0.0002, "epoch": 3.66499162479062, "step": 5470}, {"loss": 1.544, "grad_norm": 0.5686674118041992, "learning_rate": 0.0002, "epoch": 3.6716917922948076, "step": 5480}, {"loss": 1.5255, "grad_norm": 0.6532107591629028, "learning_rate": 0.0002, "epoch": 3.678391959798995, "step": 5490}, {"loss": 1.5767, "grad_norm": 0.5790849924087524, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 5500}, {"loss": 1.5507, "grad_norm": 0.6055065393447876, "learning_rate": 0.0002, "epoch": 3.69179229480737, "step": 5510}, {"loss": 1.4656, "grad_norm": 0.5630605816841125, "learning_rate": 0.0002, "epoch": 3.698492462311558, "step": 5520}, {"loss": 1.537, "grad_norm": 0.6005825996398926, "learning_rate": 0.0002, "epoch": 3.7051926298157456, "step": 5530}, {"loss": 1.5313, "grad_norm": 0.6553038954734802, "learning_rate": 0.0002, "epoch": 3.711892797319933, "step": 5540}, {"loss": 1.4943, "grad_norm": 0.5601094961166382, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 5550}, {"loss": 1.5161, "grad_norm": 0.6598808169364929, "learning_rate": 0.0002, "epoch": 3.725293132328308, "step": 5560}, {"loss": 1.5345, "grad_norm": 0.5506255626678467, "learning_rate": 0.0002, "epoch": 3.731993299832496, "step": 5570}, {"loss": 1.4805, "grad_norm": 0.6001223921775818, "learning_rate": 0.0002, "epoch": 3.7386934673366836, "step": 5580}, {"loss": 1.4652, "grad_norm": 0.6287297606468201, "learning_rate": 0.0002, "epoch": 3.745393634840871, "step": 5590}, {"loss": 1.5246, "grad_norm": 0.6253238916397095, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 5600}, {"loss": 1.5691, "grad_norm": 0.5713174939155579, "learning_rate": 0.0002, "epoch": 3.758793969849246, "step": 5610}, {"loss": 1.5661, "grad_norm": 0.6198310852050781, "learning_rate": 0.0002, "epoch": 3.765494137353434, "step": 5620}, {"loss": 1.5448, "grad_norm": 0.5941224098205566, "learning_rate": 0.0002, "epoch": 3.7721943048576216, "step": 5630}, {"loss": 1.4925, "grad_norm": 0.606002151966095, "learning_rate": 0.0002, "epoch": 3.778894472361809, "step": 5640}, {"loss": 1.5182, "grad_norm": 0.6540704965591431, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 5650}, {"loss": 1.5903, "grad_norm": 0.6147415041923523, "learning_rate": 0.0002, "epoch": 3.792294807370184, "step": 5660}, {"loss": 1.5329, "grad_norm": 0.5649605393409729, "learning_rate": 0.0002, "epoch": 3.798994974874372, "step": 5670}, {"loss": 1.5747, "grad_norm": 0.6788773536682129, "learning_rate": 0.0002, "epoch": 3.8056951423785597, "step": 5680}, {"loss": 1.535, "grad_norm": 0.6581860780715942, "learning_rate": 0.0002, "epoch": 3.812395309882747, "step": 5690}, {"loss": 1.4587, "grad_norm": 0.5529348850250244, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 5700}, {"loss": 1.5594, "grad_norm": 0.6320232152938843, "learning_rate": 0.0002, "epoch": 3.825795644891122, "step": 5710}, {"loss": 1.4696, "grad_norm": 0.6529698371887207, "learning_rate": 0.0002, "epoch": 3.83249581239531, "step": 5720}, {"loss": 1.5854, "grad_norm": 0.5983362793922424, "learning_rate": 0.0002, "epoch": 3.8391959798994977, "step": 5730}, {"loss": 1.465, "grad_norm": 0.6335684061050415, "learning_rate": 0.0002, "epoch": 3.845896147403685, "step": 5740}, {"loss": 1.5545, "grad_norm": 0.700446605682373, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 5750}, {"loss": 1.5707, "grad_norm": 0.6092597842216492, "learning_rate": 0.0002, "epoch": 3.85929648241206, "step": 5760}, {"loss": 1.5729, "grad_norm": 0.564146101474762, "learning_rate": 0.0002, "epoch": 3.865996649916248, "step": 5770}, {"loss": 1.5872, "grad_norm": 0.615275502204895, "learning_rate": 0.0002, "epoch": 3.8726968174204357, "step": 5780}, {"loss": 1.5142, "grad_norm": 0.6685376763343811, "learning_rate": 0.0002, "epoch": 3.879396984924623, "step": 5790}, {"loss": 1.4752, "grad_norm": 0.6116922497749329, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 5800}, {"loss": 1.5179, "grad_norm": 0.5486813187599182, "learning_rate": 0.0002, "epoch": 3.892797319932998, "step": 5810}, {"loss": 1.5167, "grad_norm": 0.6208204030990601, "learning_rate": 0.0002, "epoch": 3.899497487437186, "step": 5820}, {"loss": 1.5334, "grad_norm": 0.6500625014305115, "learning_rate": 0.0002, "epoch": 3.9061976549413737, "step": 5830}, {"loss": 1.4716, "grad_norm": 0.5948089361190796, "learning_rate": 0.0002, "epoch": 3.912897822445561, "step": 5840}, {"loss": 1.6011, "grad_norm": 0.7210732698440552, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 5850}, {"loss": 1.5519, "grad_norm": 0.6662322878837585, "learning_rate": 0.0002, "epoch": 3.926298157453936, "step": 5860}, {"loss": 1.5656, "grad_norm": 0.5613839626312256, "learning_rate": 0.0002, "epoch": 3.932998324958124, "step": 5870}, {"loss": 1.544, "grad_norm": 0.6069002151489258, "learning_rate": 0.0002, "epoch": 3.9396984924623117, "step": 5880}, {"loss": 1.6745, "grad_norm": 0.7075562477111816, "learning_rate": 0.0002, "epoch": 3.946398659966499, "step": 5890}, {"loss": 1.5391, "grad_norm": 0.6316173076629639, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 5900}, {"loss": 1.6314, "grad_norm": 0.5716308355331421, "learning_rate": 0.0002, "epoch": 3.959798994974874, "step": 5910}, {"loss": 1.5947, "grad_norm": 0.6800096035003662, "learning_rate": 0.0002, "epoch": 3.966499162479062, "step": 5920}, {"loss": 1.5189, "grad_norm": 0.6057983040809631, "learning_rate": 0.0002, "epoch": 3.9731993299832498, "step": 5930}, {"loss": 1.5431, "grad_norm": 0.5938987731933594, "learning_rate": 0.0002, "epoch": 3.979899497487437, "step": 5940}, {"loss": 1.5111, "grad_norm": 0.6963576674461365, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 5950}, {"loss": 1.5521, "grad_norm": 0.6279940009117126, "learning_rate": 0.0002, "epoch": 3.993299832495812, "step": 5960}, {"loss": 1.5974, "grad_norm": 0.7161159515380859, "learning_rate": 0.0002, "epoch": 4.0, "step": 5970}, {"eval_loss": 1.8655421733856201, "eval_runtime": 37.9276, "eval_samples_per_second": 13.579, "eval_steps_per_second": 1.714, "epoch": 4.0, "step": 5970}, {"loss": 1.3666, "grad_norm": 0.7380476593971252, "learning_rate": 0.0002, "epoch": 4.006700167504188, "step": 5980}, {"loss": 1.3913, "grad_norm": 0.7148947715759277, "learning_rate": 0.0002, "epoch": 4.013400335008376, "step": 5990}, {"loss": 1.4204, "grad_norm": 0.6177082657814026, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 6000}, {"loss": 1.4421, "grad_norm": 0.8552946448326111, "learning_rate": 0.0002, "epoch": 4.02680067001675, "step": 6010}, {"loss": 1.4342, "grad_norm": 0.8033416271209717, "learning_rate": 0.0002, "epoch": 4.033500837520938, "step": 6020}, {"loss": 1.4092, "grad_norm": 0.8501318097114563, "learning_rate": 0.0002, "epoch": 4.040201005025126, "step": 6030}, {"loss": 1.3367, "grad_norm": 0.6981393098831177, "learning_rate": 0.0002, "epoch": 4.046901172529314, "step": 6040}, {"loss": 1.3925, "grad_norm": 0.7227180600166321, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 6050}, {"loss": 1.4007, "grad_norm": 0.6923989653587341, "learning_rate": 0.0002, "epoch": 4.060301507537688, "step": 6060}, {"loss": 1.3837, "grad_norm": 0.879779040813446, "learning_rate": 0.0002, "epoch": 4.067001675041876, "step": 6070}, {"loss": 1.4383, "grad_norm": 0.8184754848480225, "learning_rate": 0.0002, "epoch": 4.073701842546064, "step": 6080}, {"loss": 1.3128, "grad_norm": 0.8211342692375183, "learning_rate": 0.0002, "epoch": 4.080402010050252, "step": 6090}, {"loss": 1.3892, "grad_norm": 0.7542396783828735, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 6100}, {"loss": 1.3607, "grad_norm": 0.6631066799163818, "learning_rate": 0.0002, "epoch": 4.093802345058626, "step": 6110}, {"loss": 1.3275, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.100502512562814, "step": 6120}, {"loss": 1.3443, "grad_norm": 0.681851863861084, "learning_rate": 0.0002, "epoch": 4.107202680067002, "step": 6130}, {"loss": 1.3486, "grad_norm": 0.8757794499397278, "learning_rate": 0.0002, "epoch": 4.11390284757119, "step": 6140}, {"loss": 1.351, "grad_norm": 0.6567301750183105, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 6150}, {"loss": 1.3824, "grad_norm": 0.7950329184532166, "learning_rate": 0.0002, "epoch": 4.127303182579564, "step": 6160}, {"loss": 1.3738, "grad_norm": 0.7545644044876099, "learning_rate": 0.0002, "epoch": 4.134003350083752, "step": 6170}, {"loss": 1.4214, "grad_norm": 0.7172710299491882, "learning_rate": 0.0002, "epoch": 4.14070351758794, "step": 6180}, {"loss": 1.4091, "grad_norm": 0.7040584087371826, "learning_rate": 0.0002, "epoch": 4.147403685092128, "step": 6190}, {"loss": 1.4149, "grad_norm": 0.7482913732528687, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 6200}, {"loss": 1.3227, "grad_norm": 0.8523276448249817, "learning_rate": 0.0002, "epoch": 4.160804020100502, "step": 6210}, {"loss": 1.4194, "grad_norm": 0.6672041416168213, "learning_rate": 0.0002, "epoch": 4.16750418760469, "step": 6220}, {"loss": 1.3953, "grad_norm": 0.7523500919342041, "learning_rate": 0.0002, "epoch": 4.174204355108878, "step": 6230}, {"loss": 1.371, "grad_norm": 0.8085253834724426, "learning_rate": 0.0002, "epoch": 4.180904522613066, "step": 6240}, {"loss": 1.3293, "grad_norm": 0.789450466632843, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 6250}, {"loss": 1.3539, "grad_norm": 0.7502310872077942, "learning_rate": 0.0002, "epoch": 4.19430485762144, "step": 6260}, {"loss": 1.3415, "grad_norm": 0.7397456765174866, "learning_rate": 0.0002, "epoch": 4.201005025125628, "step": 6270}, {"loss": 1.3963, "grad_norm": 0.6921947002410889, "learning_rate": 0.0002, "epoch": 4.207705192629816, "step": 6280}, {"loss": 1.3125, "grad_norm": 0.9334571957588196, "learning_rate": 0.0002, "epoch": 4.214405360134004, "step": 6290}, {"loss": 1.3612, "grad_norm": 0.725799024105072, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 6300}, {"loss": 1.4217, "grad_norm": 0.8290495872497559, "learning_rate": 0.0002, "epoch": 4.227805695142378, "step": 6310}, {"loss": 1.4135, "grad_norm": 0.688983678817749, "learning_rate": 0.0002, "epoch": 4.234505862646566, "step": 6320}, {"loss": 1.3807, "grad_norm": 0.8620913028717041, "learning_rate": 0.0002, "epoch": 4.241206030150754, "step": 6330}, {"loss": 1.3738, "grad_norm": 0.8008657693862915, "learning_rate": 0.0002, "epoch": 4.247906197654942, "step": 6340}, {"loss": 1.4005, "grad_norm": 0.7379199266433716, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 6350}, {"loss": 1.426, "grad_norm": 0.7842815518379211, "learning_rate": 0.0002, "epoch": 4.261306532663316, "step": 6360}, {"loss": 1.4262, "grad_norm": 0.812600314617157, "learning_rate": 0.0002, "epoch": 4.268006700167504, "step": 6370}, {"loss": 1.4028, "grad_norm": 0.7852841019630432, "learning_rate": 0.0002, "epoch": 4.274706867671692, "step": 6380}, {"loss": 1.3722, "grad_norm": 1.0377534627914429, "learning_rate": 0.0002, "epoch": 4.28140703517588, "step": 6390}, {"loss": 1.3755, "grad_norm": 1.03935706615448, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 6400}, {"loss": 1.3961, "grad_norm": 0.7244732975959778, "learning_rate": 0.0002, "epoch": 4.294807370184254, "step": 6410}, {"loss": 1.4608, "grad_norm": 0.7137406468391418, "learning_rate": 0.0002, "epoch": 4.301507537688442, "step": 6420}, {"loss": 1.4461, "grad_norm": 0.7492543458938599, "learning_rate": 0.0002, "epoch": 4.30820770519263, "step": 6430}, {"loss": 1.4562, "grad_norm": 0.7065439224243164, "learning_rate": 0.0002, "epoch": 4.314907872696818, "step": 6440}, {"loss": 1.4246, "grad_norm": 0.7786989808082581, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 6450}, {"loss": 1.3098, "grad_norm": 0.7369208335876465, "learning_rate": 0.0002, "epoch": 4.328308207705192, "step": 6460}, {"loss": 1.3686, "grad_norm": 0.7412346005439758, "learning_rate": 0.0002, "epoch": 4.33500837520938, "step": 6470}, {"loss": 1.4087, "grad_norm": 0.780927300453186, "learning_rate": 0.0002, "epoch": 4.341708542713568, "step": 6480}, {"loss": 1.3628, "grad_norm": 0.8320930600166321, "learning_rate": 0.0002, "epoch": 4.348408710217756, "step": 6490}, {"loss": 1.3715, "grad_norm": 0.6871094703674316, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 6500}, {"loss": 1.3257, "grad_norm": 0.6751559972763062, "learning_rate": 0.0002, "epoch": 4.36180904522613, "step": 6510}, {"loss": 1.4311, "grad_norm": 0.7723976969718933, "learning_rate": 0.0002, "epoch": 4.368509212730318, "step": 6520}, {"loss": 1.4086, "grad_norm": 0.7915401458740234, "learning_rate": 0.0002, "epoch": 4.375209380234506, "step": 6530}, {"loss": 1.3973, "grad_norm": 0.7329102754592896, "learning_rate": 0.0002, "epoch": 4.381909547738694, "step": 6540}, {"loss": 1.447, "grad_norm": 0.7388760447502136, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 6550}, {"loss": 1.4378, "grad_norm": 0.8282579183578491, "learning_rate": 0.0002, "epoch": 4.3953098827470685, "step": 6560}, {"loss": 1.3923, "grad_norm": 0.7192724347114563, "learning_rate": 0.0002, "epoch": 4.402010050251256, "step": 6570}, {"loss": 1.4141, "grad_norm": 0.746526837348938, "learning_rate": 0.0002, "epoch": 4.408710217755444, "step": 6580}, {"loss": 1.33, "grad_norm": 0.8738046288490295, "learning_rate": 0.0002, "epoch": 4.415410385259632, "step": 6590}, {"loss": 1.3995, "grad_norm": 0.8408458828926086, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 6600}, {"loss": 1.4148, "grad_norm": 0.8110666275024414, "learning_rate": 0.0002, "epoch": 4.4288107202680065, "step": 6610}, {"loss": 1.441, "grad_norm": 0.8602406978607178, "learning_rate": 0.0002, "epoch": 4.435510887772194, "step": 6620}, {"loss": 1.4319, "grad_norm": 0.7549102902412415, "learning_rate": 0.0002, "epoch": 4.442211055276382, "step": 6630}, {"loss": 1.388, "grad_norm": 0.7831804156303406, "learning_rate": 0.0002, "epoch": 4.44891122278057, "step": 6640}, {"loss": 1.4283, "grad_norm": 0.7269673943519592, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 6650}, {"loss": 1.4132, "grad_norm": 0.7397838830947876, "learning_rate": 0.0002, "epoch": 4.4623115577889445, "step": 6660}, {"loss": 1.3174, "grad_norm": 0.713707447052002, "learning_rate": 0.0002, "epoch": 4.469011725293132, "step": 6670}, {"loss": 1.3406, "grad_norm": 0.7525581121444702, "learning_rate": 0.0002, "epoch": 4.47571189279732, "step": 6680}, {"loss": 1.4283, "grad_norm": 0.8030191659927368, "learning_rate": 0.0002, "epoch": 4.482412060301508, "step": 6690}, {"loss": 1.4586, "grad_norm": 0.7469439506530762, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 6700}, {"loss": 1.367, "grad_norm": 0.7743868231773376, "learning_rate": 0.0002, "epoch": 4.4958123953098825, "step": 6710}, {"loss": 1.3439, "grad_norm": 0.6539737582206726, "learning_rate": 0.0002, "epoch": 4.50251256281407, "step": 6720}, {"loss": 1.4513, "grad_norm": 0.825818657875061, "learning_rate": 0.0002, "epoch": 4.509212730318258, "step": 6730}, {"loss": 1.3984, "grad_norm": 0.8048575520515442, "learning_rate": 0.0002, "epoch": 4.515912897822446, "step": 6740}, {"loss": 1.3923, "grad_norm": 0.7828766107559204, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 6750}, {"loss": 1.3886, "grad_norm": 0.7406010031700134, "learning_rate": 0.0002, "epoch": 4.5293132328308205, "step": 6760}, {"loss": 1.3109, "grad_norm": 0.840345561504364, "learning_rate": 0.0002, "epoch": 4.536013400335008, "step": 6770}, {"loss": 1.4808, "grad_norm": 0.8492622971534729, "learning_rate": 0.0002, "epoch": 4.542713567839196, "step": 6780}, {"loss": 1.4384, "grad_norm": 0.7130163908004761, "learning_rate": 0.0002, "epoch": 4.549413735343384, "step": 6790}, {"loss": 1.4531, "grad_norm": 0.8454728126525879, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 6800}, {"loss": 1.3239, "grad_norm": 0.7847645282745361, "learning_rate": 0.0002, "epoch": 4.562814070351759, "step": 6810}, {"loss": 1.4181, "grad_norm": 0.7245864272117615, "learning_rate": 0.0002, "epoch": 4.569514237855946, "step": 6820}, {"loss": 1.3233, "grad_norm": 0.768893301486969, "learning_rate": 0.0002, "epoch": 4.576214405360134, "step": 6830}, {"loss": 1.3932, "grad_norm": 0.8028400540351868, "learning_rate": 0.0002, "epoch": 4.582914572864322, "step": 6840}, {"loss": 1.3745, "grad_norm": 0.763945460319519, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 6850}, {"loss": 1.4797, "grad_norm": 0.7417685389518738, "learning_rate": 0.0002, "epoch": 4.596314907872697, "step": 6860}, {"loss": 1.4468, "grad_norm": 0.7603038549423218, "learning_rate": 0.0002, "epoch": 4.603015075376884, "step": 6870}, {"loss": 1.4095, "grad_norm": 0.7981528043746948, "learning_rate": 0.0002, "epoch": 4.609715242881072, "step": 6880}, {"loss": 1.3963, "grad_norm": 0.8077111840248108, "learning_rate": 0.0002, "epoch": 4.61641541038526, "step": 6890}, {"loss": 1.4721, "grad_norm": 0.8778454065322876, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 6900}, {"loss": 1.3758, "grad_norm": 0.8620710372924805, "learning_rate": 0.0002, "epoch": 4.629815745393635, "step": 6910}, {"loss": 1.344, "grad_norm": 0.7486072778701782, "learning_rate": 0.0002, "epoch": 4.636515912897822, "step": 6920}, {"loss": 1.3913, "grad_norm": 0.7493042945861816, "learning_rate": 0.0002, "epoch": 4.64321608040201, "step": 6930}, {"loss": 1.397, "grad_norm": 0.7388978600502014, "learning_rate": 0.0002, "epoch": 4.649916247906198, "step": 6940}, {"loss": 1.3593, "grad_norm": 0.798530638217926, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 6950}, {"loss": 1.3982, "grad_norm": 0.7929500937461853, "learning_rate": 0.0002, "epoch": 4.663316582914573, "step": 6960}, {"loss": 1.4183, "grad_norm": 0.9186785221099854, "learning_rate": 0.0002, "epoch": 4.67001675041876, "step": 6970}, {"loss": 1.3955, "grad_norm": 1.1103485822677612, "learning_rate": 0.0002, "epoch": 4.676716917922948, "step": 6980}, {"loss": 1.3941, "grad_norm": 0.8000466823577881, "learning_rate": 0.0002, "epoch": 4.683417085427136, "step": 6990}, {"loss": 1.371, "grad_norm": 0.7520599961280823, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 7000}, {"loss": 1.4582, "grad_norm": 0.7971973419189453, "learning_rate": 0.0002, "epoch": 4.696817420435511, "step": 7010}, {"loss": 1.3682, "grad_norm": 0.7363343834877014, "learning_rate": 0.0002, "epoch": 4.703517587939698, "step": 7020}, {"loss": 1.3889, "grad_norm": 0.8268865942955017, "learning_rate": 0.0002, "epoch": 4.710217755443886, "step": 7030}, {"loss": 1.4382, "grad_norm": 0.7054963111877441, "learning_rate": 0.0002, "epoch": 4.716917922948074, "step": 7040}, {"loss": 1.4578, "grad_norm": 0.8196262121200562, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 7050}, {"loss": 1.365, "grad_norm": 0.8276031017303467, "learning_rate": 0.0002, "epoch": 4.730318257956449, "step": 7060}, {"loss": 1.3887, "grad_norm": 0.8248157501220703, "learning_rate": 0.0002, "epoch": 4.7370184254606365, "step": 7070}, {"loss": 1.4193, "grad_norm": 0.8937979936599731, "learning_rate": 0.0002, "epoch": 4.743718592964824, "step": 7080}, {"loss": 1.4334, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 4.750418760469012, "step": 7090}, {"loss": 1.4385, "grad_norm": 0.9495313763618469, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 7100}, {"loss": 1.4504, "grad_norm": 0.8598204255104065, "learning_rate": 0.0002, "epoch": 4.763819095477387, "step": 7110}, {"loss": 1.3969, "grad_norm": 0.8951472640037537, "learning_rate": 0.0002, "epoch": 4.7705192629815745, "step": 7120}, {"loss": 1.4339, "grad_norm": 0.9110309481620789, "learning_rate": 0.0002, "epoch": 4.777219430485762, "step": 7130}, {"loss": 1.4001, "grad_norm": 0.7929584980010986, "learning_rate": 0.0002, "epoch": 4.78391959798995, "step": 7140}, {"loss": 1.467, "grad_norm": 0.7415322661399841, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 7150}, {"loss": 1.5107, "grad_norm": 0.7504757046699524, "learning_rate": 0.0002, "epoch": 4.797319932998325, "step": 7160}, {"loss": 1.3736, "grad_norm": 0.7166924476623535, "learning_rate": 0.0002, "epoch": 4.8040201005025125, "step": 7170}, {"loss": 1.4088, "grad_norm": 0.7728400826454163, "learning_rate": 0.0002, "epoch": 4.8107202680067, "step": 7180}, {"loss": 1.3814, "grad_norm": 0.7992154955863953, "learning_rate": 0.0002, "epoch": 4.817420435510888, "step": 7190}, {"loss": 1.3958, "grad_norm": 0.8655321002006531, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 7200}, {"loss": 1.3837, "grad_norm": 0.7672632336616516, "learning_rate": 0.0002, "epoch": 4.830820770519263, "step": 7210}, {"loss": 1.4578, "grad_norm": 0.708416223526001, "learning_rate": 0.0002, "epoch": 4.8375209380234505, "step": 7220}, {"loss": 1.5413, "grad_norm": 0.8914081454277039, "learning_rate": 0.0002, "epoch": 4.844221105527638, "step": 7230}, {"loss": 1.3569, "grad_norm": 0.7141931653022766, "learning_rate": 0.0002, "epoch": 4.850921273031826, "step": 7240}, {"loss": 1.4532, "grad_norm": 0.6913040280342102, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 7250}, {"loss": 1.3912, "grad_norm": 0.7871233820915222, "learning_rate": 0.0002, "epoch": 4.864321608040201, "step": 7260}, {"loss": 1.3688, "grad_norm": 0.8466277122497559, "learning_rate": 0.0002, "epoch": 4.8710217755443885, "step": 7270}, {"loss": 1.33, "grad_norm": 0.8492183685302734, "learning_rate": 0.0002, "epoch": 4.877721943048576, "step": 7280}, {"loss": 1.3744, "grad_norm": 0.8339574933052063, "learning_rate": 0.0002, "epoch": 4.884422110552764, "step": 7290}, {"loss": 1.4157, "grad_norm": 0.787022590637207, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 7300}, {"loss": 1.3725, "grad_norm": 0.8877332806587219, "learning_rate": 0.0002, "epoch": 4.897822445561139, "step": 7310}, {"loss": 1.3968, "grad_norm": 0.744989812374115, "learning_rate": 0.0002, "epoch": 4.9045226130653266, "step": 7320}, {"loss": 1.4421, "grad_norm": 0.8027268648147583, "learning_rate": 0.0002, "epoch": 4.911222780569514, "step": 7330}, {"loss": 1.425, "grad_norm": 0.6437455415725708, "learning_rate": 0.0002, "epoch": 4.917922948073702, "step": 7340}, {"loss": 1.4829, "grad_norm": 0.685999870300293, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 7350}, {"loss": 1.4352, "grad_norm": 0.9086187481880188, "learning_rate": 0.0002, "epoch": 4.931323283082077, "step": 7360}, {"loss": 1.4245, "grad_norm": 0.8272411227226257, "learning_rate": 0.0002, "epoch": 4.938023450586265, "step": 7370}, {"loss": 1.4226, "grad_norm": 0.9227852821350098, "learning_rate": 0.0002, "epoch": 4.944723618090452, "step": 7380}, {"loss": 1.3643, "grad_norm": 0.7688441276550293, "learning_rate": 0.0002, "epoch": 4.95142378559464, "step": 7390}, {"loss": 1.4491, "grad_norm": 0.8662643432617188, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 7400}, {"loss": 1.4194, "grad_norm": 0.9234127998352051, "learning_rate": 0.0002, "epoch": 4.964824120603015, "step": 7410}, {"loss": 1.4009, "grad_norm": 0.9131470918655396, "learning_rate": 0.0002, "epoch": 4.971524288107203, "step": 7420}, {"loss": 1.4544, "grad_norm": 0.7377504110336304, "learning_rate": 0.0002, "epoch": 4.97822445561139, "step": 7430}, {"loss": 1.4008, "grad_norm": 0.8762801289558411, "learning_rate": 0.0002, "epoch": 4.984924623115578, "step": 7440}, {"loss": 1.4304, "grad_norm": 0.7919872999191284, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 7450}, {"loss": 1.3817, "grad_norm": 0.7144299149513245, "learning_rate": 0.0002, "epoch": 4.998324958123953, "step": 7460}, {"eval_loss": 1.9291157722473145, "eval_runtime": 37.9831, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.99966499162479, "step": 7462}, {"loss": 1.2753, "grad_norm": 0.7860151529312134, "learning_rate": 0.0002, "epoch": 5.005025125628141, "step": 7470}, {"loss": 1.2149, "grad_norm": 0.9418314695358276, "learning_rate": 0.0002, "epoch": 5.011725293132328, "step": 7480}, {"loss": 1.1966, "grad_norm": 0.8474572896957397, "learning_rate": 0.0002, "epoch": 5.018425460636516, "step": 7490}, {"loss": 1.2111, "grad_norm": 1.0724040269851685, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 7500}, {"loss": 1.2228, "grad_norm": 0.9109148979187012, "learning_rate": 0.0002, "epoch": 5.031825795644891, "step": 7510}, {"loss": 1.2239, "grad_norm": 1.0088659524917603, "learning_rate": 0.0002, "epoch": 5.038525963149079, "step": 7520}, {"loss": 1.2156, "grad_norm": 1.1421623229980469, "learning_rate": 0.0002, "epoch": 5.045226130653266, "step": 7530}, {"loss": 1.1739, "grad_norm": 0.9219902157783508, "learning_rate": 0.0002, "epoch": 5.051926298157454, "step": 7540}, {"loss": 1.2686, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 7550}, {"loss": 1.2068, "grad_norm": 0.8889328241348267, "learning_rate": 0.0002, "epoch": 5.065326633165829, "step": 7560}, {"loss": 1.276, "grad_norm": 0.9751363396644592, "learning_rate": 0.0002, "epoch": 5.072026800670017, "step": 7570}, {"loss": 1.2078, "grad_norm": 0.8603123426437378, "learning_rate": 0.0002, "epoch": 5.078726968174204, "step": 7580}, {"loss": 1.2175, "grad_norm": 0.8910616636276245, "learning_rate": 0.0002, "epoch": 5.085427135678392, "step": 7590}, {"loss": 1.2475, "grad_norm": 1.1128392219543457, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 7600}, {"loss": 1.3065, "grad_norm": 0.9480258822441101, "learning_rate": 0.0002, "epoch": 5.098827470686767, "step": 7610}, {"loss": 1.193, "grad_norm": 0.906958818435669, "learning_rate": 0.0002, "epoch": 5.105527638190955, "step": 7620}, {"loss": 1.2223, "grad_norm": 0.8741167187690735, "learning_rate": 0.0002, "epoch": 5.1122278056951425, "step": 7630}, {"loss": 1.2126, "grad_norm": 0.966268002986908, "learning_rate": 0.0002, "epoch": 5.11892797319933, "step": 7640}, {"loss": 1.2782, "grad_norm": 0.9124358892440796, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 7650}, {"loss": 1.3004, "grad_norm": 1.0436606407165527, "learning_rate": 0.0002, "epoch": 5.132328308207705, "step": 7660}, {"loss": 1.2675, "grad_norm": 0.9217309355735779, "learning_rate": 0.0002, "epoch": 5.139028475711893, "step": 7670}, {"loss": 1.2502, "grad_norm": 1.344765543937683, "learning_rate": 0.0002, "epoch": 5.1457286432160805, "step": 7680}, {"loss": 1.2416, "grad_norm": 1.0730723142623901, "learning_rate": 0.0002, "epoch": 5.152428810720268, "step": 7690}, {"loss": 1.1888, "grad_norm": 0.9321247339248657, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 7700}, {"loss": 1.1941, "grad_norm": 0.8482614755630493, "learning_rate": 0.0002, "epoch": 5.165829145728643, "step": 7710}, {"loss": 1.2668, "grad_norm": 0.8274452686309814, "learning_rate": 0.0002, "epoch": 5.172529313232831, "step": 7720}, {"loss": 1.1972, "grad_norm": 0.9120376706123352, "learning_rate": 0.0002, "epoch": 5.1792294807370185, "step": 7730}, {"loss": 1.1648, "grad_norm": 1.0062892436981201, "learning_rate": 0.0002, "epoch": 5.185929648241206, "step": 7740}, {"loss": 1.2199, "grad_norm": 0.9521504640579224, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 7750}, {"loss": 1.2855, "grad_norm": 0.8800198435783386, "learning_rate": 0.0002, "epoch": 5.199329983249581, "step": 7760}, {"loss": 1.2535, "grad_norm": 0.9749179482460022, "learning_rate": 0.0002, "epoch": 5.206030150753769, "step": 7770}, {"loss": 1.2975, "grad_norm": 0.9441686868667603, "learning_rate": 0.0002, "epoch": 5.2127303182579565, "step": 7780}, {"loss": 1.256, "grad_norm": 0.9114066362380981, "learning_rate": 0.0002, "epoch": 5.219430485762144, "step": 7790}, {"loss": 1.2621, "grad_norm": 0.9851446151733398, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 7800}, {"loss": 1.2502, "grad_norm": 0.9526297450065613, "learning_rate": 0.0002, "epoch": 5.232830820770519, "step": 7810}, {"loss": 1.1502, "grad_norm": 1.05986487865448, "learning_rate": 0.0002, "epoch": 5.239530988274707, "step": 7820}, {"loss": 1.2517, "grad_norm": 0.8956538438796997, "learning_rate": 0.0002, "epoch": 5.2462311557788945, "step": 7830}, {"loss": 1.2556, "grad_norm": 0.9568153619766235, "learning_rate": 0.0002, "epoch": 5.252931323283082, "step": 7840}, {"loss": 1.2442, "grad_norm": 1.0035018920898438, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 7850}, {"loss": 1.2605, "grad_norm": 0.8554368615150452, "learning_rate": 0.0002, "epoch": 5.266331658291457, "step": 7860}, {"loss": 1.2799, "grad_norm": 0.9677708148956299, "learning_rate": 0.0002, "epoch": 5.273031825795645, "step": 7870}, {"loss": 1.275, "grad_norm": 0.943606436252594, "learning_rate": 0.0002, "epoch": 5.279731993299833, "step": 7880}, {"loss": 1.2335, "grad_norm": 1.0029335021972656, "learning_rate": 0.0002, "epoch": 5.28643216080402, "step": 7890}, {"loss": 1.2494, "grad_norm": 1.0164015293121338, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 7900}, {"loss": 1.3117, "grad_norm": 0.8908365368843079, "learning_rate": 0.0002, "epoch": 5.299832495812395, "step": 7910}, {"loss": 1.2832, "grad_norm": 0.9307826161384583, "learning_rate": 0.0002, "epoch": 5.306532663316583, "step": 7920}, {"loss": 1.242, "grad_norm": 1.0730371475219727, "learning_rate": 0.0002, "epoch": 5.313232830820771, "step": 7930}, {"loss": 1.2003, "grad_norm": 0.844739556312561, "learning_rate": 0.0002, "epoch": 5.319932998324958, "step": 7940}, {"loss": 1.2688, "grad_norm": 1.275833010673523, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 7950}, {"loss": 1.2957, "grad_norm": 0.9042661190032959, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 7960}, {"loss": 1.2912, "grad_norm": 0.9374269247055054, "learning_rate": 0.0002, "epoch": 5.340033500837521, "step": 7970}, {"loss": 1.2721, "grad_norm": 1.033098578453064, "learning_rate": 0.0002, "epoch": 5.346733668341709, "step": 7980}, {"loss": 1.3208, "grad_norm": 1.062775731086731, "learning_rate": 0.0002, "epoch": 5.353433835845896, "step": 7990}, {"loss": 1.3065, "grad_norm": 1.1064317226409912, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 8000}, {"loss": 1.2341, "grad_norm": 1.1114039421081543, "learning_rate": 0.0002, "epoch": 5.366834170854271, "step": 8010}, {"loss": 1.2255, "grad_norm": 1.0198014974594116, "learning_rate": 0.0002, "epoch": 5.373534338358459, "step": 8020}, {"loss": 1.2433, "grad_norm": 0.8443173170089722, "learning_rate": 0.0002, "epoch": 5.380234505862647, "step": 8030}, {"loss": 1.206, "grad_norm": 1.000881314277649, "learning_rate": 0.0002, "epoch": 5.386934673366834, "step": 8040}, {"loss": 1.2982, "grad_norm": 0.9874443411827087, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 8050}, {"loss": 1.2289, "grad_norm": 0.9895344972610474, "learning_rate": 0.0002, "epoch": 5.400335008375209, "step": 8060}, {"loss": 1.249, "grad_norm": 0.8595236539840698, "learning_rate": 0.0002, "epoch": 5.407035175879397, "step": 8070}, {"loss": 1.2308, "grad_norm": 0.9523849487304688, "learning_rate": 0.0002, "epoch": 5.413735343383585, "step": 8080}, {"loss": 1.2343, "grad_norm": 1.0560476779937744, "learning_rate": 0.0002, "epoch": 5.420435510887772, "step": 8090}, {"loss": 1.2956, "grad_norm": 1.0893689393997192, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 8100}, {"loss": 1.2846, "grad_norm": 0.9395513534545898, "learning_rate": 0.0002, "epoch": 5.433835845896147, "step": 8110}, {"loss": 1.3444, "grad_norm": 0.9364215135574341, "learning_rate": 0.0002, "epoch": 5.440536013400335, "step": 8120}, {"loss": 1.2944, "grad_norm": 0.9502208232879639, "learning_rate": 0.0002, "epoch": 5.447236180904523, "step": 8130}, {"loss": 1.2971, "grad_norm": 0.9559208154678345, "learning_rate": 0.0002, "epoch": 5.45393634840871, "step": 8140}, {"loss": 1.2495, "grad_norm": 0.9261730313301086, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 8150}, {"loss": 1.2599, "grad_norm": 0.9832326173782349, "learning_rate": 0.0002, "epoch": 5.467336683417085, "step": 8160}, {"loss": 1.2771, "grad_norm": 1.065953016281128, "learning_rate": 0.0002, "epoch": 5.474036850921273, "step": 8170}, {"loss": 1.3617, "grad_norm": 0.9139469861984253, "learning_rate": 0.0002, "epoch": 5.480737018425461, "step": 8180}, {"loss": 1.2197, "grad_norm": 1.2322484254837036, "learning_rate": 0.0002, "epoch": 5.4874371859296485, "step": 8190}, {"loss": 1.2879, "grad_norm": 0.9722974896430969, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 8200}, {"loss": 1.2664, "grad_norm": 0.9338926076889038, "learning_rate": 0.0002, "epoch": 5.500837520938023, "step": 8210}, {"loss": 1.2128, "grad_norm": 0.9283728003501892, "learning_rate": 0.0002, "epoch": 5.507537688442211, "step": 8220}, {"loss": 1.2141, "grad_norm": 1.0489585399627686, "learning_rate": 0.0002, "epoch": 5.514237855946399, "step": 8230}, {"loss": 1.3257, "grad_norm": 0.9881814122200012, "learning_rate": 0.0002, "epoch": 5.5209380234505865, "step": 8240}, {"loss": 1.2221, "grad_norm": 0.9274460077285767, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 8250}, {"loss": 1.2241, "grad_norm": 0.8650718331336975, "learning_rate": 0.0002, "epoch": 5.534338358458961, "step": 8260}, {"loss": 1.2462, "grad_norm": 1.014069676399231, "learning_rate": 0.0002, "epoch": 5.541038525963149, "step": 8270}, {"loss": 1.3502, "grad_norm": 0.9212974905967712, "learning_rate": 0.0002, "epoch": 5.547738693467337, "step": 8280}, {"loss": 1.2779, "grad_norm": 1.1235398054122925, "learning_rate": 0.0002, "epoch": 5.5544388609715245, "step": 8290}, {"loss": 1.306, "grad_norm": 0.961954653263092, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 8300}, {"loss": 1.2946, "grad_norm": 0.9386700391769409, "learning_rate": 0.0002, "epoch": 5.567839195979899, "step": 8310}, {"loss": 1.313, "grad_norm": 1.01912522315979, "learning_rate": 0.0002, "epoch": 5.574539363484087, "step": 8320}, {"loss": 1.3121, "grad_norm": 0.9851216077804565, "learning_rate": 0.0002, "epoch": 5.581239530988275, "step": 8330}, {"loss": 1.3071, "grad_norm": 1.0138001441955566, "learning_rate": 0.0002, "epoch": 5.5879396984924625, "step": 8340}, {"loss": 1.2257, "grad_norm": 0.9262447357177734, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 8350}, {"loss": 1.2473, "grad_norm": 1.1322970390319824, "learning_rate": 0.0002, "epoch": 5.601340033500837, "step": 8360}, {"loss": 1.3098, "grad_norm": 1.1429349184036255, "learning_rate": 0.0002, "epoch": 5.608040201005025, "step": 8370}, {"loss": 1.2686, "grad_norm": 0.9130118489265442, "learning_rate": 0.0002, "epoch": 5.614740368509213, "step": 8380}, {"loss": 1.2541, "grad_norm": 0.9651545882225037, "learning_rate": 0.0002, "epoch": 5.6214405360134005, "step": 8390}, {"loss": 1.2799, "grad_norm": 0.9595398306846619, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 8400}, {"loss": 1.3429, "grad_norm": 1.0049372911453247, "learning_rate": 0.0002, "epoch": 5.634840871021775, "step": 8410}, {"loss": 1.3224, "grad_norm": 1.082804560661316, "learning_rate": 0.0002, "epoch": 5.641541038525963, "step": 8420}, {"loss": 1.297, "grad_norm": 0.9489204287528992, "learning_rate": 0.0002, "epoch": 5.648241206030151, "step": 8430}, {"loss": 1.3424, "grad_norm": 0.9470235109329224, "learning_rate": 0.0002, "epoch": 5.654941373534339, "step": 8440}, {"loss": 1.3358, "grad_norm": 1.0662927627563477, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 8450}, {"loss": 1.2973, "grad_norm": 0.9097877740859985, "learning_rate": 0.0002, "epoch": 5.668341708542713, "step": 8460}, {"loss": 1.3072, "grad_norm": 0.9740368127822876, "learning_rate": 0.0002, "epoch": 5.675041876046901, "step": 8470}, {"loss": 1.286, "grad_norm": 0.9878810048103333, "learning_rate": 0.0002, "epoch": 5.681742043551089, "step": 8480}, {"loss": 1.208, "grad_norm": 1.148260474205017, "learning_rate": 0.0002, "epoch": 5.688442211055277, "step": 8490}, {"loss": 1.2842, "grad_norm": 0.9632558822631836, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 8500}, {"loss": 1.2787, "grad_norm": 0.876812756061554, "learning_rate": 0.0002, "epoch": 5.701842546063651, "step": 8510}, {"loss": 1.3186, "grad_norm": 1.0730829238891602, "learning_rate": 0.0002, "epoch": 5.708542713567839, "step": 8520}, {"loss": 1.2856, "grad_norm": 1.2239218950271606, "learning_rate": 0.0002, "epoch": 5.715242881072027, "step": 8530}, {"loss": 1.2717, "grad_norm": 0.9460835456848145, "learning_rate": 0.0002, "epoch": 5.721943048576215, "step": 8540}, {"loss": 1.3509, "grad_norm": 0.9086270928382874, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 8550}, {"loss": 1.2971, "grad_norm": 1.0258867740631104, "learning_rate": 0.0002, "epoch": 5.735343383584589, "step": 8560}, {"loss": 1.3581, "grad_norm": 1.0543923377990723, "learning_rate": 0.0002, "epoch": 5.742043551088777, "step": 8570}, {"loss": 1.2988, "grad_norm": 0.9063900113105774, "learning_rate": 0.0002, "epoch": 5.748743718592965, "step": 8580}, {"loss": 1.3535, "grad_norm": 1.1838830709457397, "learning_rate": 0.0002, "epoch": 5.755443886097153, "step": 8590}, {"loss": 1.2655, "grad_norm": 0.9631859064102173, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 8600}, {"loss": 1.276, "grad_norm": 0.9702655673027039, "learning_rate": 0.0002, "epoch": 5.768844221105527, "step": 8610}, {"loss": 1.3196, "grad_norm": 1.0591435432434082, "learning_rate": 0.0002, "epoch": 5.775544388609715, "step": 8620}, {"loss": 1.267, "grad_norm": 0.9989570379257202, "learning_rate": 0.0002, "epoch": 5.782244556113903, "step": 8630}, {"loss": 1.3227, "grad_norm": 1.0836435556411743, "learning_rate": 0.0002, "epoch": 5.788944723618091, "step": 8640}, {"loss": 1.3334, "grad_norm": 0.8832896947860718, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 8650}, {"loss": 1.3214, "grad_norm": 1.0104607343673706, "learning_rate": 0.0002, "epoch": 5.802345058626465, "step": 8660}, {"loss": 1.2703, "grad_norm": 0.8375084400177002, "learning_rate": 0.0002, "epoch": 5.809045226130653, "step": 8670}, {"loss": 1.3554, "grad_norm": 1.1300716400146484, "learning_rate": 0.0002, "epoch": 5.815745393634841, "step": 8680}, {"loss": 1.3468, "grad_norm": 0.9311910271644592, "learning_rate": 0.0002, "epoch": 5.822445561139029, "step": 8690}, {"loss": 1.2749, "grad_norm": 0.9488391876220703, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 8700}, {"loss": 1.2281, "grad_norm": 0.9747629761695862, "learning_rate": 0.0002, "epoch": 5.835845896147403, "step": 8710}, {"loss": 1.2923, "grad_norm": 1.1029598712921143, "learning_rate": 0.0002, "epoch": 5.842546063651591, "step": 8720}, {"loss": 1.3613, "grad_norm": 1.0396875143051147, "learning_rate": 0.0002, "epoch": 5.849246231155779, "step": 8730}, {"loss": 1.3272, "grad_norm": 0.9259780645370483, "learning_rate": 0.0002, "epoch": 5.855946398659967, "step": 8740}, {"loss": 1.3236, "grad_norm": 1.020033597946167, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 8750}, {"loss": 1.3453, "grad_norm": 0.9191218614578247, "learning_rate": 0.0002, "epoch": 5.869346733668341, "step": 8760}, {"loss": 1.3012, "grad_norm": 1.1093107461929321, "learning_rate": 0.0002, "epoch": 5.876046901172529, "step": 8770}, {"loss": 1.2718, "grad_norm": 1.1626793146133423, "learning_rate": 0.0002, "epoch": 5.882747068676717, "step": 8780}, {"loss": 1.2969, "grad_norm": 0.9542945027351379, "learning_rate": 0.0002, "epoch": 5.889447236180905, "step": 8790}, {"loss": 1.3134, "grad_norm": 0.9086058139801025, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 8800}, {"loss": 1.2731, "grad_norm": 0.9249639511108398, "learning_rate": 0.0002, "epoch": 5.902847571189279, "step": 8810}, {"loss": 1.337, "grad_norm": 0.9414396286010742, "learning_rate": 0.0002, "epoch": 5.909547738693467, "step": 8820}, {"loss": 1.2865, "grad_norm": 0.9086037874221802, "learning_rate": 0.0002, "epoch": 5.916247906197655, "step": 8830}, {"loss": 1.2756, "grad_norm": 0.8685907125473022, "learning_rate": 0.0002, "epoch": 5.922948073701843, "step": 8840}, {"loss": 1.297, "grad_norm": 1.036419153213501, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 8850}, {"loss": 1.3207, "grad_norm": 1.0183674097061157, "learning_rate": 0.0002, "epoch": 5.936348408710217, "step": 8860}, {"loss": 1.3922, "grad_norm": 0.966444194316864, "learning_rate": 0.0002, "epoch": 5.943048576214405, "step": 8870}, {"loss": 1.333, "grad_norm": 1.125693917274475, "learning_rate": 0.0002, "epoch": 5.949748743718593, "step": 8880}, {"loss": 1.3116, "grad_norm": 0.9857436418533325, "learning_rate": 0.0002, "epoch": 5.956448911222781, "step": 8890}, {"loss": 1.2526, "grad_norm": 0.9377069473266602, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 8900}, {"loss": 1.3221, "grad_norm": 0.9493814706802368, "learning_rate": 0.0002, "epoch": 5.969849246231155, "step": 8910}, {"loss": 1.2516, "grad_norm": 0.8806208372116089, "learning_rate": 0.0002, "epoch": 5.976549413735343, "step": 8920}, {"loss": 1.2558, "grad_norm": 0.8727600574493408, "learning_rate": 0.0002, "epoch": 5.983249581239531, "step": 8930}, {"loss": 1.3538, "grad_norm": 0.9799810647964478, "learning_rate": 0.0002, "epoch": 5.989949748743719, "step": 8940}, {"loss": 1.3323, "grad_norm": 0.9866513609886169, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 8950}]} +{"epoch": 6.99966499162479, "step": 10447, "epoch_duration": 1589.0742285251617, "total_accumulated_duration": 11190.41557765007, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}, {"eval_loss": 1.8228833675384521, "eval_runtime": 37.9049, "eval_samples_per_second": 13.587, "eval_steps_per_second": 1.715, "epoch": 2.9996649916247904, "step": 4477}, {"loss": 1.6637, "grad_norm": 0.41169142723083496, "learning_rate": 0.0002, "epoch": 3.0016750418760467, "step": 4480}, {"loss": 1.5974, "grad_norm": 0.4865207374095917, "learning_rate": 0.0002, "epoch": 3.0083752093802345, "step": 4490}, {"loss": 1.5297, "grad_norm": 0.5462028384208679, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 4500}, {"loss": 1.5251, "grad_norm": 0.6169732809066772, "learning_rate": 0.0002, "epoch": 3.0217755443886096, "step": 4510}, {"loss": 1.5559, "grad_norm": 0.5667954087257385, "learning_rate": 0.0002, "epoch": 3.0284757118927974, "step": 4520}, {"loss": 1.5037, "grad_norm": 0.5758325457572937, "learning_rate": 0.0002, "epoch": 3.0351758793969847, "step": 4530}, {"loss": 1.4873, "grad_norm": 0.5220064520835876, "learning_rate": 0.0002, "epoch": 3.0418760469011725, "step": 4540}, {"loss": 1.5126, "grad_norm": 0.5469558835029602, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 4550}, {"loss": 1.4275, "grad_norm": 0.5680848956108093, "learning_rate": 0.0002, "epoch": 3.0552763819095476, "step": 4560}, {"loss": 1.5187, "grad_norm": 0.5906574726104736, "learning_rate": 0.0002, "epoch": 3.0619765494137354, "step": 4570}, {"loss": 1.4551, "grad_norm": 0.4725631773471832, "learning_rate": 0.0002, "epoch": 3.0686767169179228, "step": 4580}, {"loss": 1.5083, "grad_norm": 0.5273477435112, "learning_rate": 0.0002, "epoch": 3.0753768844221105, "step": 4590}, {"loss": 1.5154, "grad_norm": 0.5861203074455261, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 4600}, {"loss": 1.4924, "grad_norm": 0.5343965291976929, "learning_rate": 0.0002, "epoch": 3.0887772194304857, "step": 4610}, {"loss": 1.5608, "grad_norm": 0.5348150730133057, "learning_rate": 0.0002, "epoch": 3.0954773869346734, "step": 4620}, {"loss": 1.5399, "grad_norm": 0.5971846580505371, "learning_rate": 0.0002, "epoch": 3.102177554438861, "step": 4630}, {"loss": 1.4662, "grad_norm": 0.5203177332878113, "learning_rate": 0.0002, "epoch": 3.1088777219430486, "step": 4640}, {"loss": 1.5805, "grad_norm": 0.55289226770401, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 4650}, {"loss": 1.4745, "grad_norm": 0.6878530979156494, "learning_rate": 0.0002, "epoch": 3.1222780569514237, "step": 4660}, {"loss": 1.5335, "grad_norm": 0.6173256635665894, "learning_rate": 0.0002, "epoch": 3.1289782244556115, "step": 4670}, {"loss": 1.51, "grad_norm": 0.536796510219574, "learning_rate": 0.0002, "epoch": 3.135678391959799, "step": 4680}, {"loss": 1.4713, "grad_norm": 0.58846116065979, "learning_rate": 0.0002, "epoch": 3.1423785594639866, "step": 4690}, {"loss": 1.5114, "grad_norm": 0.645889401435852, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 4700}, {"loss": 1.4705, "grad_norm": 0.6118691563606262, "learning_rate": 0.0002, "epoch": 3.1557788944723617, "step": 4710}, {"loss": 1.5533, "grad_norm": 0.5189669132232666, "learning_rate": 0.0002, "epoch": 3.1624790619765495, "step": 4720}, {"loss": 1.4769, "grad_norm": 0.5794713497161865, "learning_rate": 0.0002, "epoch": 3.169179229480737, "step": 4730}, {"loss": 1.4849, "grad_norm": 0.6579326391220093, "learning_rate": 0.0002, "epoch": 3.1758793969849246, "step": 4740}, {"loss": 1.545, "grad_norm": 0.5822742581367493, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 4750}, {"loss": 1.4358, "grad_norm": 0.5475956201553345, "learning_rate": 0.0002, "epoch": 3.1892797319932997, "step": 4760}, {"loss": 1.4723, "grad_norm": 0.6743834018707275, "learning_rate": 0.0002, "epoch": 3.1959798994974875, "step": 4770}, {"loss": 1.5161, "grad_norm": 0.6110585927963257, "learning_rate": 0.0002, "epoch": 3.202680067001675, "step": 4780}, {"loss": 1.5455, "grad_norm": 0.5426181554794312, "learning_rate": 0.0002, "epoch": 3.2093802345058626, "step": 4790}, {"loss": 1.5315, "grad_norm": 0.6077824234962463, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 4800}, {"loss": 1.5314, "grad_norm": 0.5785858631134033, "learning_rate": 0.0002, "epoch": 3.2227805695142377, "step": 4810}, {"loss": 1.4041, "grad_norm": 0.6425958275794983, "learning_rate": 0.0002, "epoch": 3.2294807370184255, "step": 4820}, {"loss": 1.4751, "grad_norm": 0.6607080698013306, "learning_rate": 0.0002, "epoch": 3.236180904522613, "step": 4830}, {"loss": 1.5267, "grad_norm": 0.5385788679122925, "learning_rate": 0.0002, "epoch": 3.2428810720268006, "step": 4840}, {"loss": 1.4673, "grad_norm": 0.5630403757095337, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 4850}, {"loss": 1.5257, "grad_norm": 0.6340779662132263, "learning_rate": 0.0002, "epoch": 3.2562814070351758, "step": 4860}, {"loss": 1.5148, "grad_norm": 0.5305342674255371, "learning_rate": 0.0002, "epoch": 3.2629815745393635, "step": 4870}, {"loss": 1.5162, "grad_norm": 0.597670316696167, "learning_rate": 0.0002, "epoch": 3.2696817420435513, "step": 4880}, {"loss": 1.5429, "grad_norm": 0.665553867816925, "learning_rate": 0.0002, "epoch": 3.2763819095477387, "step": 4890}, {"loss": 1.4607, "grad_norm": 0.579767644405365, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 4900}, {"loss": 1.4999, "grad_norm": 0.5512481331825256, "learning_rate": 0.0002, "epoch": 3.289782244556114, "step": 4910}, {"loss": 1.5022, "grad_norm": 0.5916532278060913, "learning_rate": 0.0002, "epoch": 3.2964824120603016, "step": 4920}, {"loss": 1.4889, "grad_norm": 0.7521726489067078, "learning_rate": 0.0002, "epoch": 3.3031825795644894, "step": 4930}, {"loss": 1.4223, "grad_norm": 0.5352797508239746, "learning_rate": 0.0002, "epoch": 3.3098827470686767, "step": 4940}, {"loss": 1.5122, "grad_norm": 0.5950371623039246, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 4950}, {"loss": 1.5072, "grad_norm": 0.8020477890968323, "learning_rate": 0.0002, "epoch": 3.323283082077052, "step": 4960}, {"loss": 1.5422, "grad_norm": 0.6790024638175964, "learning_rate": 0.0002, "epoch": 3.3299832495812396, "step": 4970}, {"loss": 1.5363, "grad_norm": 0.687627375125885, "learning_rate": 0.0002, "epoch": 3.3366834170854274, "step": 4980}, {"loss": 1.5276, "grad_norm": 0.6094385385513306, "learning_rate": 0.0002, "epoch": 3.3433835845896147, "step": 4990}, {"loss": 1.549, "grad_norm": 0.6541242003440857, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 5000}, {"loss": 1.6067, "grad_norm": 0.5560880303382874, "learning_rate": 0.0002, "epoch": 3.35678391959799, "step": 5010}, {"loss": 1.5769, "grad_norm": 0.5440094470977783, "learning_rate": 0.0002, "epoch": 3.3634840871021776, "step": 5020}, {"loss": 1.6183, "grad_norm": 0.5749301314353943, "learning_rate": 0.0002, "epoch": 3.3701842546063654, "step": 5030}, {"loss": 1.4801, "grad_norm": 0.5919716954231262, "learning_rate": 0.0002, "epoch": 3.3768844221105527, "step": 5040}, {"loss": 1.5957, "grad_norm": 0.6331481337547302, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 5050}, {"loss": 1.5188, "grad_norm": 0.5687161684036255, "learning_rate": 0.0002, "epoch": 3.390284757118928, "step": 5060}, {"loss": 1.5702, "grad_norm": 0.6718577742576599, "learning_rate": 0.0002, "epoch": 3.3969849246231156, "step": 5070}, {"loss": 1.5577, "grad_norm": 0.5089324116706848, "learning_rate": 0.0002, "epoch": 3.4036850921273034, "step": 5080}, {"loss": 1.512, "grad_norm": 0.5710174441337585, "learning_rate": 0.0002, "epoch": 3.4103852596314908, "step": 5090}, {"loss": 1.5492, "grad_norm": 0.6670721173286438, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 5100}, {"loss": 1.5227, "grad_norm": 0.6875665187835693, "learning_rate": 0.0002, "epoch": 3.423785594639866, "step": 5110}, {"loss": 1.4496, "grad_norm": 0.5375880599021912, "learning_rate": 0.0002, "epoch": 3.4304857621440537, "step": 5120}, {"loss": 1.5527, "grad_norm": 0.6550399661064148, "learning_rate": 0.0002, "epoch": 3.4371859296482414, "step": 5130}, {"loss": 1.5687, "grad_norm": 0.5948067903518677, "learning_rate": 0.0002, "epoch": 3.4438860971524288, "step": 5140}, {"loss": 1.4813, "grad_norm": 0.6134477257728577, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 5150}, {"loss": 1.5069, "grad_norm": 0.6506398320198059, "learning_rate": 0.0002, "epoch": 3.457286432160804, "step": 5160}, {"loss": 1.4422, "grad_norm": 0.6060147881507874, "learning_rate": 0.0002, "epoch": 3.4639865996649917, "step": 5170}, {"loss": 1.5093, "grad_norm": 0.6173806190490723, "learning_rate": 0.0002, "epoch": 3.4706867671691795, "step": 5180}, {"loss": 1.4975, "grad_norm": 0.6032607555389404, "learning_rate": 0.0002, "epoch": 3.477386934673367, "step": 5190}, {"loss": 1.4979, "grad_norm": 0.5652492046356201, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 5200}, {"loss": 1.4883, "grad_norm": 0.6168607473373413, "learning_rate": 0.0002, "epoch": 3.490787269681742, "step": 5210}, {"loss": 1.5164, "grad_norm": 0.6170629262924194, "learning_rate": 0.0002, "epoch": 3.4974874371859297, "step": 5220}, {"loss": 1.4879, "grad_norm": 0.6926297545433044, "learning_rate": 0.0002, "epoch": 3.5041876046901175, "step": 5230}, {"loss": 1.4982, "grad_norm": 0.6702437996864319, "learning_rate": 0.0002, "epoch": 3.510887772194305, "step": 5240}, {"loss": 1.4986, "grad_norm": 0.5421436429023743, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 5250}, {"loss": 1.4673, "grad_norm": 0.5726765990257263, "learning_rate": 0.0002, "epoch": 3.52428810720268, "step": 5260}, {"loss": 1.5423, "grad_norm": 0.5685455203056335, "learning_rate": 0.0002, "epoch": 3.5309882747068677, "step": 5270}, {"loss": 1.4715, "grad_norm": 0.6018396019935608, "learning_rate": 0.0002, "epoch": 3.5376884422110555, "step": 5280}, {"loss": 1.5451, "grad_norm": 0.5731932520866394, "learning_rate": 0.0002, "epoch": 3.544388609715243, "step": 5290}, {"loss": 1.4752, "grad_norm": 0.6601519584655762, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 5300}, {"loss": 1.5434, "grad_norm": 0.5545530319213867, "learning_rate": 0.0002, "epoch": 3.557788944723618, "step": 5310}, {"loss": 1.5438, "grad_norm": 0.5998541116714478, "learning_rate": 0.0002, "epoch": 3.5644891122278057, "step": 5320}, {"loss": 1.56, "grad_norm": 0.5651767253875732, "learning_rate": 0.0002, "epoch": 3.5711892797319935, "step": 5330}, {"loss": 1.4829, "grad_norm": 0.7425084114074707, "learning_rate": 0.0002, "epoch": 3.577889447236181, "step": 5340}, {"loss": 1.5571, "grad_norm": 0.5770602226257324, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 5350}, {"loss": 1.458, "grad_norm": 0.54723060131073, "learning_rate": 0.0002, "epoch": 3.591289782244556, "step": 5360}, {"loss": 1.497, "grad_norm": 0.6658238172531128, "learning_rate": 0.0002, "epoch": 3.5979899497487438, "step": 5370}, {"loss": 1.5456, "grad_norm": 0.5787645578384399, "learning_rate": 0.0002, "epoch": 3.6046901172529315, "step": 5380}, {"loss": 1.5343, "grad_norm": 0.594913125038147, "learning_rate": 0.0002, "epoch": 3.611390284757119, "step": 5390}, {"loss": 1.4727, "grad_norm": 0.4964977502822876, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 5400}, {"loss": 1.5062, "grad_norm": 0.6087527275085449, "learning_rate": 0.0002, "epoch": 3.624790619765494, "step": 5410}, {"loss": 1.5098, "grad_norm": 0.6315323710441589, "learning_rate": 0.0002, "epoch": 3.6314907872696818, "step": 5420}, {"loss": 1.4855, "grad_norm": 0.574799120426178, "learning_rate": 0.0002, "epoch": 3.6381909547738696, "step": 5430}, {"loss": 1.4595, "grad_norm": 0.5949277877807617, "learning_rate": 0.0002, "epoch": 3.644891122278057, "step": 5440}, {"loss": 1.4816, "grad_norm": 0.5640677213668823, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 5450}, {"loss": 1.525, "grad_norm": 0.6198237538337708, "learning_rate": 0.0002, "epoch": 3.658291457286432, "step": 5460}, {"loss": 1.5676, "grad_norm": 0.6902034878730774, "learning_rate": 0.0002, "epoch": 3.66499162479062, "step": 5470}, {"loss": 1.544, "grad_norm": 0.5686674118041992, "learning_rate": 0.0002, "epoch": 3.6716917922948076, "step": 5480}, {"loss": 1.5255, "grad_norm": 0.6532107591629028, "learning_rate": 0.0002, "epoch": 3.678391959798995, "step": 5490}, {"loss": 1.5767, "grad_norm": 0.5790849924087524, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 5500}, {"loss": 1.5507, "grad_norm": 0.6055065393447876, "learning_rate": 0.0002, "epoch": 3.69179229480737, "step": 5510}, {"loss": 1.4656, "grad_norm": 0.5630605816841125, "learning_rate": 0.0002, "epoch": 3.698492462311558, "step": 5520}, {"loss": 1.537, "grad_norm": 0.6005825996398926, "learning_rate": 0.0002, "epoch": 3.7051926298157456, "step": 5530}, {"loss": 1.5313, "grad_norm": 0.6553038954734802, "learning_rate": 0.0002, "epoch": 3.711892797319933, "step": 5540}, {"loss": 1.4943, "grad_norm": 0.5601094961166382, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 5550}, {"loss": 1.5161, "grad_norm": 0.6598808169364929, "learning_rate": 0.0002, "epoch": 3.725293132328308, "step": 5560}, {"loss": 1.5345, "grad_norm": 0.5506255626678467, "learning_rate": 0.0002, "epoch": 3.731993299832496, "step": 5570}, {"loss": 1.4805, "grad_norm": 0.6001223921775818, "learning_rate": 0.0002, "epoch": 3.7386934673366836, "step": 5580}, {"loss": 1.4652, "grad_norm": 0.6287297606468201, "learning_rate": 0.0002, "epoch": 3.745393634840871, "step": 5590}, {"loss": 1.5246, "grad_norm": 0.6253238916397095, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 5600}, {"loss": 1.5691, "grad_norm": 0.5713174939155579, "learning_rate": 0.0002, "epoch": 3.758793969849246, "step": 5610}, {"loss": 1.5661, "grad_norm": 0.6198310852050781, "learning_rate": 0.0002, "epoch": 3.765494137353434, "step": 5620}, {"loss": 1.5448, "grad_norm": 0.5941224098205566, "learning_rate": 0.0002, "epoch": 3.7721943048576216, "step": 5630}, {"loss": 1.4925, "grad_norm": 0.606002151966095, "learning_rate": 0.0002, "epoch": 3.778894472361809, "step": 5640}, {"loss": 1.5182, "grad_norm": 0.6540704965591431, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 5650}, {"loss": 1.5903, "grad_norm": 0.6147415041923523, "learning_rate": 0.0002, "epoch": 3.792294807370184, "step": 5660}, {"loss": 1.5329, "grad_norm": 0.5649605393409729, "learning_rate": 0.0002, "epoch": 3.798994974874372, "step": 5670}, {"loss": 1.5747, "grad_norm": 0.6788773536682129, "learning_rate": 0.0002, "epoch": 3.8056951423785597, "step": 5680}, {"loss": 1.535, "grad_norm": 0.6581860780715942, "learning_rate": 0.0002, "epoch": 3.812395309882747, "step": 5690}, {"loss": 1.4587, "grad_norm": 0.5529348850250244, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 5700}, {"loss": 1.5594, "grad_norm": 0.6320232152938843, "learning_rate": 0.0002, "epoch": 3.825795644891122, "step": 5710}, {"loss": 1.4696, "grad_norm": 0.6529698371887207, "learning_rate": 0.0002, "epoch": 3.83249581239531, "step": 5720}, {"loss": 1.5854, "grad_norm": 0.5983362793922424, "learning_rate": 0.0002, "epoch": 3.8391959798994977, "step": 5730}, {"loss": 1.465, "grad_norm": 0.6335684061050415, "learning_rate": 0.0002, "epoch": 3.845896147403685, "step": 5740}, {"loss": 1.5545, "grad_norm": 0.700446605682373, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 5750}, {"loss": 1.5707, "grad_norm": 0.6092597842216492, "learning_rate": 0.0002, "epoch": 3.85929648241206, "step": 5760}, {"loss": 1.5729, "grad_norm": 0.564146101474762, "learning_rate": 0.0002, "epoch": 3.865996649916248, "step": 5770}, {"loss": 1.5872, "grad_norm": 0.615275502204895, "learning_rate": 0.0002, "epoch": 3.8726968174204357, "step": 5780}, {"loss": 1.5142, "grad_norm": 0.6685376763343811, "learning_rate": 0.0002, "epoch": 3.879396984924623, "step": 5790}, {"loss": 1.4752, "grad_norm": 0.6116922497749329, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 5800}, {"loss": 1.5179, "grad_norm": 0.5486813187599182, "learning_rate": 0.0002, "epoch": 3.892797319932998, "step": 5810}, {"loss": 1.5167, "grad_norm": 0.6208204030990601, "learning_rate": 0.0002, "epoch": 3.899497487437186, "step": 5820}, {"loss": 1.5334, "grad_norm": 0.6500625014305115, "learning_rate": 0.0002, "epoch": 3.9061976549413737, "step": 5830}, {"loss": 1.4716, "grad_norm": 0.5948089361190796, "learning_rate": 0.0002, "epoch": 3.912897822445561, "step": 5840}, {"loss": 1.6011, "grad_norm": 0.7210732698440552, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 5850}, {"loss": 1.5519, "grad_norm": 0.6662322878837585, "learning_rate": 0.0002, "epoch": 3.926298157453936, "step": 5860}, {"loss": 1.5656, "grad_norm": 0.5613839626312256, "learning_rate": 0.0002, "epoch": 3.932998324958124, "step": 5870}, {"loss": 1.544, "grad_norm": 0.6069002151489258, "learning_rate": 0.0002, "epoch": 3.9396984924623117, "step": 5880}, {"loss": 1.6745, "grad_norm": 0.7075562477111816, "learning_rate": 0.0002, "epoch": 3.946398659966499, "step": 5890}, {"loss": 1.5391, "grad_norm": 0.6316173076629639, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 5900}, {"loss": 1.6314, "grad_norm": 0.5716308355331421, "learning_rate": 0.0002, "epoch": 3.959798994974874, "step": 5910}, {"loss": 1.5947, "grad_norm": 0.6800096035003662, "learning_rate": 0.0002, "epoch": 3.966499162479062, "step": 5920}, {"loss": 1.5189, "grad_norm": 0.6057983040809631, "learning_rate": 0.0002, "epoch": 3.9731993299832498, "step": 5930}, {"loss": 1.5431, "grad_norm": 0.5938987731933594, "learning_rate": 0.0002, "epoch": 3.979899497487437, "step": 5940}, {"loss": 1.5111, "grad_norm": 0.6963576674461365, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 5950}, {"loss": 1.5521, "grad_norm": 0.6279940009117126, "learning_rate": 0.0002, "epoch": 3.993299832495812, "step": 5960}, {"loss": 1.5974, "grad_norm": 0.7161159515380859, "learning_rate": 0.0002, "epoch": 4.0, "step": 5970}, {"eval_loss": 1.8655421733856201, "eval_runtime": 37.9276, "eval_samples_per_second": 13.579, "eval_steps_per_second": 1.714, "epoch": 4.0, "step": 5970}, {"loss": 1.3666, "grad_norm": 0.7380476593971252, "learning_rate": 0.0002, "epoch": 4.006700167504188, "step": 5980}, {"loss": 1.3913, "grad_norm": 0.7148947715759277, "learning_rate": 0.0002, "epoch": 4.013400335008376, "step": 5990}, {"loss": 1.4204, "grad_norm": 0.6177082657814026, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 6000}, {"loss": 1.4421, "grad_norm": 0.8552946448326111, "learning_rate": 0.0002, "epoch": 4.02680067001675, "step": 6010}, {"loss": 1.4342, "grad_norm": 0.8033416271209717, "learning_rate": 0.0002, "epoch": 4.033500837520938, "step": 6020}, {"loss": 1.4092, "grad_norm": 0.8501318097114563, "learning_rate": 0.0002, "epoch": 4.040201005025126, "step": 6030}, {"loss": 1.3367, "grad_norm": 0.6981393098831177, "learning_rate": 0.0002, "epoch": 4.046901172529314, "step": 6040}, {"loss": 1.3925, "grad_norm": 0.7227180600166321, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 6050}, {"loss": 1.4007, "grad_norm": 0.6923989653587341, "learning_rate": 0.0002, "epoch": 4.060301507537688, "step": 6060}, {"loss": 1.3837, "grad_norm": 0.879779040813446, "learning_rate": 0.0002, "epoch": 4.067001675041876, "step": 6070}, {"loss": 1.4383, "grad_norm": 0.8184754848480225, "learning_rate": 0.0002, "epoch": 4.073701842546064, "step": 6080}, {"loss": 1.3128, "grad_norm": 0.8211342692375183, "learning_rate": 0.0002, "epoch": 4.080402010050252, "step": 6090}, {"loss": 1.3892, "grad_norm": 0.7542396783828735, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 6100}, {"loss": 1.3607, "grad_norm": 0.6631066799163818, "learning_rate": 0.0002, "epoch": 4.093802345058626, "step": 6110}, {"loss": 1.3275, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.100502512562814, "step": 6120}, {"loss": 1.3443, "grad_norm": 0.681851863861084, "learning_rate": 0.0002, "epoch": 4.107202680067002, "step": 6130}, {"loss": 1.3486, "grad_norm": 0.8757794499397278, "learning_rate": 0.0002, "epoch": 4.11390284757119, "step": 6140}, {"loss": 1.351, "grad_norm": 0.6567301750183105, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 6150}, {"loss": 1.3824, "grad_norm": 0.7950329184532166, "learning_rate": 0.0002, "epoch": 4.127303182579564, "step": 6160}, {"loss": 1.3738, "grad_norm": 0.7545644044876099, "learning_rate": 0.0002, "epoch": 4.134003350083752, "step": 6170}, {"loss": 1.4214, "grad_norm": 0.7172710299491882, "learning_rate": 0.0002, "epoch": 4.14070351758794, "step": 6180}, {"loss": 1.4091, "grad_norm": 0.7040584087371826, "learning_rate": 0.0002, "epoch": 4.147403685092128, "step": 6190}, {"loss": 1.4149, "grad_norm": 0.7482913732528687, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 6200}, {"loss": 1.3227, "grad_norm": 0.8523276448249817, "learning_rate": 0.0002, "epoch": 4.160804020100502, "step": 6210}, {"loss": 1.4194, "grad_norm": 0.6672041416168213, "learning_rate": 0.0002, "epoch": 4.16750418760469, "step": 6220}, {"loss": 1.3953, "grad_norm": 0.7523500919342041, "learning_rate": 0.0002, "epoch": 4.174204355108878, "step": 6230}, {"loss": 1.371, "grad_norm": 0.8085253834724426, "learning_rate": 0.0002, "epoch": 4.180904522613066, "step": 6240}, {"loss": 1.3293, "grad_norm": 0.789450466632843, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 6250}, {"loss": 1.3539, "grad_norm": 0.7502310872077942, "learning_rate": 0.0002, "epoch": 4.19430485762144, "step": 6260}, {"loss": 1.3415, "grad_norm": 0.7397456765174866, "learning_rate": 0.0002, "epoch": 4.201005025125628, "step": 6270}, {"loss": 1.3963, "grad_norm": 0.6921947002410889, "learning_rate": 0.0002, "epoch": 4.207705192629816, "step": 6280}, {"loss": 1.3125, "grad_norm": 0.9334571957588196, "learning_rate": 0.0002, "epoch": 4.214405360134004, "step": 6290}, {"loss": 1.3612, "grad_norm": 0.725799024105072, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 6300}, {"loss": 1.4217, "grad_norm": 0.8290495872497559, "learning_rate": 0.0002, "epoch": 4.227805695142378, "step": 6310}, {"loss": 1.4135, "grad_norm": 0.688983678817749, "learning_rate": 0.0002, "epoch": 4.234505862646566, "step": 6320}, {"loss": 1.3807, "grad_norm": 0.8620913028717041, "learning_rate": 0.0002, "epoch": 4.241206030150754, "step": 6330}, {"loss": 1.3738, "grad_norm": 0.8008657693862915, "learning_rate": 0.0002, "epoch": 4.247906197654942, "step": 6340}, {"loss": 1.4005, "grad_norm": 0.7379199266433716, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 6350}, {"loss": 1.426, "grad_norm": 0.7842815518379211, "learning_rate": 0.0002, "epoch": 4.261306532663316, "step": 6360}, {"loss": 1.4262, "grad_norm": 0.812600314617157, "learning_rate": 0.0002, "epoch": 4.268006700167504, "step": 6370}, {"loss": 1.4028, "grad_norm": 0.7852841019630432, "learning_rate": 0.0002, "epoch": 4.274706867671692, "step": 6380}, {"loss": 1.3722, "grad_norm": 1.0377534627914429, "learning_rate": 0.0002, "epoch": 4.28140703517588, "step": 6390}, {"loss": 1.3755, "grad_norm": 1.03935706615448, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 6400}, {"loss": 1.3961, "grad_norm": 0.7244732975959778, "learning_rate": 0.0002, "epoch": 4.294807370184254, "step": 6410}, {"loss": 1.4608, "grad_norm": 0.7137406468391418, "learning_rate": 0.0002, "epoch": 4.301507537688442, "step": 6420}, {"loss": 1.4461, "grad_norm": 0.7492543458938599, "learning_rate": 0.0002, "epoch": 4.30820770519263, "step": 6430}, {"loss": 1.4562, "grad_norm": 0.7065439224243164, "learning_rate": 0.0002, "epoch": 4.314907872696818, "step": 6440}, {"loss": 1.4246, "grad_norm": 0.7786989808082581, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 6450}, {"loss": 1.3098, "grad_norm": 0.7369208335876465, "learning_rate": 0.0002, "epoch": 4.328308207705192, "step": 6460}, {"loss": 1.3686, "grad_norm": 0.7412346005439758, "learning_rate": 0.0002, "epoch": 4.33500837520938, "step": 6470}, {"loss": 1.4087, "grad_norm": 0.780927300453186, "learning_rate": 0.0002, "epoch": 4.341708542713568, "step": 6480}, {"loss": 1.3628, "grad_norm": 0.8320930600166321, "learning_rate": 0.0002, "epoch": 4.348408710217756, "step": 6490}, {"loss": 1.3715, "grad_norm": 0.6871094703674316, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 6500}, {"loss": 1.3257, "grad_norm": 0.6751559972763062, "learning_rate": 0.0002, "epoch": 4.36180904522613, "step": 6510}, {"loss": 1.4311, "grad_norm": 0.7723976969718933, "learning_rate": 0.0002, "epoch": 4.368509212730318, "step": 6520}, {"loss": 1.4086, "grad_norm": 0.7915401458740234, "learning_rate": 0.0002, "epoch": 4.375209380234506, "step": 6530}, {"loss": 1.3973, "grad_norm": 0.7329102754592896, "learning_rate": 0.0002, "epoch": 4.381909547738694, "step": 6540}, {"loss": 1.447, "grad_norm": 0.7388760447502136, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 6550}, {"loss": 1.4378, "grad_norm": 0.8282579183578491, "learning_rate": 0.0002, "epoch": 4.3953098827470685, "step": 6560}, {"loss": 1.3923, "grad_norm": 0.7192724347114563, "learning_rate": 0.0002, "epoch": 4.402010050251256, "step": 6570}, {"loss": 1.4141, "grad_norm": 0.746526837348938, "learning_rate": 0.0002, "epoch": 4.408710217755444, "step": 6580}, {"loss": 1.33, "grad_norm": 0.8738046288490295, "learning_rate": 0.0002, "epoch": 4.415410385259632, "step": 6590}, {"loss": 1.3995, "grad_norm": 0.8408458828926086, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 6600}, {"loss": 1.4148, "grad_norm": 0.8110666275024414, "learning_rate": 0.0002, "epoch": 4.4288107202680065, "step": 6610}, {"loss": 1.441, "grad_norm": 0.8602406978607178, "learning_rate": 0.0002, "epoch": 4.435510887772194, "step": 6620}, {"loss": 1.4319, "grad_norm": 0.7549102902412415, "learning_rate": 0.0002, "epoch": 4.442211055276382, "step": 6630}, {"loss": 1.388, "grad_norm": 0.7831804156303406, "learning_rate": 0.0002, "epoch": 4.44891122278057, "step": 6640}, {"loss": 1.4283, "grad_norm": 0.7269673943519592, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 6650}, {"loss": 1.4132, "grad_norm": 0.7397838830947876, "learning_rate": 0.0002, "epoch": 4.4623115577889445, "step": 6660}, {"loss": 1.3174, "grad_norm": 0.713707447052002, "learning_rate": 0.0002, "epoch": 4.469011725293132, "step": 6670}, {"loss": 1.3406, "grad_norm": 0.7525581121444702, "learning_rate": 0.0002, "epoch": 4.47571189279732, "step": 6680}, {"loss": 1.4283, "grad_norm": 0.8030191659927368, "learning_rate": 0.0002, "epoch": 4.482412060301508, "step": 6690}, {"loss": 1.4586, "grad_norm": 0.7469439506530762, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 6700}, {"loss": 1.367, "grad_norm": 0.7743868231773376, "learning_rate": 0.0002, "epoch": 4.4958123953098825, "step": 6710}, {"loss": 1.3439, "grad_norm": 0.6539737582206726, "learning_rate": 0.0002, "epoch": 4.50251256281407, "step": 6720}, {"loss": 1.4513, "grad_norm": 0.825818657875061, "learning_rate": 0.0002, "epoch": 4.509212730318258, "step": 6730}, {"loss": 1.3984, "grad_norm": 0.8048575520515442, "learning_rate": 0.0002, "epoch": 4.515912897822446, "step": 6740}, {"loss": 1.3923, "grad_norm": 0.7828766107559204, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 6750}, {"loss": 1.3886, "grad_norm": 0.7406010031700134, "learning_rate": 0.0002, "epoch": 4.5293132328308205, "step": 6760}, {"loss": 1.3109, "grad_norm": 0.840345561504364, "learning_rate": 0.0002, "epoch": 4.536013400335008, "step": 6770}, {"loss": 1.4808, "grad_norm": 0.8492622971534729, "learning_rate": 0.0002, "epoch": 4.542713567839196, "step": 6780}, {"loss": 1.4384, "grad_norm": 0.7130163908004761, "learning_rate": 0.0002, "epoch": 4.549413735343384, "step": 6790}, {"loss": 1.4531, "grad_norm": 0.8454728126525879, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 6800}, {"loss": 1.3239, "grad_norm": 0.7847645282745361, "learning_rate": 0.0002, "epoch": 4.562814070351759, "step": 6810}, {"loss": 1.4181, "grad_norm": 0.7245864272117615, "learning_rate": 0.0002, "epoch": 4.569514237855946, "step": 6820}, {"loss": 1.3233, "grad_norm": 0.768893301486969, "learning_rate": 0.0002, "epoch": 4.576214405360134, "step": 6830}, {"loss": 1.3932, "grad_norm": 0.8028400540351868, "learning_rate": 0.0002, "epoch": 4.582914572864322, "step": 6840}, {"loss": 1.3745, "grad_norm": 0.763945460319519, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 6850}, {"loss": 1.4797, "grad_norm": 0.7417685389518738, "learning_rate": 0.0002, "epoch": 4.596314907872697, "step": 6860}, {"loss": 1.4468, "grad_norm": 0.7603038549423218, "learning_rate": 0.0002, "epoch": 4.603015075376884, "step": 6870}, {"loss": 1.4095, "grad_norm": 0.7981528043746948, "learning_rate": 0.0002, "epoch": 4.609715242881072, "step": 6880}, {"loss": 1.3963, "grad_norm": 0.8077111840248108, "learning_rate": 0.0002, "epoch": 4.61641541038526, "step": 6890}, {"loss": 1.4721, "grad_norm": 0.8778454065322876, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 6900}, {"loss": 1.3758, "grad_norm": 0.8620710372924805, "learning_rate": 0.0002, "epoch": 4.629815745393635, "step": 6910}, {"loss": 1.344, "grad_norm": 0.7486072778701782, "learning_rate": 0.0002, "epoch": 4.636515912897822, "step": 6920}, {"loss": 1.3913, "grad_norm": 0.7493042945861816, "learning_rate": 0.0002, "epoch": 4.64321608040201, "step": 6930}, {"loss": 1.397, "grad_norm": 0.7388978600502014, "learning_rate": 0.0002, "epoch": 4.649916247906198, "step": 6940}, {"loss": 1.3593, "grad_norm": 0.798530638217926, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 6950}, {"loss": 1.3982, "grad_norm": 0.7929500937461853, "learning_rate": 0.0002, "epoch": 4.663316582914573, "step": 6960}, {"loss": 1.4183, "grad_norm": 0.9186785221099854, "learning_rate": 0.0002, "epoch": 4.67001675041876, "step": 6970}, {"loss": 1.3955, "grad_norm": 1.1103485822677612, "learning_rate": 0.0002, "epoch": 4.676716917922948, "step": 6980}, {"loss": 1.3941, "grad_norm": 0.8000466823577881, "learning_rate": 0.0002, "epoch": 4.683417085427136, "step": 6990}, {"loss": 1.371, "grad_norm": 0.7520599961280823, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 7000}, {"loss": 1.4582, "grad_norm": 0.7971973419189453, "learning_rate": 0.0002, "epoch": 4.696817420435511, "step": 7010}, {"loss": 1.3682, "grad_norm": 0.7363343834877014, "learning_rate": 0.0002, "epoch": 4.703517587939698, "step": 7020}, {"loss": 1.3889, "grad_norm": 0.8268865942955017, "learning_rate": 0.0002, "epoch": 4.710217755443886, "step": 7030}, {"loss": 1.4382, "grad_norm": 0.7054963111877441, "learning_rate": 0.0002, "epoch": 4.716917922948074, "step": 7040}, {"loss": 1.4578, "grad_norm": 0.8196262121200562, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 7050}, {"loss": 1.365, "grad_norm": 0.8276031017303467, "learning_rate": 0.0002, "epoch": 4.730318257956449, "step": 7060}, {"loss": 1.3887, "grad_norm": 0.8248157501220703, "learning_rate": 0.0002, "epoch": 4.7370184254606365, "step": 7070}, {"loss": 1.4193, "grad_norm": 0.8937979936599731, "learning_rate": 0.0002, "epoch": 4.743718592964824, "step": 7080}, {"loss": 1.4334, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 4.750418760469012, "step": 7090}, {"loss": 1.4385, "grad_norm": 0.9495313763618469, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 7100}, {"loss": 1.4504, "grad_norm": 0.8598204255104065, "learning_rate": 0.0002, "epoch": 4.763819095477387, "step": 7110}, {"loss": 1.3969, "grad_norm": 0.8951472640037537, "learning_rate": 0.0002, "epoch": 4.7705192629815745, "step": 7120}, {"loss": 1.4339, "grad_norm": 0.9110309481620789, "learning_rate": 0.0002, "epoch": 4.777219430485762, "step": 7130}, {"loss": 1.4001, "grad_norm": 0.7929584980010986, "learning_rate": 0.0002, "epoch": 4.78391959798995, "step": 7140}, {"loss": 1.467, "grad_norm": 0.7415322661399841, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 7150}, {"loss": 1.5107, "grad_norm": 0.7504757046699524, "learning_rate": 0.0002, "epoch": 4.797319932998325, "step": 7160}, {"loss": 1.3736, "grad_norm": 0.7166924476623535, "learning_rate": 0.0002, "epoch": 4.8040201005025125, "step": 7170}, {"loss": 1.4088, "grad_norm": 0.7728400826454163, "learning_rate": 0.0002, "epoch": 4.8107202680067, "step": 7180}, {"loss": 1.3814, "grad_norm": 0.7992154955863953, "learning_rate": 0.0002, "epoch": 4.817420435510888, "step": 7190}, {"loss": 1.3958, "grad_norm": 0.8655321002006531, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 7200}, {"loss": 1.3837, "grad_norm": 0.7672632336616516, "learning_rate": 0.0002, "epoch": 4.830820770519263, "step": 7210}, {"loss": 1.4578, "grad_norm": 0.708416223526001, "learning_rate": 0.0002, "epoch": 4.8375209380234505, "step": 7220}, {"loss": 1.5413, "grad_norm": 0.8914081454277039, "learning_rate": 0.0002, "epoch": 4.844221105527638, "step": 7230}, {"loss": 1.3569, "grad_norm": 0.7141931653022766, "learning_rate": 0.0002, "epoch": 4.850921273031826, "step": 7240}, {"loss": 1.4532, "grad_norm": 0.6913040280342102, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 7250}, {"loss": 1.3912, "grad_norm": 0.7871233820915222, "learning_rate": 0.0002, "epoch": 4.864321608040201, "step": 7260}, {"loss": 1.3688, "grad_norm": 0.8466277122497559, "learning_rate": 0.0002, "epoch": 4.8710217755443885, "step": 7270}, {"loss": 1.33, "grad_norm": 0.8492183685302734, "learning_rate": 0.0002, "epoch": 4.877721943048576, "step": 7280}, {"loss": 1.3744, "grad_norm": 0.8339574933052063, "learning_rate": 0.0002, "epoch": 4.884422110552764, "step": 7290}, {"loss": 1.4157, "grad_norm": 0.787022590637207, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 7300}, {"loss": 1.3725, "grad_norm": 0.8877332806587219, "learning_rate": 0.0002, "epoch": 4.897822445561139, "step": 7310}, {"loss": 1.3968, "grad_norm": 0.744989812374115, "learning_rate": 0.0002, "epoch": 4.9045226130653266, "step": 7320}, {"loss": 1.4421, "grad_norm": 0.8027268648147583, "learning_rate": 0.0002, "epoch": 4.911222780569514, "step": 7330}, {"loss": 1.425, "grad_norm": 0.6437455415725708, "learning_rate": 0.0002, "epoch": 4.917922948073702, "step": 7340}, {"loss": 1.4829, "grad_norm": 0.685999870300293, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 7350}, {"loss": 1.4352, "grad_norm": 0.9086187481880188, "learning_rate": 0.0002, "epoch": 4.931323283082077, "step": 7360}, {"loss": 1.4245, "grad_norm": 0.8272411227226257, "learning_rate": 0.0002, "epoch": 4.938023450586265, "step": 7370}, {"loss": 1.4226, "grad_norm": 0.9227852821350098, "learning_rate": 0.0002, "epoch": 4.944723618090452, "step": 7380}, {"loss": 1.3643, "grad_norm": 0.7688441276550293, "learning_rate": 0.0002, "epoch": 4.95142378559464, "step": 7390}, {"loss": 1.4491, "grad_norm": 0.8662643432617188, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 7400}, {"loss": 1.4194, "grad_norm": 0.9234127998352051, "learning_rate": 0.0002, "epoch": 4.964824120603015, "step": 7410}, {"loss": 1.4009, "grad_norm": 0.9131470918655396, "learning_rate": 0.0002, "epoch": 4.971524288107203, "step": 7420}, {"loss": 1.4544, "grad_norm": 0.7377504110336304, "learning_rate": 0.0002, "epoch": 4.97822445561139, "step": 7430}, {"loss": 1.4008, "grad_norm": 0.8762801289558411, "learning_rate": 0.0002, "epoch": 4.984924623115578, "step": 7440}, {"loss": 1.4304, "grad_norm": 0.7919872999191284, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 7450}, {"loss": 1.3817, "grad_norm": 0.7144299149513245, "learning_rate": 0.0002, "epoch": 4.998324958123953, "step": 7460}, {"eval_loss": 1.9291157722473145, "eval_runtime": 37.9831, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.99966499162479, "step": 7462}, {"loss": 1.2753, "grad_norm": 0.7860151529312134, "learning_rate": 0.0002, "epoch": 5.005025125628141, "step": 7470}, {"loss": 1.2149, "grad_norm": 0.9418314695358276, "learning_rate": 0.0002, "epoch": 5.011725293132328, "step": 7480}, {"loss": 1.1966, "grad_norm": 0.8474572896957397, "learning_rate": 0.0002, "epoch": 5.018425460636516, "step": 7490}, {"loss": 1.2111, "grad_norm": 1.0724040269851685, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 7500}, {"loss": 1.2228, "grad_norm": 0.9109148979187012, "learning_rate": 0.0002, "epoch": 5.031825795644891, "step": 7510}, {"loss": 1.2239, "grad_norm": 1.0088659524917603, "learning_rate": 0.0002, "epoch": 5.038525963149079, "step": 7520}, {"loss": 1.2156, "grad_norm": 1.1421623229980469, "learning_rate": 0.0002, "epoch": 5.045226130653266, "step": 7530}, {"loss": 1.1739, "grad_norm": 0.9219902157783508, "learning_rate": 0.0002, "epoch": 5.051926298157454, "step": 7540}, {"loss": 1.2686, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 7550}, {"loss": 1.2068, "grad_norm": 0.8889328241348267, "learning_rate": 0.0002, "epoch": 5.065326633165829, "step": 7560}, {"loss": 1.276, "grad_norm": 0.9751363396644592, "learning_rate": 0.0002, "epoch": 5.072026800670017, "step": 7570}, {"loss": 1.2078, "grad_norm": 0.8603123426437378, "learning_rate": 0.0002, "epoch": 5.078726968174204, "step": 7580}, {"loss": 1.2175, "grad_norm": 0.8910616636276245, "learning_rate": 0.0002, "epoch": 5.085427135678392, "step": 7590}, {"loss": 1.2475, "grad_norm": 1.1128392219543457, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 7600}, {"loss": 1.3065, "grad_norm": 0.9480258822441101, "learning_rate": 0.0002, "epoch": 5.098827470686767, "step": 7610}, {"loss": 1.193, "grad_norm": 0.906958818435669, "learning_rate": 0.0002, "epoch": 5.105527638190955, "step": 7620}, {"loss": 1.2223, "grad_norm": 0.8741167187690735, "learning_rate": 0.0002, "epoch": 5.1122278056951425, "step": 7630}, {"loss": 1.2126, "grad_norm": 0.966268002986908, "learning_rate": 0.0002, "epoch": 5.11892797319933, "step": 7640}, {"loss": 1.2782, "grad_norm": 0.9124358892440796, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 7650}, {"loss": 1.3004, "grad_norm": 1.0436606407165527, "learning_rate": 0.0002, "epoch": 5.132328308207705, "step": 7660}, {"loss": 1.2675, "grad_norm": 0.9217309355735779, "learning_rate": 0.0002, "epoch": 5.139028475711893, "step": 7670}, {"loss": 1.2502, "grad_norm": 1.344765543937683, "learning_rate": 0.0002, "epoch": 5.1457286432160805, "step": 7680}, {"loss": 1.2416, "grad_norm": 1.0730723142623901, "learning_rate": 0.0002, "epoch": 5.152428810720268, "step": 7690}, {"loss": 1.1888, "grad_norm": 0.9321247339248657, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 7700}, {"loss": 1.1941, "grad_norm": 0.8482614755630493, "learning_rate": 0.0002, "epoch": 5.165829145728643, "step": 7710}, {"loss": 1.2668, "grad_norm": 0.8274452686309814, "learning_rate": 0.0002, "epoch": 5.172529313232831, "step": 7720}, {"loss": 1.1972, "grad_norm": 0.9120376706123352, "learning_rate": 0.0002, "epoch": 5.1792294807370185, "step": 7730}, {"loss": 1.1648, "grad_norm": 1.0062892436981201, "learning_rate": 0.0002, "epoch": 5.185929648241206, "step": 7740}, {"loss": 1.2199, "grad_norm": 0.9521504640579224, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 7750}, {"loss": 1.2855, "grad_norm": 0.8800198435783386, "learning_rate": 0.0002, "epoch": 5.199329983249581, "step": 7760}, {"loss": 1.2535, "grad_norm": 0.9749179482460022, "learning_rate": 0.0002, "epoch": 5.206030150753769, "step": 7770}, {"loss": 1.2975, "grad_norm": 0.9441686868667603, "learning_rate": 0.0002, "epoch": 5.2127303182579565, "step": 7780}, {"loss": 1.256, "grad_norm": 0.9114066362380981, "learning_rate": 0.0002, "epoch": 5.219430485762144, "step": 7790}, {"loss": 1.2621, "grad_norm": 0.9851446151733398, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 7800}, {"loss": 1.2502, "grad_norm": 0.9526297450065613, "learning_rate": 0.0002, "epoch": 5.232830820770519, "step": 7810}, {"loss": 1.1502, "grad_norm": 1.05986487865448, "learning_rate": 0.0002, "epoch": 5.239530988274707, "step": 7820}, {"loss": 1.2517, "grad_norm": 0.8956538438796997, "learning_rate": 0.0002, "epoch": 5.2462311557788945, "step": 7830}, {"loss": 1.2556, "grad_norm": 0.9568153619766235, "learning_rate": 0.0002, "epoch": 5.252931323283082, "step": 7840}, {"loss": 1.2442, "grad_norm": 1.0035018920898438, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 7850}, {"loss": 1.2605, "grad_norm": 0.8554368615150452, "learning_rate": 0.0002, "epoch": 5.266331658291457, "step": 7860}, {"loss": 1.2799, "grad_norm": 0.9677708148956299, "learning_rate": 0.0002, "epoch": 5.273031825795645, "step": 7870}, {"loss": 1.275, "grad_norm": 0.943606436252594, "learning_rate": 0.0002, "epoch": 5.279731993299833, "step": 7880}, {"loss": 1.2335, "grad_norm": 1.0029335021972656, "learning_rate": 0.0002, "epoch": 5.28643216080402, "step": 7890}, {"loss": 1.2494, "grad_norm": 1.0164015293121338, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 7900}, {"loss": 1.3117, "grad_norm": 0.8908365368843079, "learning_rate": 0.0002, "epoch": 5.299832495812395, "step": 7910}, {"loss": 1.2832, "grad_norm": 0.9307826161384583, "learning_rate": 0.0002, "epoch": 5.306532663316583, "step": 7920}, {"loss": 1.242, "grad_norm": 1.0730371475219727, "learning_rate": 0.0002, "epoch": 5.313232830820771, "step": 7930}, {"loss": 1.2003, "grad_norm": 0.844739556312561, "learning_rate": 0.0002, "epoch": 5.319932998324958, "step": 7940}, {"loss": 1.2688, "grad_norm": 1.275833010673523, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 7950}, {"loss": 1.2957, "grad_norm": 0.9042661190032959, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 7960}, {"loss": 1.2912, "grad_norm": 0.9374269247055054, "learning_rate": 0.0002, "epoch": 5.340033500837521, "step": 7970}, {"loss": 1.2721, "grad_norm": 1.033098578453064, "learning_rate": 0.0002, "epoch": 5.346733668341709, "step": 7980}, {"loss": 1.3208, "grad_norm": 1.062775731086731, "learning_rate": 0.0002, "epoch": 5.353433835845896, "step": 7990}, {"loss": 1.3065, "grad_norm": 1.1064317226409912, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 8000}, {"loss": 1.2341, "grad_norm": 1.1114039421081543, "learning_rate": 0.0002, "epoch": 5.366834170854271, "step": 8010}, {"loss": 1.2255, "grad_norm": 1.0198014974594116, "learning_rate": 0.0002, "epoch": 5.373534338358459, "step": 8020}, {"loss": 1.2433, "grad_norm": 0.8443173170089722, "learning_rate": 0.0002, "epoch": 5.380234505862647, "step": 8030}, {"loss": 1.206, "grad_norm": 1.000881314277649, "learning_rate": 0.0002, "epoch": 5.386934673366834, "step": 8040}, {"loss": 1.2982, "grad_norm": 0.9874443411827087, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 8050}, {"loss": 1.2289, "grad_norm": 0.9895344972610474, "learning_rate": 0.0002, "epoch": 5.400335008375209, "step": 8060}, {"loss": 1.249, "grad_norm": 0.8595236539840698, "learning_rate": 0.0002, "epoch": 5.407035175879397, "step": 8070}, {"loss": 1.2308, "grad_norm": 0.9523849487304688, "learning_rate": 0.0002, "epoch": 5.413735343383585, "step": 8080}, {"loss": 1.2343, "grad_norm": 1.0560476779937744, "learning_rate": 0.0002, "epoch": 5.420435510887772, "step": 8090}, {"loss": 1.2956, "grad_norm": 1.0893689393997192, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 8100}, {"loss": 1.2846, "grad_norm": 0.9395513534545898, "learning_rate": 0.0002, "epoch": 5.433835845896147, "step": 8110}, {"loss": 1.3444, "grad_norm": 0.9364215135574341, "learning_rate": 0.0002, "epoch": 5.440536013400335, "step": 8120}, {"loss": 1.2944, "grad_norm": 0.9502208232879639, "learning_rate": 0.0002, "epoch": 5.447236180904523, "step": 8130}, {"loss": 1.2971, "grad_norm": 0.9559208154678345, "learning_rate": 0.0002, "epoch": 5.45393634840871, "step": 8140}, {"loss": 1.2495, "grad_norm": 0.9261730313301086, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 8150}, {"loss": 1.2599, "grad_norm": 0.9832326173782349, "learning_rate": 0.0002, "epoch": 5.467336683417085, "step": 8160}, {"loss": 1.2771, "grad_norm": 1.065953016281128, "learning_rate": 0.0002, "epoch": 5.474036850921273, "step": 8170}, {"loss": 1.3617, "grad_norm": 0.9139469861984253, "learning_rate": 0.0002, "epoch": 5.480737018425461, "step": 8180}, {"loss": 1.2197, "grad_norm": 1.2322484254837036, "learning_rate": 0.0002, "epoch": 5.4874371859296485, "step": 8190}, {"loss": 1.2879, "grad_norm": 0.9722974896430969, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 8200}, {"loss": 1.2664, "grad_norm": 0.9338926076889038, "learning_rate": 0.0002, "epoch": 5.500837520938023, "step": 8210}, {"loss": 1.2128, "grad_norm": 0.9283728003501892, "learning_rate": 0.0002, "epoch": 5.507537688442211, "step": 8220}, {"loss": 1.2141, "grad_norm": 1.0489585399627686, "learning_rate": 0.0002, "epoch": 5.514237855946399, "step": 8230}, {"loss": 1.3257, "grad_norm": 0.9881814122200012, "learning_rate": 0.0002, "epoch": 5.5209380234505865, "step": 8240}, {"loss": 1.2221, "grad_norm": 0.9274460077285767, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 8250}, {"loss": 1.2241, "grad_norm": 0.8650718331336975, "learning_rate": 0.0002, "epoch": 5.534338358458961, "step": 8260}, {"loss": 1.2462, "grad_norm": 1.014069676399231, "learning_rate": 0.0002, "epoch": 5.541038525963149, "step": 8270}, {"loss": 1.3502, "grad_norm": 0.9212974905967712, "learning_rate": 0.0002, "epoch": 5.547738693467337, "step": 8280}, {"loss": 1.2779, "grad_norm": 1.1235398054122925, "learning_rate": 0.0002, "epoch": 5.5544388609715245, "step": 8290}, {"loss": 1.306, "grad_norm": 0.961954653263092, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 8300}, {"loss": 1.2946, "grad_norm": 0.9386700391769409, "learning_rate": 0.0002, "epoch": 5.567839195979899, "step": 8310}, {"loss": 1.313, "grad_norm": 1.01912522315979, "learning_rate": 0.0002, "epoch": 5.574539363484087, "step": 8320}, {"loss": 1.3121, "grad_norm": 0.9851216077804565, "learning_rate": 0.0002, "epoch": 5.581239530988275, "step": 8330}, {"loss": 1.3071, "grad_norm": 1.0138001441955566, "learning_rate": 0.0002, "epoch": 5.5879396984924625, "step": 8340}, {"loss": 1.2257, "grad_norm": 0.9262447357177734, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 8350}, {"loss": 1.2473, "grad_norm": 1.1322970390319824, "learning_rate": 0.0002, "epoch": 5.601340033500837, "step": 8360}, {"loss": 1.3098, "grad_norm": 1.1429349184036255, "learning_rate": 0.0002, "epoch": 5.608040201005025, "step": 8370}, {"loss": 1.2686, "grad_norm": 0.9130118489265442, "learning_rate": 0.0002, "epoch": 5.614740368509213, "step": 8380}, {"loss": 1.2541, "grad_norm": 0.9651545882225037, "learning_rate": 0.0002, "epoch": 5.6214405360134005, "step": 8390}, {"loss": 1.2799, "grad_norm": 0.9595398306846619, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 8400}, {"loss": 1.3429, "grad_norm": 1.0049372911453247, "learning_rate": 0.0002, "epoch": 5.634840871021775, "step": 8410}, {"loss": 1.3224, "grad_norm": 1.082804560661316, "learning_rate": 0.0002, "epoch": 5.641541038525963, "step": 8420}, {"loss": 1.297, "grad_norm": 0.9489204287528992, "learning_rate": 0.0002, "epoch": 5.648241206030151, "step": 8430}, {"loss": 1.3424, "grad_norm": 0.9470235109329224, "learning_rate": 0.0002, "epoch": 5.654941373534339, "step": 8440}, {"loss": 1.3358, "grad_norm": 1.0662927627563477, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 8450}, {"loss": 1.2973, "grad_norm": 0.9097877740859985, "learning_rate": 0.0002, "epoch": 5.668341708542713, "step": 8460}, {"loss": 1.3072, "grad_norm": 0.9740368127822876, "learning_rate": 0.0002, "epoch": 5.675041876046901, "step": 8470}, {"loss": 1.286, "grad_norm": 0.9878810048103333, "learning_rate": 0.0002, "epoch": 5.681742043551089, "step": 8480}, {"loss": 1.208, "grad_norm": 1.148260474205017, "learning_rate": 0.0002, "epoch": 5.688442211055277, "step": 8490}, {"loss": 1.2842, "grad_norm": 0.9632558822631836, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 8500}, {"loss": 1.2787, "grad_norm": 0.876812756061554, "learning_rate": 0.0002, "epoch": 5.701842546063651, "step": 8510}, {"loss": 1.3186, "grad_norm": 1.0730829238891602, "learning_rate": 0.0002, "epoch": 5.708542713567839, "step": 8520}, {"loss": 1.2856, "grad_norm": 1.2239218950271606, "learning_rate": 0.0002, "epoch": 5.715242881072027, "step": 8530}, {"loss": 1.2717, "grad_norm": 0.9460835456848145, "learning_rate": 0.0002, "epoch": 5.721943048576215, "step": 8540}, {"loss": 1.3509, "grad_norm": 0.9086270928382874, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 8550}, {"loss": 1.2971, "grad_norm": 1.0258867740631104, "learning_rate": 0.0002, "epoch": 5.735343383584589, "step": 8560}, {"loss": 1.3581, "grad_norm": 1.0543923377990723, "learning_rate": 0.0002, "epoch": 5.742043551088777, "step": 8570}, {"loss": 1.2988, "grad_norm": 0.9063900113105774, "learning_rate": 0.0002, "epoch": 5.748743718592965, "step": 8580}, {"loss": 1.3535, "grad_norm": 1.1838830709457397, "learning_rate": 0.0002, "epoch": 5.755443886097153, "step": 8590}, {"loss": 1.2655, "grad_norm": 0.9631859064102173, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 8600}, {"loss": 1.276, "grad_norm": 0.9702655673027039, "learning_rate": 0.0002, "epoch": 5.768844221105527, "step": 8610}, {"loss": 1.3196, "grad_norm": 1.0591435432434082, "learning_rate": 0.0002, "epoch": 5.775544388609715, "step": 8620}, {"loss": 1.267, "grad_norm": 0.9989570379257202, "learning_rate": 0.0002, "epoch": 5.782244556113903, "step": 8630}, {"loss": 1.3227, "grad_norm": 1.0836435556411743, "learning_rate": 0.0002, "epoch": 5.788944723618091, "step": 8640}, {"loss": 1.3334, "grad_norm": 0.8832896947860718, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 8650}, {"loss": 1.3214, "grad_norm": 1.0104607343673706, "learning_rate": 0.0002, "epoch": 5.802345058626465, "step": 8660}, {"loss": 1.2703, "grad_norm": 0.8375084400177002, "learning_rate": 0.0002, "epoch": 5.809045226130653, "step": 8670}, {"loss": 1.3554, "grad_norm": 1.1300716400146484, "learning_rate": 0.0002, "epoch": 5.815745393634841, "step": 8680}, {"loss": 1.3468, "grad_norm": 0.9311910271644592, "learning_rate": 0.0002, "epoch": 5.822445561139029, "step": 8690}, {"loss": 1.2749, "grad_norm": 0.9488391876220703, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 8700}, {"loss": 1.2281, "grad_norm": 0.9747629761695862, "learning_rate": 0.0002, "epoch": 5.835845896147403, "step": 8710}, {"loss": 1.2923, "grad_norm": 1.1029598712921143, "learning_rate": 0.0002, "epoch": 5.842546063651591, "step": 8720}, {"loss": 1.3613, "grad_norm": 1.0396875143051147, "learning_rate": 0.0002, "epoch": 5.849246231155779, "step": 8730}, {"loss": 1.3272, "grad_norm": 0.9259780645370483, "learning_rate": 0.0002, "epoch": 5.855946398659967, "step": 8740}, {"loss": 1.3236, "grad_norm": 1.020033597946167, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 8750}, {"loss": 1.3453, "grad_norm": 0.9191218614578247, "learning_rate": 0.0002, "epoch": 5.869346733668341, "step": 8760}, {"loss": 1.3012, "grad_norm": 1.1093107461929321, "learning_rate": 0.0002, "epoch": 5.876046901172529, "step": 8770}, {"loss": 1.2718, "grad_norm": 1.1626793146133423, "learning_rate": 0.0002, "epoch": 5.882747068676717, "step": 8780}, {"loss": 1.2969, "grad_norm": 0.9542945027351379, "learning_rate": 0.0002, "epoch": 5.889447236180905, "step": 8790}, {"loss": 1.3134, "grad_norm": 0.9086058139801025, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 8800}, {"loss": 1.2731, "grad_norm": 0.9249639511108398, "learning_rate": 0.0002, "epoch": 5.902847571189279, "step": 8810}, {"loss": 1.337, "grad_norm": 0.9414396286010742, "learning_rate": 0.0002, "epoch": 5.909547738693467, "step": 8820}, {"loss": 1.2865, "grad_norm": 0.9086037874221802, "learning_rate": 0.0002, "epoch": 5.916247906197655, "step": 8830}, {"loss": 1.2756, "grad_norm": 0.8685907125473022, "learning_rate": 0.0002, "epoch": 5.922948073701843, "step": 8840}, {"loss": 1.297, "grad_norm": 1.036419153213501, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 8850}, {"loss": 1.3207, "grad_norm": 1.0183674097061157, "learning_rate": 0.0002, "epoch": 5.936348408710217, "step": 8860}, {"loss": 1.3922, "grad_norm": 0.966444194316864, "learning_rate": 0.0002, "epoch": 5.943048576214405, "step": 8870}, {"loss": 1.333, "grad_norm": 1.125693917274475, "learning_rate": 0.0002, "epoch": 5.949748743718593, "step": 8880}, {"loss": 1.3116, "grad_norm": 0.9857436418533325, "learning_rate": 0.0002, "epoch": 5.956448911222781, "step": 8890}, {"loss": 1.2526, "grad_norm": 0.9377069473266602, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 8900}, {"loss": 1.3221, "grad_norm": 0.9493814706802368, "learning_rate": 0.0002, "epoch": 5.969849246231155, "step": 8910}, {"loss": 1.2516, "grad_norm": 0.8806208372116089, "learning_rate": 0.0002, "epoch": 5.976549413735343, "step": 8920}, {"loss": 1.2558, "grad_norm": 0.8727600574493408, "learning_rate": 0.0002, "epoch": 5.983249581239531, "step": 8930}, {"loss": 1.3538, "grad_norm": 0.9799810647964478, "learning_rate": 0.0002, "epoch": 5.989949748743719, "step": 8940}, {"loss": 1.3323, "grad_norm": 0.9866513609886169, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 8950}, {"eval_loss": 2.0282373428344727, "eval_runtime": 38.0375, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 6.0, "step": 8955}, {"loss": 1.1768, "grad_norm": 0.8747885227203369, "learning_rate": 0.0002, "epoch": 6.0033500837520934, "step": 8960}, {"loss": 1.0677, "grad_norm": 1.2512741088867188, "learning_rate": 0.0002, "epoch": 6.010050251256281, "step": 8970}, {"loss": 1.1128, "grad_norm": 1.06855309009552, "learning_rate": 0.0002, "epoch": 6.016750418760469, "step": 8980}, {"loss": 1.1382, "grad_norm": 1.1868711709976196, "learning_rate": 0.0002, "epoch": 6.023450586264657, "step": 8990}, {"loss": 1.1377, "grad_norm": 1.2984495162963867, "learning_rate": 0.0002, "epoch": 6.030150753768845, "step": 9000}, {"loss": 1.0803, "grad_norm": 1.1147589683532715, "learning_rate": 0.0002, "epoch": 6.0368509212730315, "step": 9010}, {"loss": 1.1244, "grad_norm": 1.3128414154052734, "learning_rate": 0.0002, "epoch": 6.043551088777219, "step": 9020}, {"loss": 1.097, "grad_norm": 1.068290114402771, "learning_rate": 0.0002, "epoch": 6.050251256281407, "step": 9030}, {"loss": 1.1764, "grad_norm": 1.1890562772750854, "learning_rate": 0.0002, "epoch": 6.056951423785595, "step": 9040}, {"loss": 1.1239, "grad_norm": 1.2121573686599731, "learning_rate": 0.0002, "epoch": 6.063651591289783, "step": 9050}, {"loss": 1.0683, "grad_norm": 1.0860483646392822, "learning_rate": 0.0002, "epoch": 6.0703517587939695, "step": 9060}, {"loss": 1.1613, "grad_norm": 1.1214599609375, "learning_rate": 0.0002, "epoch": 6.077051926298157, "step": 9070}, {"loss": 1.1001, "grad_norm": 1.147580862045288, "learning_rate": 0.0002, "epoch": 6.083752093802345, "step": 9080}, {"loss": 1.1154, "grad_norm": 1.3233155012130737, "learning_rate": 0.0002, "epoch": 6.090452261306533, "step": 9090}, {"loss": 1.1017, "grad_norm": 1.1869080066680908, "learning_rate": 0.0002, "epoch": 6.097152428810721, "step": 9100}, {"loss": 1.1532, "grad_norm": 1.1695014238357544, "learning_rate": 0.0002, "epoch": 6.1038525963149075, "step": 9110}, {"loss": 1.1366, "grad_norm": 1.1982251405715942, "learning_rate": 0.0002, "epoch": 6.110552763819095, "step": 9120}, {"loss": 1.0805, "grad_norm": 1.1426950693130493, "learning_rate": 0.0002, "epoch": 6.117252931323283, "step": 9130}, {"loss": 1.0801, "grad_norm": 1.2257394790649414, "learning_rate": 0.0002, "epoch": 6.123953098827471, "step": 9140}, {"loss": 1.1209, "grad_norm": 1.2932263612747192, "learning_rate": 0.0002, "epoch": 6.130653266331659, "step": 9150}, {"loss": 1.0934, "grad_norm": 1.2617030143737793, "learning_rate": 0.0002, "epoch": 6.1373534338358455, "step": 9160}, {"loss": 1.0551, "grad_norm": 1.1201422214508057, "learning_rate": 0.0002, "epoch": 6.144053601340033, "step": 9170}, {"loss": 1.1059, "grad_norm": 0.9625319838523865, "learning_rate": 0.0002, "epoch": 6.150753768844221, "step": 9180}, {"loss": 1.1397, "grad_norm": 1.0290048122406006, "learning_rate": 0.0002, "epoch": 6.157453936348409, "step": 9190}, {"loss": 1.1257, "grad_norm": 1.1137803792953491, "learning_rate": 0.0002, "epoch": 6.164154103852597, "step": 9200}, {"loss": 1.1211, "grad_norm": 1.3674522638320923, "learning_rate": 0.0002, "epoch": 6.1708542713567835, "step": 9210}, {"loss": 1.0947, "grad_norm": 1.182207703590393, "learning_rate": 0.0002, "epoch": 6.177554438860971, "step": 9220}, {"loss": 1.0838, "grad_norm": 1.0496711730957031, "learning_rate": 0.0002, "epoch": 6.184254606365159, "step": 9230}, {"loss": 1.0666, "grad_norm": 1.1899489164352417, "learning_rate": 0.0002, "epoch": 6.190954773869347, "step": 9240}, {"loss": 1.1633, "grad_norm": 1.2666147947311401, "learning_rate": 0.0002, "epoch": 6.197654941373535, "step": 9250}, {"loss": 1.1532, "grad_norm": 1.2013030052185059, "learning_rate": 0.0002, "epoch": 6.204355108877722, "step": 9260}, {"loss": 1.151, "grad_norm": 1.3049768209457397, "learning_rate": 0.0002, "epoch": 6.211055276381909, "step": 9270}, {"loss": 1.0741, "grad_norm": 1.1733006238937378, "learning_rate": 0.0002, "epoch": 6.217755443886097, "step": 9280}, {"loss": 1.0933, "grad_norm": 1.2742516994476318, "learning_rate": 0.0002, "epoch": 6.224455611390285, "step": 9290}, {"loss": 1.1028, "grad_norm": 1.110198974609375, "learning_rate": 0.0002, "epoch": 6.231155778894473, "step": 9300}, {"loss": 1.1619, "grad_norm": 1.159963607788086, "learning_rate": 0.0002, "epoch": 6.23785594639866, "step": 9310}, {"loss": 1.0716, "grad_norm": 1.302216649055481, "learning_rate": 0.0002, "epoch": 6.244556113902847, "step": 9320}, {"loss": 1.0694, "grad_norm": 1.2134063243865967, "learning_rate": 0.0002, "epoch": 6.251256281407035, "step": 9330}, {"loss": 1.2151, "grad_norm": 1.062682867050171, "learning_rate": 0.0002, "epoch": 6.257956448911223, "step": 9340}, {"loss": 1.148, "grad_norm": 1.1568971872329712, "learning_rate": 0.0002, "epoch": 6.264656616415411, "step": 9350}, {"loss": 1.094, "grad_norm": 0.9914957880973816, "learning_rate": 0.0002, "epoch": 6.271356783919598, "step": 9360}, {"loss": 1.125, "grad_norm": 1.017250895500183, "learning_rate": 0.0002, "epoch": 6.278056951423785, "step": 9370}, {"loss": 1.2177, "grad_norm": 1.1862834692001343, "learning_rate": 0.0002, "epoch": 6.284757118927973, "step": 9380}, {"loss": 0.9994, "grad_norm": 1.2834911346435547, "learning_rate": 0.0002, "epoch": 6.291457286432161, "step": 9390}, {"loss": 1.0922, "grad_norm": 1.3306856155395508, "learning_rate": 0.0002, "epoch": 6.298157453936349, "step": 9400}, {"loss": 1.1136, "grad_norm": 1.12908136844635, "learning_rate": 0.0002, "epoch": 6.304857621440536, "step": 9410}, {"loss": 1.1406, "grad_norm": 1.2157351970672607, "learning_rate": 0.0002, "epoch": 6.311557788944723, "step": 9420}, {"loss": 1.1388, "grad_norm": 1.121882677078247, "learning_rate": 0.0002, "epoch": 6.318257956448911, "step": 9430}, {"loss": 1.1648, "grad_norm": 1.3144481182098389, "learning_rate": 0.0002, "epoch": 6.324958123953099, "step": 9440}, {"loss": 1.1228, "grad_norm": 1.1946896314620972, "learning_rate": 0.0002, "epoch": 6.331658291457287, "step": 9450}, {"loss": 1.1613, "grad_norm": 1.1289668083190918, "learning_rate": 0.0002, "epoch": 6.338358458961474, "step": 9460}, {"loss": 1.1059, "grad_norm": 1.1065658330917358, "learning_rate": 0.0002, "epoch": 6.345058626465661, "step": 9470}, {"loss": 1.1431, "grad_norm": 1.0881422758102417, "learning_rate": 0.0002, "epoch": 6.351758793969849, "step": 9480}, {"loss": 1.223, "grad_norm": 1.242676854133606, "learning_rate": 0.0002, "epoch": 6.358458961474037, "step": 9490}, {"loss": 1.1379, "grad_norm": 0.9650855660438538, "learning_rate": 0.0002, "epoch": 6.365159128978225, "step": 9500}, {"loss": 1.0763, "grad_norm": 1.2845722436904907, "learning_rate": 0.0002, "epoch": 6.371859296482412, "step": 9510}, {"loss": 1.1351, "grad_norm": 1.0327043533325195, "learning_rate": 0.0002, "epoch": 6.3785594639865995, "step": 9520}, {"loss": 1.114, "grad_norm": 1.0780898332595825, "learning_rate": 0.0002, "epoch": 6.385259631490787, "step": 9530}, {"loss": 1.1579, "grad_norm": 1.4934027194976807, "learning_rate": 0.0002, "epoch": 6.391959798994975, "step": 9540}, {"loss": 1.1546, "grad_norm": 0.9882908463478088, "learning_rate": 0.0002, "epoch": 6.398659966499163, "step": 9550}, {"loss": 1.1145, "grad_norm": 1.3250664472579956, "learning_rate": 0.0002, "epoch": 6.40536013400335, "step": 9560}, {"loss": 1.2333, "grad_norm": 1.1888482570648193, "learning_rate": 0.0002, "epoch": 6.4120603015075375, "step": 9570}, {"loss": 1.0892, "grad_norm": 1.136496901512146, "learning_rate": 0.0002, "epoch": 6.418760469011725, "step": 9580}, {"loss": 1.1674, "grad_norm": 1.161360502243042, "learning_rate": 0.0002, "epoch": 6.425460636515913, "step": 9590}, {"loss": 1.1293, "grad_norm": 1.2034236192703247, "learning_rate": 0.0002, "epoch": 6.432160804020101, "step": 9600}, {"loss": 1.1059, "grad_norm": 1.0268361568450928, "learning_rate": 0.0002, "epoch": 6.438860971524288, "step": 9610}, {"loss": 1.1732, "grad_norm": 1.2132930755615234, "learning_rate": 0.0002, "epoch": 6.4455611390284755, "step": 9620}, {"loss": 1.1329, "grad_norm": 1.0773013830184937, "learning_rate": 0.0002, "epoch": 6.452261306532663, "step": 9630}, {"loss": 1.0822, "grad_norm": 1.3848375082015991, "learning_rate": 0.0002, "epoch": 6.458961474036851, "step": 9640}, {"loss": 1.1778, "grad_norm": 1.110495924949646, "learning_rate": 0.0002, "epoch": 6.465661641541039, "step": 9650}, {"loss": 1.2022, "grad_norm": 1.118093729019165, "learning_rate": 0.0002, "epoch": 6.472361809045226, "step": 9660}, {"loss": 1.1222, "grad_norm": 1.2611900568008423, "learning_rate": 0.0002, "epoch": 6.4790619765494135, "step": 9670}, {"loss": 1.2138, "grad_norm": 0.971754252910614, "learning_rate": 0.0002, "epoch": 6.485762144053601, "step": 9680}, {"loss": 1.1641, "grad_norm": 1.2615419626235962, "learning_rate": 0.0002, "epoch": 6.492462311557789, "step": 9690}, {"loss": 1.1412, "grad_norm": 1.1370900869369507, "learning_rate": 0.0002, "epoch": 6.499162479061977, "step": 9700}, {"loss": 1.186, "grad_norm": 1.1815906763076782, "learning_rate": 0.0002, "epoch": 6.505862646566165, "step": 9710}, {"loss": 1.167, "grad_norm": 1.3424339294433594, "learning_rate": 0.0002, "epoch": 6.5125628140703515, "step": 9720}, {"loss": 1.1602, "grad_norm": 1.2858397960662842, "learning_rate": 0.0002, "epoch": 6.519262981574539, "step": 9730}, {"loss": 1.178, "grad_norm": 0.9578179121017456, "learning_rate": 0.0002, "epoch": 6.525963149078727, "step": 9740}, {"loss": 1.1805, "grad_norm": 1.3105167150497437, "learning_rate": 0.0002, "epoch": 6.532663316582915, "step": 9750}, {"loss": 1.1899, "grad_norm": 1.0586575269699097, "learning_rate": 0.0002, "epoch": 6.539363484087103, "step": 9760}, {"loss": 1.095, "grad_norm": 1.2122068405151367, "learning_rate": 0.0002, "epoch": 6.54606365159129, "step": 9770}, {"loss": 1.1471, "grad_norm": 1.3088626861572266, "learning_rate": 0.0002, "epoch": 6.552763819095477, "step": 9780}, {"loss": 1.1067, "grad_norm": 1.194122076034546, "learning_rate": 0.0002, "epoch": 6.559463986599665, "step": 9790}, {"loss": 1.0967, "grad_norm": 1.1508387327194214, "learning_rate": 0.0002, "epoch": 6.566164154103853, "step": 9800}, {"loss": 1.1694, "grad_norm": 1.109228253364563, "learning_rate": 0.0002, "epoch": 6.572864321608041, "step": 9810}, {"loss": 1.1378, "grad_norm": 1.1607427597045898, "learning_rate": 0.0002, "epoch": 6.579564489112228, "step": 9820}, {"loss": 1.1585, "grad_norm": 1.174089789390564, "learning_rate": 0.0002, "epoch": 6.586264656616415, "step": 9830}, {"loss": 1.1385, "grad_norm": 1.1739521026611328, "learning_rate": 0.0002, "epoch": 6.592964824120603, "step": 9840}, {"loss": 1.155, "grad_norm": 1.098528504371643, "learning_rate": 0.0002, "epoch": 6.599664991624791, "step": 9850}, {"loss": 1.1359, "grad_norm": 1.0397740602493286, "learning_rate": 0.0002, "epoch": 6.606365159128979, "step": 9860}, {"loss": 1.1433, "grad_norm": 1.1087969541549683, "learning_rate": 0.0002, "epoch": 6.613065326633166, "step": 9870}, {"loss": 1.2356, "grad_norm": 1.2070481777191162, "learning_rate": 0.0002, "epoch": 6.619765494137353, "step": 9880}, {"loss": 1.1161, "grad_norm": 1.1115655899047852, "learning_rate": 0.0002, "epoch": 6.626465661641541, "step": 9890}, {"loss": 1.2163, "grad_norm": 1.2486097812652588, "learning_rate": 0.0002, "epoch": 6.633165829145729, "step": 9900}, {"loss": 1.0984, "grad_norm": 1.230380654335022, "learning_rate": 0.0002, "epoch": 6.639865996649917, "step": 9910}, {"loss": 1.1862, "grad_norm": 1.1479365825653076, "learning_rate": 0.0002, "epoch": 6.646566164154104, "step": 9920}, {"loss": 1.1139, "grad_norm": 1.0790960788726807, "learning_rate": 0.0002, "epoch": 6.653266331658291, "step": 9930}, {"loss": 1.2001, "grad_norm": 1.1157397031784058, "learning_rate": 0.0002, "epoch": 6.659966499162479, "step": 9940}, {"loss": 1.1085, "grad_norm": 1.3104028701782227, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9950}, {"loss": 1.2209, "grad_norm": 1.1727646589279175, "learning_rate": 0.0002, "epoch": 6.673366834170855, "step": 9960}, {"loss": 1.1671, "grad_norm": 1.2104284763336182, "learning_rate": 0.0002, "epoch": 6.680067001675042, "step": 9970}, {"loss": 1.1952, "grad_norm": 1.2023727893829346, "learning_rate": 0.0002, "epoch": 6.686767169179229, "step": 9980}, {"loss": 1.1385, "grad_norm": 1.0088225603103638, "learning_rate": 0.0002, "epoch": 6.693467336683417, "step": 9990}, {"loss": 1.1314, "grad_norm": 1.298015832901001, "learning_rate": 0.0002, "epoch": 6.700167504187605, "step": 10000}, {"loss": 1.1983, "grad_norm": 1.1315910816192627, "learning_rate": 0.0002, "epoch": 6.706867671691793, "step": 10010}, {"loss": 1.1679, "grad_norm": 1.1283273696899414, "learning_rate": 0.0002, "epoch": 6.71356783919598, "step": 10020}, {"loss": 1.1917, "grad_norm": 1.2564418315887451, "learning_rate": 0.0002, "epoch": 6.720268006700167, "step": 10030}, {"loss": 1.1788, "grad_norm": 1.0451353788375854, "learning_rate": 0.0002, "epoch": 6.726968174204355, "step": 10040}, {"loss": 1.1905, "grad_norm": 1.054793357849121, "learning_rate": 0.0002, "epoch": 6.733668341708543, "step": 10050}, {"loss": 1.1814, "grad_norm": 1.2741243839263916, "learning_rate": 0.0002, "epoch": 6.740368509212731, "step": 10060}, {"loss": 1.2015, "grad_norm": 1.1342514753341675, "learning_rate": 0.0002, "epoch": 6.747068676716918, "step": 10070}, {"loss": 1.2587, "grad_norm": 1.0081498622894287, "learning_rate": 0.0002, "epoch": 6.7537688442211055, "step": 10080}, {"loss": 1.1226, "grad_norm": 1.2164603471755981, "learning_rate": 0.0002, "epoch": 6.760469011725293, "step": 10090}, {"loss": 1.1353, "grad_norm": 1.2062463760375977, "learning_rate": 0.0002, "epoch": 6.767169179229481, "step": 10100}, {"loss": 1.2143, "grad_norm": 1.2255526781082153, "learning_rate": 0.0002, "epoch": 6.773869346733669, "step": 10110}, {"loss": 1.1544, "grad_norm": 1.08175790309906, "learning_rate": 0.0002, "epoch": 6.780569514237856, "step": 10120}, {"loss": 1.1983, "grad_norm": 1.5781128406524658, "learning_rate": 0.0002, "epoch": 6.7872696817420435, "step": 10130}, {"loss": 1.0994, "grad_norm": 1.0622451305389404, "learning_rate": 0.0002, "epoch": 6.793969849246231, "step": 10140}, {"loss": 1.2084, "grad_norm": 1.1591497659683228, "learning_rate": 0.0002, "epoch": 6.800670016750419, "step": 10150}, {"loss": 1.2203, "grad_norm": 1.0398483276367188, "learning_rate": 0.0002, "epoch": 6.807370184254607, "step": 10160}, {"loss": 1.2249, "grad_norm": 1.229132056236267, "learning_rate": 0.0002, "epoch": 6.814070351758794, "step": 10170}, {"loss": 1.1789, "grad_norm": 1.0918090343475342, "learning_rate": 0.0002, "epoch": 6.8207705192629815, "step": 10180}, {"loss": 1.1639, "grad_norm": 1.1543749570846558, "learning_rate": 0.0002, "epoch": 6.827470686767169, "step": 10190}, {"loss": 1.1353, "grad_norm": 1.1831817626953125, "learning_rate": 0.0002, "epoch": 6.834170854271357, "step": 10200}, {"loss": 1.2565, "grad_norm": 1.305327296257019, "learning_rate": 0.0002, "epoch": 6.840871021775545, "step": 10210}, {"loss": 1.2037, "grad_norm": 1.136720061302185, "learning_rate": 0.0002, "epoch": 6.847571189279732, "step": 10220}, {"loss": 1.2256, "grad_norm": 1.2282346487045288, "learning_rate": 0.0002, "epoch": 6.8542713567839195, "step": 10230}, {"loss": 1.1281, "grad_norm": 1.2457010746002197, "learning_rate": 0.0002, "epoch": 6.860971524288107, "step": 10240}, {"loss": 1.2762, "grad_norm": 1.2808631658554077, "learning_rate": 0.0002, "epoch": 6.867671691792295, "step": 10250}, {"loss": 1.2213, "grad_norm": 1.089066743850708, "learning_rate": 0.0002, "epoch": 6.874371859296483, "step": 10260}, {"loss": 1.2627, "grad_norm": 0.9543178081512451, "learning_rate": 0.0002, "epoch": 6.88107202680067, "step": 10270}, {"loss": 1.1617, "grad_norm": 1.1149744987487793, "learning_rate": 0.0002, "epoch": 6.8877721943048575, "step": 10280}, {"loss": 1.1134, "grad_norm": 1.0185538530349731, "learning_rate": 0.0002, "epoch": 6.894472361809045, "step": 10290}, {"loss": 1.217, "grad_norm": 0.9954617619514465, "learning_rate": 0.0002, "epoch": 6.901172529313233, "step": 10300}, {"loss": 1.1524, "grad_norm": 1.2581418752670288, "learning_rate": 0.0002, "epoch": 6.907872696817421, "step": 10310}, {"loss": 1.1942, "grad_norm": 1.2430983781814575, "learning_rate": 0.0002, "epoch": 6.914572864321608, "step": 10320}, {"loss": 1.1254, "grad_norm": 1.4937270879745483, "learning_rate": 0.0002, "epoch": 6.921273031825796, "step": 10330}, {"loss": 1.1519, "grad_norm": 1.1257144212722778, "learning_rate": 0.0002, "epoch": 6.927973199329983, "step": 10340}, {"loss": 1.2622, "grad_norm": 1.2068904638290405, "learning_rate": 0.0002, "epoch": 6.934673366834171, "step": 10350}, {"loss": 1.1349, "grad_norm": 1.0290757417678833, "learning_rate": 0.0002, "epoch": 6.941373534338359, "step": 10360}, {"loss": 1.1752, "grad_norm": 1.0070724487304688, "learning_rate": 0.0002, "epoch": 6.948073701842546, "step": 10370}, {"loss": 1.1838, "grad_norm": 0.9936357140541077, "learning_rate": 0.0002, "epoch": 6.954773869346734, "step": 10380}, {"loss": 1.2305, "grad_norm": 1.1063416004180908, "learning_rate": 0.0002, "epoch": 6.961474036850921, "step": 10390}, {"loss": 1.154, "grad_norm": 1.5199986696243286, "learning_rate": 0.0002, "epoch": 6.968174204355109, "step": 10400}, {"loss": 1.1143, "grad_norm": 1.160731554031372, "learning_rate": 0.0002, "epoch": 6.974874371859297, "step": 10410}, {"loss": 1.2132, "grad_norm": 1.084697961807251, "learning_rate": 0.0002, "epoch": 6.981574539363484, "step": 10420}, {"loss": 1.1995, "grad_norm": 1.1257576942443848, "learning_rate": 0.0002, "epoch": 6.988274706867672, "step": 10430}, {"loss": 1.1197, "grad_norm": 1.310616135597229, "learning_rate": 0.0002, "epoch": 6.994974874371859, "step": 10440}]} +{"epoch": 7.997319932998325, "step": 11936, "epoch_duration": 1594.4848327636719, "total_accumulated_duration": 12784.900410413742, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-42/checkpoint-2985", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6189, "grad_norm": 0.565915048122406, "learning_rate": 0.0002, "epoch": 0.006700167504187605, "step": 10}, {"loss": 2.3162, "grad_norm": 0.5004463791847229, "learning_rate": 0.0002, "epoch": 0.01340033500837521, "step": 20}, {"loss": 2.0576, "grad_norm": 0.511043906211853, "learning_rate": 0.0002, "epoch": 0.020100502512562814, "step": 30}, {"loss": 2.0085, "grad_norm": 0.47327178716659546, "learning_rate": 0.0002, "epoch": 0.02680067001675042, "step": 40}, {"loss": 2.0276, "grad_norm": 0.5511676669120789, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 50}, {"loss": 1.9075, "grad_norm": 0.4666278064250946, "learning_rate": 0.0002, "epoch": 0.04020100502512563, "step": 60}, {"loss": 1.8413, "grad_norm": 0.5310961008071899, "learning_rate": 0.0002, "epoch": 0.04690117252931323, "step": 70}, {"loss": 1.8711, "grad_norm": 0.5606027245521545, "learning_rate": 0.0002, "epoch": 0.05360134003350084, "step": 80}, {"loss": 1.9282, "grad_norm": 0.4934779703617096, "learning_rate": 0.0002, "epoch": 0.06030150753768844, "step": 90}, {"loss": 1.8925, "grad_norm": 0.4821869730949402, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 100}, {"loss": 1.8628, "grad_norm": 0.5262084603309631, "learning_rate": 0.0002, "epoch": 0.07370184254606366, "step": 110}, {"loss": 1.8347, "grad_norm": 0.3774230182170868, "learning_rate": 0.0002, "epoch": 0.08040201005025126, "step": 120}, {"loss": 1.8386, "grad_norm": 0.34137430787086487, "learning_rate": 0.0002, "epoch": 0.08710217755443886, "step": 130}, {"loss": 1.861, "grad_norm": 0.407272070646286, "learning_rate": 0.0002, "epoch": 0.09380234505862646, "step": 140}, {"loss": 1.8279, "grad_norm": 0.4011937975883484, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 150}, {"loss": 1.9317, "grad_norm": 0.4432467222213745, "learning_rate": 0.0002, "epoch": 0.10720268006700168, "step": 160}, {"loss": 1.8157, "grad_norm": 0.44030463695526123, "learning_rate": 0.0002, "epoch": 0.11390284757118928, "step": 170}, {"loss": 1.8534, "grad_norm": 0.3799569308757782, "learning_rate": 0.0002, "epoch": 0.12060301507537688, "step": 180}, {"loss": 1.7658, "grad_norm": 0.33721521496772766, "learning_rate": 0.0002, "epoch": 0.1273031825795645, "step": 190}, {"loss": 1.8269, "grad_norm": 0.4096226692199707, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 200}, {"loss": 1.802, "grad_norm": 0.37374693155288696, "learning_rate": 0.0002, "epoch": 0.1407035175879397, "step": 210}, {"loss": 1.8901, "grad_norm": 0.3249480128288269, "learning_rate": 0.0002, "epoch": 0.1474036850921273, "step": 220}, {"loss": 1.8163, "grad_norm": 0.3612042963504791, "learning_rate": 0.0002, "epoch": 0.1541038525963149, "step": 230}, {"loss": 1.7585, "grad_norm": 0.3686671257019043, "learning_rate": 0.0002, "epoch": 0.16080402010050251, "step": 240}, {"loss": 1.8365, "grad_norm": 0.3521044850349426, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 250}, {"loss": 1.8623, "grad_norm": 0.4073677361011505, "learning_rate": 0.0002, "epoch": 0.17420435510887772, "step": 260}, {"loss": 1.8026, "grad_norm": 0.34522193670272827, "learning_rate": 0.0002, "epoch": 0.18090452261306533, "step": 270}, {"loss": 1.8162, "grad_norm": 0.4121900498867035, "learning_rate": 0.0002, "epoch": 0.18760469011725292, "step": 280}, {"loss": 1.7976, "grad_norm": 0.3544778525829315, "learning_rate": 0.0002, "epoch": 0.19430485762144054, "step": 290}, {"loss": 1.8787, "grad_norm": 0.3482133448123932, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 300}, {"loss": 1.8578, "grad_norm": 0.3421826660633087, "learning_rate": 0.0002, "epoch": 0.20770519262981574, "step": 310}, {"loss": 1.8013, "grad_norm": 0.5024696588516235, "learning_rate": 0.0002, "epoch": 0.21440536013400335, "step": 320}, {"loss": 1.8607, "grad_norm": 0.36013063788414, "learning_rate": 0.0002, "epoch": 0.22110552763819097, "step": 330}, {"loss": 1.9075, "grad_norm": 0.3611244857311249, "learning_rate": 0.0002, "epoch": 0.22780569514237856, "step": 340}, {"loss": 1.8128, "grad_norm": 0.39244529604911804, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 350}, {"loss": 1.7885, "grad_norm": 0.3299325704574585, "learning_rate": 0.0002, "epoch": 0.24120603015075376, "step": 360}, {"loss": 1.8028, "grad_norm": 0.3994322419166565, "learning_rate": 0.0002, "epoch": 0.24790619765494137, "step": 370}, {"loss": 1.8321, "grad_norm": 0.3559151887893677, "learning_rate": 0.0002, "epoch": 0.254606365159129, "step": 380}, {"loss": 1.7802, "grad_norm": 0.3873756229877472, "learning_rate": 0.0002, "epoch": 0.2613065326633166, "step": 390}, {"loss": 1.7844, "grad_norm": 0.3710744082927704, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 400}, {"loss": 1.7789, "grad_norm": 0.3618465065956116, "learning_rate": 0.0002, "epoch": 0.2747068676716918, "step": 410}, {"loss": 1.8529, "grad_norm": 0.30063769221305847, "learning_rate": 0.0002, "epoch": 0.2814070351758794, "step": 420}, {"loss": 1.7765, "grad_norm": 0.3695628345012665, "learning_rate": 0.0002, "epoch": 0.288107202680067, "step": 430}, {"loss": 1.7982, "grad_norm": 0.31451135873794556, "learning_rate": 0.0002, "epoch": 0.2948073701842546, "step": 440}, {"loss": 1.7517, "grad_norm": 0.3959707021713257, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 450}, {"loss": 1.8142, "grad_norm": 0.33475354313850403, "learning_rate": 0.0002, "epoch": 0.3082077051926298, "step": 460}, {"loss": 1.8805, "grad_norm": 0.33933115005493164, "learning_rate": 0.0002, "epoch": 0.3149078726968174, "step": 470}, {"loss": 1.7564, "grad_norm": 0.3264943063259125, "learning_rate": 0.0002, "epoch": 0.32160804020100503, "step": 480}, {"loss": 1.8428, "grad_norm": 0.40188100934028625, "learning_rate": 0.0002, "epoch": 0.32830820770519265, "step": 490}, {"loss": 1.7624, "grad_norm": 0.37408649921417236, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 500}, {"loss": 1.7745, "grad_norm": 0.33925938606262207, "learning_rate": 0.0002, "epoch": 0.3417085427135678, "step": 510}, {"loss": 1.814, "grad_norm": 0.36836713552474976, "learning_rate": 0.0002, "epoch": 0.34840871021775544, "step": 520}, {"loss": 1.8037, "grad_norm": 0.37284499406814575, "learning_rate": 0.0002, "epoch": 0.35510887772194305, "step": 530}, {"loss": 1.8379, "grad_norm": 0.3192278742790222, "learning_rate": 0.0002, "epoch": 0.36180904522613067, "step": 540}, {"loss": 1.8702, "grad_norm": 0.30233290791511536, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 550}, {"loss": 1.8799, "grad_norm": 0.3340817391872406, "learning_rate": 0.0002, "epoch": 0.37520938023450584, "step": 560}, {"loss": 1.8404, "grad_norm": 0.32600095868110657, "learning_rate": 0.0002, "epoch": 0.38190954773869346, "step": 570}, {"loss": 1.7804, "grad_norm": 0.33711278438568115, "learning_rate": 0.0002, "epoch": 0.38860971524288107, "step": 580}, {"loss": 1.8445, "grad_norm": 0.34890690445899963, "learning_rate": 0.0002, "epoch": 0.3953098827470687, "step": 590}, {"loss": 1.8187, "grad_norm": 0.38238924741744995, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 600}, {"loss": 1.8111, "grad_norm": 0.34399354457855225, "learning_rate": 0.0002, "epoch": 0.40871021775544386, "step": 610}, {"loss": 1.8006, "grad_norm": 0.3346073627471924, "learning_rate": 0.0002, "epoch": 0.4154103852596315, "step": 620}, {"loss": 1.7705, "grad_norm": 0.3545648157596588, "learning_rate": 0.0002, "epoch": 0.4221105527638191, "step": 630}, {"loss": 1.8445, "grad_norm": 0.3378899097442627, "learning_rate": 0.0002, "epoch": 0.4288107202680067, "step": 640}, {"loss": 1.804, "grad_norm": 0.3255569040775299, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 650}, {"loss": 1.7679, "grad_norm": 0.34880587458610535, "learning_rate": 0.0002, "epoch": 0.44221105527638194, "step": 660}, {"loss": 1.7861, "grad_norm": 0.3402383625507355, "learning_rate": 0.0002, "epoch": 0.4489112227805695, "step": 670}, {"loss": 1.8131, "grad_norm": 0.3594033718109131, "learning_rate": 0.0002, "epoch": 0.4556113902847571, "step": 680}, {"loss": 1.8399, "grad_norm": 0.31000566482543945, "learning_rate": 0.0002, "epoch": 0.4623115577889447, "step": 690}, {"loss": 1.7521, "grad_norm": 0.37229061126708984, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 700}, {"loss": 1.7779, "grad_norm": 0.315801739692688, "learning_rate": 0.0002, "epoch": 0.47571189279731996, "step": 710}, {"loss": 1.7515, "grad_norm": 0.3220832645893097, "learning_rate": 0.0002, "epoch": 0.4824120603015075, "step": 720}, {"loss": 1.7181, "grad_norm": 0.3435456156730652, "learning_rate": 0.0002, "epoch": 0.48911222780569513, "step": 730}, {"loss": 1.8844, "grad_norm": 0.30380892753601074, "learning_rate": 0.0002, "epoch": 0.49581239530988275, "step": 740}, {"loss": 1.7792, "grad_norm": 0.3555026054382324, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 750}, {"loss": 1.7714, "grad_norm": 0.3019855320453644, "learning_rate": 0.0002, "epoch": 0.509212730318258, "step": 760}, {"loss": 1.7962, "grad_norm": 0.309111088514328, "learning_rate": 0.0002, "epoch": 0.5159128978224455, "step": 770}, {"loss": 1.7913, "grad_norm": 0.366020530462265, "learning_rate": 0.0002, "epoch": 0.5226130653266332, "step": 780}, {"loss": 1.8008, "grad_norm": 0.3267050087451935, "learning_rate": 0.0002, "epoch": 0.5293132328308208, "step": 790}, {"loss": 1.7397, "grad_norm": 0.34265750646591187, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 800}, {"loss": 1.8251, "grad_norm": 0.313669890165329, "learning_rate": 0.0002, "epoch": 0.542713567839196, "step": 810}, {"loss": 1.8369, "grad_norm": 0.3355236053466797, "learning_rate": 0.0002, "epoch": 0.5494137353433836, "step": 820}, {"loss": 1.7381, "grad_norm": 0.3186608552932739, "learning_rate": 0.0002, "epoch": 0.5561139028475712, "step": 830}, {"loss": 1.8034, "grad_norm": 0.30357518792152405, "learning_rate": 0.0002, "epoch": 0.5628140703517588, "step": 840}, {"loss": 1.769, "grad_norm": 0.3990040123462677, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 850}, {"loss": 1.7482, "grad_norm": 0.34363803267478943, "learning_rate": 0.0002, "epoch": 0.576214405360134, "step": 860}, {"loss": 1.8106, "grad_norm": 0.3757908046245575, "learning_rate": 0.0002, "epoch": 0.5829145728643216, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3359757661819458, "learning_rate": 0.0002, "epoch": 0.5896147403685092, "step": 880}, {"loss": 1.7591, "grad_norm": 0.5555329918861389, "learning_rate": 0.0002, "epoch": 0.5963149078726968, "step": 890}, {"loss": 1.7715, "grad_norm": 0.4046323895454407, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 900}, {"loss": 1.7998, "grad_norm": 0.29834219813346863, "learning_rate": 0.0002, "epoch": 0.609715242881072, "step": 910}, {"loss": 1.7826, "grad_norm": 0.3241238594055176, "learning_rate": 0.0002, "epoch": 0.6164154103852596, "step": 920}, {"loss": 1.8342, "grad_norm": 0.35154739022254944, "learning_rate": 0.0002, "epoch": 0.6231155778894473, "step": 930}, {"loss": 1.8076, "grad_norm": 0.3287706673145294, "learning_rate": 0.0002, "epoch": 0.6298157453936348, "step": 940}, {"loss": 1.8038, "grad_norm": 0.35670626163482666, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 950}, {"loss": 1.869, "grad_norm": 0.6114104986190796, "learning_rate": 0.0002, "epoch": 0.6432160804020101, "step": 960}, {"loss": 1.8297, "grad_norm": 0.3186565041542053, "learning_rate": 0.0002, "epoch": 0.6499162479061976, "step": 970}, {"loss": 1.7539, "grad_norm": 0.27164125442504883, "learning_rate": 0.0002, "epoch": 0.6566164154103853, "step": 980}, {"loss": 1.8339, "grad_norm": 0.34407344460487366, "learning_rate": 0.0002, "epoch": 0.6633165829145728, "step": 990}, {"loss": 1.855, "grad_norm": 0.368415892124176, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 1000}, {"loss": 1.7821, "grad_norm": 0.3306390643119812, "learning_rate": 0.0002, "epoch": 0.6767169179229481, "step": 1010}, {"loss": 1.786, "grad_norm": 0.3198648989200592, "learning_rate": 0.0002, "epoch": 0.6834170854271356, "step": 1020}, {"loss": 1.816, "grad_norm": 0.3092987537384033, "learning_rate": 0.0002, "epoch": 0.6901172529313233, "step": 1030}, {"loss": 1.7689, "grad_norm": 0.3090653419494629, "learning_rate": 0.0002, "epoch": 0.6968174204355109, "step": 1040}, {"loss": 1.7544, "grad_norm": 0.3485880196094513, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 1050}, {"loss": 1.8187, "grad_norm": 0.35782721638679504, "learning_rate": 0.0002, "epoch": 0.7102177554438861, "step": 1060}, {"loss": 1.806, "grad_norm": 0.34256869554519653, "learning_rate": 0.0002, "epoch": 0.7169179229480737, "step": 1070}, {"loss": 1.7873, "grad_norm": 0.30461037158966064, "learning_rate": 0.0002, "epoch": 0.7236180904522613, "step": 1080}, {"loss": 1.7367, "grad_norm": 0.3398691713809967, "learning_rate": 0.0002, "epoch": 0.7303182579564489, "step": 1090}, {"loss": 1.8756, "grad_norm": 0.3180808126926422, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 1100}, {"loss": 1.6988, "grad_norm": 0.34400665760040283, "learning_rate": 0.0002, "epoch": 0.7437185929648241, "step": 1110}, {"loss": 1.7851, "grad_norm": 0.34244877099990845, "learning_rate": 0.0002, "epoch": 0.7504187604690117, "step": 1120}, {"loss": 1.7841, "grad_norm": 0.29946693778038025, "learning_rate": 0.0002, "epoch": 0.7571189279731994, "step": 1130}, {"loss": 1.7456, "grad_norm": 0.37547236680984497, "learning_rate": 0.0002, "epoch": 0.7638190954773869, "step": 1140}, {"loss": 1.8425, "grad_norm": 0.3263005018234253, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 1150}, {"loss": 1.7222, "grad_norm": 0.41363608837127686, "learning_rate": 0.0002, "epoch": 0.7772194304857621, "step": 1160}, {"loss": 1.7836, "grad_norm": 0.36267954111099243, "learning_rate": 0.0002, "epoch": 0.7839195979899497, "step": 1170}, {"loss": 1.9183, "grad_norm": 0.31789499521255493, "learning_rate": 0.0002, "epoch": 0.7906197654941374, "step": 1180}, {"loss": 1.78, "grad_norm": 0.5708149075508118, "learning_rate": 0.0002, "epoch": 0.7973199329983249, "step": 1190}, {"loss": 1.6908, "grad_norm": 0.322099506855011, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 1200}, {"loss": 1.7639, "grad_norm": 0.3419909179210663, "learning_rate": 0.0002, "epoch": 0.8107202680067002, "step": 1210}, {"loss": 1.7428, "grad_norm": 0.36286255717277527, "learning_rate": 0.0002, "epoch": 0.8174204355108877, "step": 1220}, {"loss": 1.8409, "grad_norm": 0.33992862701416016, "learning_rate": 0.0002, "epoch": 0.8241206030150754, "step": 1230}, {"loss": 1.7507, "grad_norm": 0.32622793316841125, "learning_rate": 0.0002, "epoch": 0.830820770519263, "step": 1240}, {"loss": 1.8098, "grad_norm": 0.3036167621612549, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1250}, {"loss": 1.8094, "grad_norm": 0.3182215392589569, "learning_rate": 0.0002, "epoch": 0.8442211055276382, "step": 1260}, {"loss": 1.8017, "grad_norm": 0.3270018696784973, "learning_rate": 0.0002, "epoch": 0.8509212730318257, "step": 1270}, {"loss": 1.798, "grad_norm": 0.32652342319488525, "learning_rate": 0.0002, "epoch": 0.8576214405360134, "step": 1280}, {"loss": 1.7448, "grad_norm": 0.3631329834461212, "learning_rate": 0.0002, "epoch": 0.864321608040201, "step": 1290}, {"loss": 1.7, "grad_norm": 0.36706018447875977, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1300}, {"loss": 1.8178, "grad_norm": 0.3347418010234833, "learning_rate": 0.0002, "epoch": 0.8777219430485762, "step": 1310}, {"loss": 1.7824, "grad_norm": 0.34371060132980347, "learning_rate": 0.0002, "epoch": 0.8844221105527639, "step": 1320}, {"loss": 1.783, "grad_norm": 0.3029090166091919, "learning_rate": 0.0002, "epoch": 0.8911222780569514, "step": 1330}, {"loss": 1.8017, "grad_norm": 0.34700682759284973, "learning_rate": 0.0002, "epoch": 0.897822445561139, "step": 1340}, {"loss": 1.7998, "grad_norm": 0.35574328899383545, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.30928221344947815, "learning_rate": 0.0002, "epoch": 0.9112227805695142, "step": 1360}, {"loss": 1.7479, "grad_norm": 0.30652928352355957, "learning_rate": 0.0002, "epoch": 0.9179229480737019, "step": 1370}, {"loss": 1.7491, "grad_norm": 0.3838157653808594, "learning_rate": 0.0002, "epoch": 0.9246231155778895, "step": 1380}, {"loss": 1.7977, "grad_norm": 0.31655240058898926, "learning_rate": 0.0002, "epoch": 0.931323283082077, "step": 1390}, {"loss": 1.8175, "grad_norm": 0.41737303137779236, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1400}, {"loss": 1.6811, "grad_norm": 0.3227267861366272, "learning_rate": 0.0002, "epoch": 0.9447236180904522, "step": 1410}, {"loss": 1.7343, "grad_norm": 0.3729925751686096, "learning_rate": 0.0002, "epoch": 0.9514237855946399, "step": 1420}, {"loss": 1.8221, "grad_norm": 0.30779409408569336, "learning_rate": 0.0002, "epoch": 0.9581239530988275, "step": 1430}, {"loss": 1.7972, "grad_norm": 0.334379643201828, "learning_rate": 0.0002, "epoch": 0.964824120603015, "step": 1440}, {"loss": 1.7141, "grad_norm": 0.3568236231803894, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1450}, {"loss": 1.7541, "grad_norm": 0.33310577273368835, "learning_rate": 0.0002, "epoch": 0.9782244556113903, "step": 1460}, {"loss": 1.8511, "grad_norm": 0.2972261905670166, "learning_rate": 0.0002, "epoch": 0.9849246231155779, "step": 1470}, {"loss": 1.7654, "grad_norm": 0.3322717845439911, "learning_rate": 0.0002, "epoch": 0.9916247906197655, "step": 1480}, {"loss": 1.8033, "grad_norm": 0.3276330828666687, "learning_rate": 0.0002, "epoch": 0.998324958123953, "step": 1490}, {"eval_loss": 1.8036354780197144, "eval_runtime": 37.8949, "eval_samples_per_second": 13.59, "eval_steps_per_second": 1.715, "epoch": 0.9996649916247906, "step": 1492}, {"loss": 1.7138, "grad_norm": 0.29252371191978455, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1500}, {"loss": 1.8198, "grad_norm": 0.31607162952423096, "learning_rate": 0.0002, "epoch": 1.0117252931323284, "step": 1510}, {"loss": 1.6779, "grad_norm": 0.32294467091560364, "learning_rate": 0.0002, "epoch": 1.018425460636516, "step": 1520}, {"loss": 1.7919, "grad_norm": 0.3868017792701721, "learning_rate": 0.0002, "epoch": 1.0251256281407035, "step": 1530}, {"loss": 1.7954, "grad_norm": 0.3178282082080841, "learning_rate": 0.0002, "epoch": 1.031825795644891, "step": 1540}, {"loss": 1.7136, "grad_norm": 0.3706750273704529, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1550}, {"loss": 1.7382, "grad_norm": 0.33930912613868713, "learning_rate": 0.0002, "epoch": 1.0452261306532664, "step": 1560}, {"loss": 1.7602, "grad_norm": 0.33970504999160767, "learning_rate": 0.0002, "epoch": 1.051926298157454, "step": 1570}, {"loss": 1.6573, "grad_norm": 0.42553383111953735, "learning_rate": 0.0002, "epoch": 1.0586264656616415, "step": 1580}, {"loss": 1.645, "grad_norm": 0.3772421181201935, "learning_rate": 0.0002, "epoch": 1.065326633165829, "step": 1590}, {"loss": 1.7362, "grad_norm": 0.34212902188301086, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1600}, {"loss": 1.7057, "grad_norm": 0.3798283338546753, "learning_rate": 0.0002, "epoch": 1.0787269681742044, "step": 1610}, {"loss": 1.7468, "grad_norm": 0.36909598112106323, "learning_rate": 0.0002, "epoch": 1.085427135678392, "step": 1620}, {"loss": 1.7807, "grad_norm": 0.3344230651855469, "learning_rate": 0.0002, "epoch": 1.0921273031825796, "step": 1630}, {"loss": 1.7111, "grad_norm": 0.3862569332122803, "learning_rate": 0.0002, "epoch": 1.0988274706867671, "step": 1640}, {"loss": 1.7163, "grad_norm": 0.31188511848449707, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1650}, {"loss": 1.7263, "grad_norm": 0.3563670814037323, "learning_rate": 0.0002, "epoch": 1.1122278056951425, "step": 1660}, {"loss": 1.7718, "grad_norm": 0.35052165389060974, "learning_rate": 0.0002, "epoch": 1.11892797319933, "step": 1670}, {"loss": 1.7601, "grad_norm": 0.3285699188709259, "learning_rate": 0.0002, "epoch": 1.1256281407035176, "step": 1680}, {"loss": 1.6877, "grad_norm": 0.3639393746852875, "learning_rate": 0.0002, "epoch": 1.1323283082077051, "step": 1690}, {"loss": 1.7719, "grad_norm": 0.3842753767967224, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1700}, {"loss": 1.7002, "grad_norm": 0.3624933063983917, "learning_rate": 0.0002, "epoch": 1.1457286432160805, "step": 1710}, {"loss": 1.7243, "grad_norm": 0.3641220033168793, "learning_rate": 0.0002, "epoch": 1.152428810720268, "step": 1720}, {"loss": 1.752, "grad_norm": 0.32765355706214905, "learning_rate": 0.0002, "epoch": 1.1591289782244556, "step": 1730}, {"loss": 1.6556, "grad_norm": 0.34974896907806396, "learning_rate": 0.0002, "epoch": 1.1658291457286432, "step": 1740}, {"loss": 1.7273, "grad_norm": 0.3910926580429077, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1750}, {"loss": 1.7204, "grad_norm": 0.3564300537109375, "learning_rate": 0.0002, "epoch": 1.1792294807370185, "step": 1760}, {"loss": 1.746, "grad_norm": 0.34822574257850647, "learning_rate": 0.0002, "epoch": 1.185929648241206, "step": 1770}, {"loss": 1.7256, "grad_norm": 0.36185044050216675, "learning_rate": 0.0002, "epoch": 1.1926298157453936, "step": 1780}, {"loss": 1.6431, "grad_norm": 0.34866711497306824, "learning_rate": 0.0002, "epoch": 1.1993299832495812, "step": 1790}, {"loss": 1.8084, "grad_norm": 0.4017769992351532, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1800}, {"loss": 1.6985, "grad_norm": 0.32930681109428406, "learning_rate": 0.0002, "epoch": 1.2127303182579565, "step": 1810}, {"loss": 1.7606, "grad_norm": 0.35951921343803406, "learning_rate": 0.0002, "epoch": 1.219430485762144, "step": 1820}, {"loss": 1.6933, "grad_norm": 0.37366992235183716, "learning_rate": 0.0002, "epoch": 1.2261306532663316, "step": 1830}, {"loss": 1.6737, "grad_norm": 0.3565689027309418, "learning_rate": 0.0002, "epoch": 1.2328308207705192, "step": 1840}, {"loss": 1.8013, "grad_norm": 0.3692343533039093, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1850}, {"loss": 1.736, "grad_norm": 0.38426971435546875, "learning_rate": 0.0002, "epoch": 1.2462311557788945, "step": 1860}, {"loss": 1.7031, "grad_norm": 0.33559855818748474, "learning_rate": 0.0002, "epoch": 1.252931323283082, "step": 1870}, {"loss": 1.7033, "grad_norm": 0.34181106090545654, "learning_rate": 0.0002, "epoch": 1.2596314907872697, "step": 1880}, {"loss": 1.7707, "grad_norm": 0.3916318416595459, "learning_rate": 0.0002, "epoch": 1.2663316582914572, "step": 1890}, {"loss": 1.6686, "grad_norm": 0.3887825012207031, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1900}, {"loss": 1.7062, "grad_norm": 0.33583927154541016, "learning_rate": 0.0002, "epoch": 1.2797319932998326, "step": 1910}, {"loss": 1.717, "grad_norm": 0.37639349699020386, "learning_rate": 0.0002, "epoch": 1.2864321608040201, "step": 1920}, {"loss": 1.777, "grad_norm": 0.38059428334236145, "learning_rate": 0.0002, "epoch": 1.2931323283082077, "step": 1930}, {"loss": 1.6126, "grad_norm": 0.37253183126449585, "learning_rate": 0.0002, "epoch": 1.2998324958123952, "step": 1940}, {"loss": 1.6758, "grad_norm": 0.37371566891670227, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1950}, {"loss": 1.6788, "grad_norm": 0.4080910086631775, "learning_rate": 0.0002, "epoch": 1.3132328308207706, "step": 1960}, {"loss": 1.6518, "grad_norm": 0.3174354135990143, "learning_rate": 0.0002, "epoch": 1.3199329983249581, "step": 1970}, {"loss": 1.7925, "grad_norm": 0.4518888294696808, "learning_rate": 0.0002, "epoch": 1.3266331658291457, "step": 1980}, {"loss": 1.7085, "grad_norm": 0.3627921938896179, "learning_rate": 0.0002, "epoch": 1.3333333333333333, "step": 1990}, {"loss": 1.7676, "grad_norm": 0.3655930161476135, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 2000}, {"loss": 1.7016, "grad_norm": 0.3509993255138397, "learning_rate": 0.0002, "epoch": 1.3467336683417086, "step": 2010}, {"loss": 1.7359, "grad_norm": 0.4281129240989685, "learning_rate": 0.0002, "epoch": 1.3534338358458962, "step": 2020}, {"loss": 1.6884, "grad_norm": 0.3821414113044739, "learning_rate": 0.0002, "epoch": 1.3601340033500837, "step": 2030}, {"loss": 1.7075, "grad_norm": 0.3907586336135864, "learning_rate": 0.0002, "epoch": 1.3668341708542713, "step": 2040}, {"loss": 1.7424, "grad_norm": 0.37792932987213135, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 2050}, {"loss": 1.7305, "grad_norm": 0.3693985641002655, "learning_rate": 0.0002, "epoch": 1.3802345058626466, "step": 2060}, {"loss": 1.7434, "grad_norm": 0.32275936007499695, "learning_rate": 0.0002, "epoch": 1.3869346733668342, "step": 2070}, {"loss": 1.6677, "grad_norm": 0.3789440095424652, "learning_rate": 0.0002, "epoch": 1.3936348408710217, "step": 2080}, {"loss": 1.6825, "grad_norm": 0.3638380467891693, "learning_rate": 0.0002, "epoch": 1.4003350083752093, "step": 2090}, {"loss": 1.6542, "grad_norm": 0.3495481610298157, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 2100}, {"loss": 1.7225, "grad_norm": 0.37920597195625305, "learning_rate": 0.0002, "epoch": 1.4137353433835846, "step": 2110}, {"loss": 1.7329, "grad_norm": 0.37218064069747925, "learning_rate": 0.0002, "epoch": 1.4204355108877722, "step": 2120}, {"loss": 1.799, "grad_norm": 0.38074082136154175, "learning_rate": 0.0002, "epoch": 1.4271356783919598, "step": 2130}, {"loss": 1.7403, "grad_norm": 0.3455527126789093, "learning_rate": 0.0002, "epoch": 1.4338358458961473, "step": 2140}, {"loss": 1.776, "grad_norm": 0.3712003529071808, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 2150}, {"loss": 1.7619, "grad_norm": 0.3786754906177521, "learning_rate": 0.0002, "epoch": 1.4472361809045227, "step": 2160}, {"loss": 1.68, "grad_norm": 0.3879223167896271, "learning_rate": 0.0002, "epoch": 1.4539363484087102, "step": 2170}, {"loss": 1.7, "grad_norm": 0.38738805055618286, "learning_rate": 0.0002, "epoch": 1.4606365159128978, "step": 2180}, {"loss": 1.7581, "grad_norm": 0.39768800139427185, "learning_rate": 0.0002, "epoch": 1.4673366834170856, "step": 2190}, {"loss": 1.7671, "grad_norm": 0.4172441065311432, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 2200}, {"loss": 1.6736, "grad_norm": 0.4043174982070923, "learning_rate": 0.0002, "epoch": 1.4807370184254607, "step": 2210}, {"loss": 1.7444, "grad_norm": 0.3750883936882019, "learning_rate": 0.0002, "epoch": 1.4874371859296482, "step": 2220}, {"loss": 1.6861, "grad_norm": 0.3552253246307373, "learning_rate": 0.0002, "epoch": 1.4941373534338358, "step": 2230}, {"loss": 1.6471, "grad_norm": 0.34607139229774475, "learning_rate": 0.0002, "epoch": 1.5008375209380236, "step": 2240}, {"loss": 1.6962, "grad_norm": 0.3406706750392914, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 2250}, {"loss": 1.7694, "grad_norm": 0.36654895544052124, "learning_rate": 0.0002, "epoch": 1.5142378559463987, "step": 2260}, {"loss": 1.6812, "grad_norm": 0.3914054334163666, "learning_rate": 0.0002, "epoch": 1.5209380234505863, "step": 2270}, {"loss": 1.6822, "grad_norm": 0.42012137174606323, "learning_rate": 0.0002, "epoch": 1.5276381909547738, "step": 2280}, {"loss": 1.697, "grad_norm": 0.39563435316085815, "learning_rate": 0.0002, "epoch": 1.5343383584589616, "step": 2290}, {"loss": 1.7491, "grad_norm": 0.3508438766002655, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 2300}, {"loss": 1.7727, "grad_norm": 0.3785218596458435, "learning_rate": 0.0002, "epoch": 1.5477386934673367, "step": 2310}, {"loss": 1.6963, "grad_norm": 0.39377647638320923, "learning_rate": 0.0002, "epoch": 1.5544388609715243, "step": 2320}, {"loss": 1.7263, "grad_norm": 0.3391438126564026, "learning_rate": 0.0002, "epoch": 1.5611390284757118, "step": 2330}, {"loss": 1.7722, "grad_norm": 0.37944263219833374, "learning_rate": 0.0002, "epoch": 1.5678391959798996, "step": 2340}, {"loss": 1.6371, "grad_norm": 0.3523491322994232, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 2350}, {"loss": 1.7583, "grad_norm": 0.3911575973033905, "learning_rate": 0.0002, "epoch": 1.5812395309882747, "step": 2360}, {"loss": 1.7117, "grad_norm": 0.33832186460494995, "learning_rate": 0.0002, "epoch": 1.5879396984924623, "step": 2370}, {"loss": 1.7701, "grad_norm": 0.3665979206562042, "learning_rate": 0.0002, "epoch": 1.5946398659966499, "step": 2380}, {"loss": 1.779, "grad_norm": 0.3871748149394989, "learning_rate": 0.0002, "epoch": 1.6013400335008376, "step": 2390}, {"loss": 1.7109, "grad_norm": 0.3586967885494232, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 2400}, {"loss": 1.7096, "grad_norm": 0.3563673198223114, "learning_rate": 0.0002, "epoch": 1.6147403685092128, "step": 2410}, {"loss": 1.745, "grad_norm": 0.37588971853256226, "learning_rate": 0.0002, "epoch": 1.6214405360134003, "step": 2420}, {"loss": 1.7086, "grad_norm": 0.352556437253952, "learning_rate": 0.0002, "epoch": 1.6281407035175879, "step": 2430}, {"loss": 1.6547, "grad_norm": 0.3716259300708771, "learning_rate": 0.0002, "epoch": 1.6348408710217757, "step": 2440}, {"loss": 1.7033, "grad_norm": 0.372001975774765, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 2450}, {"loss": 1.6584, "grad_norm": 0.3430042862892151, "learning_rate": 0.0002, "epoch": 1.6482412060301508, "step": 2460}, {"loss": 1.7217, "grad_norm": 0.3741483688354492, "learning_rate": 0.0002, "epoch": 1.6549413735343383, "step": 2470}, {"loss": 1.7701, "grad_norm": 0.3610571324825287, "learning_rate": 0.0002, "epoch": 1.661641541038526, "step": 2480}, {"loss": 1.7057, "grad_norm": 0.4204719066619873, "learning_rate": 0.0002, "epoch": 1.6683417085427137, "step": 2490}, {"loss": 1.7954, "grad_norm": 0.3938186466693878, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2500}, {"loss": 1.6633, "grad_norm": 0.3421435058116913, "learning_rate": 0.0002, "epoch": 1.6817420435510888, "step": 2510}, {"loss": 1.7996, "grad_norm": 0.42441412806510925, "learning_rate": 0.0002, "epoch": 1.6884422110552764, "step": 2520}, {"loss": 1.7142, "grad_norm": 0.38071519136428833, "learning_rate": 0.0002, "epoch": 1.695142378559464, "step": 2530}, {"loss": 1.7232, "grad_norm": 0.34078919887542725, "learning_rate": 0.0002, "epoch": 1.7018425460636517, "step": 2540}, {"loss": 1.7126, "grad_norm": 0.412844181060791, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2550}, {"loss": 1.7149, "grad_norm": 0.3753604292869568, "learning_rate": 0.0002, "epoch": 1.7152428810720268, "step": 2560}, {"loss": 1.7011, "grad_norm": 0.41588476300239563, "learning_rate": 0.0002, "epoch": 1.7219430485762144, "step": 2570}, {"loss": 1.6427, "grad_norm": 0.35504111647605896, "learning_rate": 0.0002, "epoch": 1.728643216080402, "step": 2580}, {"loss": 1.7296, "grad_norm": 0.36909720301628113, "learning_rate": 0.0002, "epoch": 1.7353433835845897, "step": 2590}, {"loss": 1.7022, "grad_norm": 0.4149979054927826, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2600}, {"loss": 1.77, "grad_norm": 0.38859328627586365, "learning_rate": 0.0002, "epoch": 1.7487437185929648, "step": 2610}, {"loss": 1.7036, "grad_norm": 0.36738792061805725, "learning_rate": 0.0002, "epoch": 1.7554438860971524, "step": 2620}, {"loss": 1.764, "grad_norm": 0.3968178927898407, "learning_rate": 0.0002, "epoch": 1.76214405360134, "step": 2630}, {"loss": 1.7687, "grad_norm": 0.3972901999950409, "learning_rate": 0.0002, "epoch": 1.7688442211055277, "step": 2640}, {"loss": 1.6774, "grad_norm": 0.3949959874153137, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2650}, {"loss": 1.7247, "grad_norm": 0.44074657559394836, "learning_rate": 0.0002, "epoch": 1.7822445561139029, "step": 2660}, {"loss": 1.7188, "grad_norm": 0.39743664860725403, "learning_rate": 0.0002, "epoch": 1.7889447236180904, "step": 2670}, {"loss": 1.7258, "grad_norm": 0.3950406610965729, "learning_rate": 0.0002, "epoch": 1.795644891122278, "step": 2680}, {"loss": 1.6906, "grad_norm": 0.3568263649940491, "learning_rate": 0.0002, "epoch": 1.8023450586264658, "step": 2690}, {"loss": 1.6735, "grad_norm": 0.3819476366043091, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2700}, {"loss": 1.7198, "grad_norm": 0.3480634391307831, "learning_rate": 0.0002, "epoch": 1.8157453936348409, "step": 2710}, {"loss": 1.7042, "grad_norm": 0.3875853419303894, "learning_rate": 0.0002, "epoch": 1.8224455611390284, "step": 2720}, {"loss": 1.6988, "grad_norm": 0.3441337049007416, "learning_rate": 0.0002, "epoch": 1.829145728643216, "step": 2730}, {"loss": 1.7647, "grad_norm": 0.35692882537841797, "learning_rate": 0.0002, "epoch": 1.8358458961474038, "step": 2740}, {"loss": 1.7033, "grad_norm": 0.36959215998649597, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2750}, {"loss": 1.7657, "grad_norm": 0.3893393278121948, "learning_rate": 0.0002, "epoch": 1.849246231155779, "step": 2760}, {"loss": 1.7068, "grad_norm": 0.37817293405532837, "learning_rate": 0.0002, "epoch": 1.8559463986599665, "step": 2770}, {"loss": 1.761, "grad_norm": 0.36071285605430603, "learning_rate": 0.0002, "epoch": 1.862646566164154, "step": 2780}, {"loss": 1.7623, "grad_norm": 0.3758420944213867, "learning_rate": 0.0002, "epoch": 1.8693467336683418, "step": 2790}, {"loss": 1.6743, "grad_norm": 0.3889938294887543, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2800}, {"loss": 1.6151, "grad_norm": 0.34361857175827026, "learning_rate": 0.0002, "epoch": 1.882747068676717, "step": 2810}, {"loss": 1.6038, "grad_norm": 0.39283323287963867, "learning_rate": 0.0002, "epoch": 1.8894472361809045, "step": 2820}, {"loss": 1.7555, "grad_norm": 0.3919452726840973, "learning_rate": 0.0002, "epoch": 1.896147403685092, "step": 2830}, {"loss": 1.673, "grad_norm": 0.38215070962905884, "learning_rate": 0.0002, "epoch": 1.9028475711892798, "step": 2840}, {"loss": 1.7044, "grad_norm": 0.4235064387321472, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2850}, {"loss": 1.7123, "grad_norm": 0.35694634914398193, "learning_rate": 0.0002, "epoch": 1.916247906197655, "step": 2860}, {"loss": 1.8128, "grad_norm": 0.383492112159729, "learning_rate": 0.0002, "epoch": 1.9229480737018425, "step": 2870}, {"loss": 1.7581, "grad_norm": 0.5945147275924683, "learning_rate": 0.0002, "epoch": 1.92964824120603, "step": 2880}, {"loss": 1.7421, "grad_norm": 0.3367522358894348, "learning_rate": 0.0002, "epoch": 1.9363484087102178, "step": 2890}, {"loss": 1.6561, "grad_norm": 0.35300394892692566, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2900}, {"loss": 1.7033, "grad_norm": 0.38084495067596436, "learning_rate": 0.0002, "epoch": 1.949748743718593, "step": 2910}, {"loss": 1.7132, "grad_norm": 0.37559160590171814, "learning_rate": 0.0002, "epoch": 1.9564489112227805, "step": 2920}, {"loss": 1.6759, "grad_norm": 0.3661738336086273, "learning_rate": 0.0002, "epoch": 1.963149078726968, "step": 2930}, {"loss": 1.7643, "grad_norm": 0.4073849320411682, "learning_rate": 0.0002, "epoch": 1.9698492462311559, "step": 2940}, {"loss": 1.6806, "grad_norm": 0.3723304271697998, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2950}, {"loss": 1.7611, "grad_norm": 0.3991098999977112, "learning_rate": 0.0002, "epoch": 1.983249581239531, "step": 2960}, {"loss": 1.7263, "grad_norm": 0.3947085440158844, "learning_rate": 0.0002, "epoch": 1.9899497487437185, "step": 2970}, {"loss": 1.7217, "grad_norm": 0.3786258399486542, "learning_rate": 0.0002, "epoch": 1.996649916247906, "step": 2980}, {"eval_loss": 1.8028968572616577, "eval_runtime": 37.8985, "eval_samples_per_second": 13.589, "eval_steps_per_second": 1.715, "epoch": 2.0, "step": 2985}, {"loss": 1.695, "grad_norm": 0.34824079275131226, "learning_rate": 0.0002, "epoch": 2.003350083752094, "step": 2990}, {"loss": 1.5853, "grad_norm": 0.3394894003868103, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 3000}, {"loss": 1.5783, "grad_norm": 0.36910977959632874, "learning_rate": 0.0002, "epoch": 2.016750418760469, "step": 3010}, {"loss": 1.6105, "grad_norm": 0.45000967383384705, "learning_rate": 0.0002, "epoch": 2.023450586264657, "step": 3020}, {"loss": 1.6019, "grad_norm": 0.3791407346725464, "learning_rate": 0.0002, "epoch": 2.030150753768844, "step": 3030}, {"loss": 1.5832, "grad_norm": 0.387321799993515, "learning_rate": 0.0002, "epoch": 2.036850921273032, "step": 3040}, {"loss": 1.6834, "grad_norm": 0.4185757040977478, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 3050}, {"loss": 1.5696, "grad_norm": 0.45110777020454407, "learning_rate": 0.0002, "epoch": 2.050251256281407, "step": 3060}, {"loss": 1.6231, "grad_norm": 0.42663660645484924, "learning_rate": 0.0002, "epoch": 2.056951423785595, "step": 3070}, {"loss": 1.6279, "grad_norm": 0.4546292722225189, "learning_rate": 0.0002, "epoch": 2.063651591289782, "step": 3080}, {"loss": 1.6141, "grad_norm": 0.3979759216308594, "learning_rate": 0.0002, "epoch": 2.07035175879397, "step": 3090}, {"loss": 1.6343, "grad_norm": 0.43596673011779785, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 3100}, {"loss": 1.5441, "grad_norm": 0.40120232105255127, "learning_rate": 0.0002, "epoch": 2.083752093802345, "step": 3110}, {"loss": 1.6309, "grad_norm": 0.44449281692504883, "learning_rate": 0.0002, "epoch": 2.090452261306533, "step": 3120}, {"loss": 1.5652, "grad_norm": 0.42672568559646606, "learning_rate": 0.0002, "epoch": 2.09715242881072, "step": 3130}, {"loss": 1.682, "grad_norm": 0.4232690930366516, "learning_rate": 0.0002, "epoch": 2.103852596314908, "step": 3140}, {"loss": 1.624, "grad_norm": 0.4299317002296448, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 3150}, {"loss": 1.6766, "grad_norm": 0.4067758023738861, "learning_rate": 0.0002, "epoch": 2.117252931323283, "step": 3160}, {"loss": 1.6759, "grad_norm": 0.4918815791606903, "learning_rate": 0.0002, "epoch": 2.123953098827471, "step": 3170}, {"loss": 1.6478, "grad_norm": 0.4140559732913971, "learning_rate": 0.0002, "epoch": 2.130653266331658, "step": 3180}, {"loss": 1.6641, "grad_norm": 0.4555995464324951, "learning_rate": 0.0002, "epoch": 2.137353433835846, "step": 3190}, {"loss": 1.5888, "grad_norm": 0.42943915724754333, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 3200}, {"loss": 1.5886, "grad_norm": 0.4730435013771057, "learning_rate": 0.0002, "epoch": 2.150753768844221, "step": 3210}, {"loss": 1.6022, "grad_norm": 0.43310216069221497, "learning_rate": 0.0002, "epoch": 2.157453936348409, "step": 3220}, {"loss": 1.6058, "grad_norm": 0.42054110765457153, "learning_rate": 0.0002, "epoch": 2.164154103852596, "step": 3230}, {"loss": 1.6749, "grad_norm": 0.4897233247756958, "learning_rate": 0.0002, "epoch": 2.170854271356784, "step": 3240}, {"loss": 1.6983, "grad_norm": 0.42194533348083496, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 3250}, {"loss": 1.6159, "grad_norm": 0.44494450092315674, "learning_rate": 0.0002, "epoch": 2.184254606365159, "step": 3260}, {"loss": 1.6977, "grad_norm": 0.43524879217147827, "learning_rate": 0.0002, "epoch": 2.190954773869347, "step": 3270}, {"loss": 1.528, "grad_norm": 0.4621117413043976, "learning_rate": 0.0002, "epoch": 2.1976549413735342, "step": 3280}, {"loss": 1.632, "grad_norm": 0.4073285460472107, "learning_rate": 0.0002, "epoch": 2.204355108877722, "step": 3290}, {"loss": 1.6141, "grad_norm": 0.47868335247039795, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 3300}, {"loss": 1.6857, "grad_norm": 0.4264970123767853, "learning_rate": 0.0002, "epoch": 2.217755443886097, "step": 3310}, {"loss": 1.5653, "grad_norm": 0.4491245150566101, "learning_rate": 0.0002, "epoch": 2.224455611390285, "step": 3320}, {"loss": 1.5881, "grad_norm": 0.4010344445705414, "learning_rate": 0.0002, "epoch": 2.2311557788944723, "step": 3330}, {"loss": 1.6684, "grad_norm": 0.4232759177684784, "learning_rate": 0.0002, "epoch": 2.23785594639866, "step": 3340}, {"loss": 1.6336, "grad_norm": 0.5099776983261108, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 3350}, {"loss": 1.6764, "grad_norm": 0.5223407745361328, "learning_rate": 0.0002, "epoch": 2.251256281407035, "step": 3360}, {"loss": 1.6625, "grad_norm": 0.47818470001220703, "learning_rate": 0.0002, "epoch": 2.257956448911223, "step": 3370}, {"loss": 1.5946, "grad_norm": 0.4721255898475647, "learning_rate": 0.0002, "epoch": 2.2646566164154103, "step": 3380}, {"loss": 1.5568, "grad_norm": 0.4113229513168335, "learning_rate": 0.0002, "epoch": 2.271356783919598, "step": 3390}, {"loss": 1.6494, "grad_norm": 0.507080078125, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 3400}, {"loss": 1.6183, "grad_norm": 0.4852292239665985, "learning_rate": 0.0002, "epoch": 2.284757118927973, "step": 3410}, {"loss": 1.6132, "grad_norm": 0.4503684341907501, "learning_rate": 0.0002, "epoch": 2.291457286432161, "step": 3420}, {"loss": 1.6649, "grad_norm": 0.8359600305557251, "learning_rate": 0.0002, "epoch": 2.2981574539363483, "step": 3430}, {"loss": 1.6644, "grad_norm": 0.44604045152664185, "learning_rate": 0.0002, "epoch": 2.304857621440536, "step": 3440}, {"loss": 1.5972, "grad_norm": 0.45667049288749695, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 3450}, {"loss": 1.6667, "grad_norm": 0.4879349172115326, "learning_rate": 0.0002, "epoch": 2.318257956448911, "step": 3460}, {"loss": 1.5804, "grad_norm": 0.4033963084220886, "learning_rate": 0.0002, "epoch": 2.324958123953099, "step": 3470}, {"loss": 1.5838, "grad_norm": 0.44494301080703735, "learning_rate": 0.0002, "epoch": 2.3316582914572863, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4794621765613556, "learning_rate": 0.0002, "epoch": 2.338358458961474, "step": 3490}, {"loss": 1.6807, "grad_norm": 0.41404327750205994, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 3500}, {"loss": 1.714, "grad_norm": 0.4664851725101471, "learning_rate": 0.0002, "epoch": 2.351758793969849, "step": 3510}, {"loss": 1.6537, "grad_norm": 0.4263697564601898, "learning_rate": 0.0002, "epoch": 2.358458961474037, "step": 3520}, {"loss": 1.6551, "grad_norm": 0.5035167336463928, "learning_rate": 0.0002, "epoch": 2.3651591289782243, "step": 3530}, {"loss": 1.6208, "grad_norm": 0.4380664527416229, "learning_rate": 0.0002, "epoch": 2.371859296482412, "step": 3540}, {"loss": 1.634, "grad_norm": 0.5227681994438171, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 3550}, {"loss": 1.6146, "grad_norm": 0.4382302761077881, "learning_rate": 0.0002, "epoch": 2.3852596314907872, "step": 3560}, {"loss": 1.5653, "grad_norm": 0.4392451047897339, "learning_rate": 0.0002, "epoch": 2.391959798994975, "step": 3570}, {"loss": 1.6626, "grad_norm": 0.4372786581516266, "learning_rate": 0.0002, "epoch": 2.3986599664991624, "step": 3580}, {"loss": 1.519, "grad_norm": 0.5015502572059631, "learning_rate": 0.0002, "epoch": 2.40536013400335, "step": 3590}, {"loss": 1.588, "grad_norm": 0.5653210878372192, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 3600}, {"loss": 1.6075, "grad_norm": 0.53007972240448, "learning_rate": 0.0002, "epoch": 2.4187604690117253, "step": 3610}, {"loss": 1.6421, "grad_norm": 0.4659176766872406, "learning_rate": 0.0002, "epoch": 2.425460636515913, "step": 3620}, {"loss": 1.625, "grad_norm": 0.5637837052345276, "learning_rate": 0.0002, "epoch": 2.4321608040201004, "step": 3630}, {"loss": 1.6168, "grad_norm": 0.4248391389846802, "learning_rate": 0.0002, "epoch": 2.438860971524288, "step": 3640}, {"loss": 1.6822, "grad_norm": 0.44668248295783997, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 3650}, {"loss": 1.6417, "grad_norm": 0.43990179896354675, "learning_rate": 0.0002, "epoch": 2.4522613065326633, "step": 3660}, {"loss": 1.6723, "grad_norm": 0.4532523453235626, "learning_rate": 0.0002, "epoch": 2.458961474036851, "step": 3670}, {"loss": 1.6957, "grad_norm": 0.6605591773986816, "learning_rate": 0.0002, "epoch": 2.4656616415410384, "step": 3680}, {"loss": 1.6159, "grad_norm": 0.4694533348083496, "learning_rate": 0.0002, "epoch": 2.472361809045226, "step": 3690}, {"loss": 1.6239, "grad_norm": 0.4485011100769043, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 3700}, {"loss": 1.6834, "grad_norm": 0.4761785864830017, "learning_rate": 0.0002, "epoch": 2.4857621440536013, "step": 3710}, {"loss": 1.6313, "grad_norm": 0.5116432309150696, "learning_rate": 0.0002, "epoch": 2.492462311557789, "step": 3720}, {"loss": 1.5054, "grad_norm": 0.49523618817329407, "learning_rate": 0.0002, "epoch": 2.4991624790619764, "step": 3730}, {"loss": 1.6249, "grad_norm": 0.43826380372047424, "learning_rate": 0.0002, "epoch": 2.505862646566164, "step": 3740}, {"loss": 1.5762, "grad_norm": 0.4916154146194458, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3750}, {"loss": 1.5157, "grad_norm": 0.5381299257278442, "learning_rate": 0.0002, "epoch": 2.5192629815745393, "step": 3760}, {"loss": 1.6467, "grad_norm": 0.44947415590286255, "learning_rate": 0.0002, "epoch": 2.525963149078727, "step": 3770}, {"loss": 1.67, "grad_norm": 0.49979084730148315, "learning_rate": 0.0002, "epoch": 2.5326633165829144, "step": 3780}, {"loss": 1.622, "grad_norm": 0.43046900629997253, "learning_rate": 0.0002, "epoch": 2.539363484087102, "step": 3790}, {"loss": 1.6789, "grad_norm": 0.4513470530509949, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3800}, {"loss": 1.6335, "grad_norm": 0.49900051951408386, "learning_rate": 0.0002, "epoch": 2.5527638190954773, "step": 3810}, {"loss": 1.6322, "grad_norm": 0.4348420202732086, "learning_rate": 0.0002, "epoch": 2.559463986599665, "step": 3820}, {"loss": 1.6218, "grad_norm": 0.4684867560863495, "learning_rate": 0.0002, "epoch": 2.5661641541038525, "step": 3830}, {"loss": 1.6535, "grad_norm": 0.44430989027023315, "learning_rate": 0.0002, "epoch": 2.5728643216080402, "step": 3840}, {"loss": 1.5909, "grad_norm": 0.47375255823135376, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3850}, {"loss": 1.6269, "grad_norm": 0.45493075251579285, "learning_rate": 0.0002, "epoch": 2.5862646566164154, "step": 3860}, {"loss": 1.604, "grad_norm": 0.4563275873661041, "learning_rate": 0.0002, "epoch": 2.592964824120603, "step": 3870}, {"loss": 1.642, "grad_norm": 0.46060335636138916, "learning_rate": 0.0002, "epoch": 2.5996649916247905, "step": 3880}, {"loss": 1.6302, "grad_norm": 0.4718867540359497, "learning_rate": 0.0002, "epoch": 2.6063651591289783, "step": 3890}, {"loss": 1.6242, "grad_norm": 0.41570305824279785, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3900}, {"loss": 1.6401, "grad_norm": 0.4603121876716614, "learning_rate": 0.0002, "epoch": 2.6197654941373534, "step": 3910}, {"loss": 1.6839, "grad_norm": 0.4734652638435364, "learning_rate": 0.0002, "epoch": 2.626465661641541, "step": 3920}, {"loss": 1.5448, "grad_norm": 0.45348483324050903, "learning_rate": 0.0002, "epoch": 2.6331658291457285, "step": 3930}, {"loss": 1.6157, "grad_norm": 0.46559447050094604, "learning_rate": 0.0002, "epoch": 2.6398659966499163, "step": 3940}, {"loss": 1.7052, "grad_norm": 0.44113144278526306, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3950}, {"loss": 1.6315, "grad_norm": 0.41415104269981384, "learning_rate": 0.0002, "epoch": 2.6532663316582914, "step": 3960}, {"loss": 1.6589, "grad_norm": 0.48868080973625183, "learning_rate": 0.0002, "epoch": 2.659966499162479, "step": 3970}, {"loss": 1.6211, "grad_norm": 0.49610549211502075, "learning_rate": 0.0002, "epoch": 2.6666666666666665, "step": 3980}, {"loss": 1.6235, "grad_norm": 0.4309130907058716, "learning_rate": 0.0002, "epoch": 2.6733668341708543, "step": 3990}, {"loss": 1.6452, "grad_norm": 0.4489327669143677, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 4000}, {"loss": 1.5744, "grad_norm": 0.5380139946937561, "learning_rate": 0.0002, "epoch": 2.6867671691792294, "step": 4010}, {"loss": 1.6524, "grad_norm": 0.5076672434806824, "learning_rate": 0.0002, "epoch": 2.693467336683417, "step": 4020}, {"loss": 1.636, "grad_norm": 0.47620031237602234, "learning_rate": 0.0002, "epoch": 2.7001675041876045, "step": 4030}, {"loss": 1.5543, "grad_norm": 0.48089155554771423, "learning_rate": 0.0002, "epoch": 2.7068676716917923, "step": 4040}, {"loss": 1.6396, "grad_norm": 0.5108814239501953, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 4050}, {"loss": 1.5905, "grad_norm": 0.4196513295173645, "learning_rate": 0.0002, "epoch": 2.7202680067001674, "step": 4060}, {"loss": 1.686, "grad_norm": 0.4574664831161499, "learning_rate": 0.0002, "epoch": 2.726968174204355, "step": 4070}, {"loss": 1.6234, "grad_norm": 0.4671640992164612, "learning_rate": 0.0002, "epoch": 2.7336683417085426, "step": 4080}, {"loss": 1.6827, "grad_norm": 0.49355530738830566, "learning_rate": 0.0002, "epoch": 2.7403685092127303, "step": 4090}, {"loss": 1.6999, "grad_norm": 0.46716663241386414, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 4100}, {"loss": 1.6463, "grad_norm": 0.45420581102371216, "learning_rate": 0.0002, "epoch": 2.7537688442211055, "step": 4110}, {"loss": 1.5718, "grad_norm": 0.4680487811565399, "learning_rate": 0.0002, "epoch": 2.7604690117252932, "step": 4120}, {"loss": 1.5968, "grad_norm": 0.5375032424926758, "learning_rate": 0.0002, "epoch": 2.7671691792294806, "step": 4130}, {"loss": 1.5254, "grad_norm": 0.46026280522346497, "learning_rate": 0.0002, "epoch": 2.7738693467336684, "step": 4140}, {"loss": 1.6613, "grad_norm": 0.43658447265625, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 4150}, {"loss": 1.6546, "grad_norm": 0.4935547113418579, "learning_rate": 0.0002, "epoch": 2.7872696817420435, "step": 4160}, {"loss": 1.5961, "grad_norm": 0.8167962431907654, "learning_rate": 0.0002, "epoch": 2.7939698492462313, "step": 4170}, {"loss": 1.6907, "grad_norm": 0.4289683997631073, "learning_rate": 0.0002, "epoch": 2.8006700167504186, "step": 4180}, {"loss": 1.6385, "grad_norm": 0.4569324254989624, "learning_rate": 0.0002, "epoch": 2.8073701842546064, "step": 4190}, {"loss": 1.6077, "grad_norm": 0.474795937538147, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 4200}, {"loss": 1.6223, "grad_norm": 0.44272229075431824, "learning_rate": 0.0002, "epoch": 2.8207705192629815, "step": 4210}, {"loss": 1.6706, "grad_norm": 0.525240957736969, "learning_rate": 0.0002, "epoch": 2.8274706867671693, "step": 4220}, {"loss": 1.7196, "grad_norm": 0.4802303910255432, "learning_rate": 0.0002, "epoch": 2.8341708542713566, "step": 4230}, {"loss": 1.6002, "grad_norm": 0.46400442719459534, "learning_rate": 0.0002, "epoch": 2.8408710217755444, "step": 4240}, {"loss": 1.6052, "grad_norm": 0.49884888529777527, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 4250}, {"loss": 1.6919, "grad_norm": 0.5015072226524353, "learning_rate": 0.0002, "epoch": 2.8542713567839195, "step": 4260}, {"loss": 1.6335, "grad_norm": 0.4335440695285797, "learning_rate": 0.0002, "epoch": 2.8609715242881073, "step": 4270}, {"loss": 1.5664, "grad_norm": 0.5131644606590271, "learning_rate": 0.0002, "epoch": 2.8676716917922946, "step": 4280}, {"loss": 1.6409, "grad_norm": 0.6977195739746094, "learning_rate": 0.0002, "epoch": 2.8743718592964824, "step": 4290}, {"loss": 1.7192, "grad_norm": 0.5133762955665588, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 4300}, {"loss": 1.6257, "grad_norm": 0.4737614393234253, "learning_rate": 0.0002, "epoch": 2.8877721943048575, "step": 4310}, {"loss": 1.6076, "grad_norm": 0.4580535590648651, "learning_rate": 0.0002, "epoch": 2.8944723618090453, "step": 4320}, {"loss": 1.6538, "grad_norm": 0.43863341212272644, "learning_rate": 0.0002, "epoch": 2.901172529313233, "step": 4330}, {"loss": 1.6091, "grad_norm": 0.4103737473487854, "learning_rate": 0.0002, "epoch": 2.9078726968174204, "step": 4340}, {"loss": 1.7106, "grad_norm": 0.438014417886734, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 4350}, {"loss": 1.6025, "grad_norm": 0.5068213939666748, "learning_rate": 0.0002, "epoch": 2.9212730318257956, "step": 4360}, {"loss": 1.6426, "grad_norm": 0.45305484533309937, "learning_rate": 0.0002, "epoch": 2.9279731993299833, "step": 4370}, {"loss": 1.5726, "grad_norm": 0.4612090289592743, "learning_rate": 0.0002, "epoch": 2.934673366834171, "step": 4380}, {"loss": 1.6536, "grad_norm": 0.508736789226532, "learning_rate": 0.0002, "epoch": 2.9413735343383585, "step": 4390}, {"loss": 1.6132, "grad_norm": 0.4924427270889282, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 4400}, {"loss": 1.7007, "grad_norm": 0.5707460641860962, "learning_rate": 0.0002, "epoch": 2.9547738693467336, "step": 4410}, {"loss": 1.6814, "grad_norm": 0.42270299792289734, "learning_rate": 0.0002, "epoch": 2.9614740368509214, "step": 4420}, {"loss": 1.6644, "grad_norm": 0.4429931044578552, "learning_rate": 0.0002, "epoch": 2.968174204355109, "step": 4430}, {"loss": 1.6251, "grad_norm": 0.49760574102401733, "learning_rate": 0.0002, "epoch": 2.9748743718592965, "step": 4440}, {"loss": 1.6169, "grad_norm": 0.4558229148387909, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 4450}, {"loss": 1.6055, "grad_norm": 0.39848530292510986, "learning_rate": 0.0002, "epoch": 2.9882747068676716, "step": 4460}, {"loss": 1.6705, "grad_norm": 0.5224862098693848, "learning_rate": 0.0002, "epoch": 2.9949748743718594, "step": 4470}, {"eval_loss": 1.8228833675384521, "eval_runtime": 37.9049, "eval_samples_per_second": 13.587, "eval_steps_per_second": 1.715, "epoch": 2.9996649916247904, "step": 4477}, {"loss": 1.6637, "grad_norm": 0.41169142723083496, "learning_rate": 0.0002, "epoch": 3.0016750418760467, "step": 4480}, {"loss": 1.5974, "grad_norm": 0.4865207374095917, "learning_rate": 0.0002, "epoch": 3.0083752093802345, "step": 4490}, {"loss": 1.5297, "grad_norm": 0.5462028384208679, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 4500}, {"loss": 1.5251, "grad_norm": 0.6169732809066772, "learning_rate": 0.0002, "epoch": 3.0217755443886096, "step": 4510}, {"loss": 1.5559, "grad_norm": 0.5667954087257385, "learning_rate": 0.0002, "epoch": 3.0284757118927974, "step": 4520}, {"loss": 1.5037, "grad_norm": 0.5758325457572937, "learning_rate": 0.0002, "epoch": 3.0351758793969847, "step": 4530}, {"loss": 1.4873, "grad_norm": 0.5220064520835876, "learning_rate": 0.0002, "epoch": 3.0418760469011725, "step": 4540}, {"loss": 1.5126, "grad_norm": 0.5469558835029602, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 4550}, {"loss": 1.4275, "grad_norm": 0.5680848956108093, "learning_rate": 0.0002, "epoch": 3.0552763819095476, "step": 4560}, {"loss": 1.5187, "grad_norm": 0.5906574726104736, "learning_rate": 0.0002, "epoch": 3.0619765494137354, "step": 4570}, {"loss": 1.4551, "grad_norm": 0.4725631773471832, "learning_rate": 0.0002, "epoch": 3.0686767169179228, "step": 4580}, {"loss": 1.5083, "grad_norm": 0.5273477435112, "learning_rate": 0.0002, "epoch": 3.0753768844221105, "step": 4590}, {"loss": 1.5154, "grad_norm": 0.5861203074455261, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 4600}, {"loss": 1.4924, "grad_norm": 0.5343965291976929, "learning_rate": 0.0002, "epoch": 3.0887772194304857, "step": 4610}, {"loss": 1.5608, "grad_norm": 0.5348150730133057, "learning_rate": 0.0002, "epoch": 3.0954773869346734, "step": 4620}, {"loss": 1.5399, "grad_norm": 0.5971846580505371, "learning_rate": 0.0002, "epoch": 3.102177554438861, "step": 4630}, {"loss": 1.4662, "grad_norm": 0.5203177332878113, "learning_rate": 0.0002, "epoch": 3.1088777219430486, "step": 4640}, {"loss": 1.5805, "grad_norm": 0.55289226770401, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 4650}, {"loss": 1.4745, "grad_norm": 0.6878530979156494, "learning_rate": 0.0002, "epoch": 3.1222780569514237, "step": 4660}, {"loss": 1.5335, "grad_norm": 0.6173256635665894, "learning_rate": 0.0002, "epoch": 3.1289782244556115, "step": 4670}, {"loss": 1.51, "grad_norm": 0.536796510219574, "learning_rate": 0.0002, "epoch": 3.135678391959799, "step": 4680}, {"loss": 1.4713, "grad_norm": 0.58846116065979, "learning_rate": 0.0002, "epoch": 3.1423785594639866, "step": 4690}, {"loss": 1.5114, "grad_norm": 0.645889401435852, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 4700}, {"loss": 1.4705, "grad_norm": 0.6118691563606262, "learning_rate": 0.0002, "epoch": 3.1557788944723617, "step": 4710}, {"loss": 1.5533, "grad_norm": 0.5189669132232666, "learning_rate": 0.0002, "epoch": 3.1624790619765495, "step": 4720}, {"loss": 1.4769, "grad_norm": 0.5794713497161865, "learning_rate": 0.0002, "epoch": 3.169179229480737, "step": 4730}, {"loss": 1.4849, "grad_norm": 0.6579326391220093, "learning_rate": 0.0002, "epoch": 3.1758793969849246, "step": 4740}, {"loss": 1.545, "grad_norm": 0.5822742581367493, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 4750}, {"loss": 1.4358, "grad_norm": 0.5475956201553345, "learning_rate": 0.0002, "epoch": 3.1892797319932997, "step": 4760}, {"loss": 1.4723, "grad_norm": 0.6743834018707275, "learning_rate": 0.0002, "epoch": 3.1959798994974875, "step": 4770}, {"loss": 1.5161, "grad_norm": 0.6110585927963257, "learning_rate": 0.0002, "epoch": 3.202680067001675, "step": 4780}, {"loss": 1.5455, "grad_norm": 0.5426181554794312, "learning_rate": 0.0002, "epoch": 3.2093802345058626, "step": 4790}, {"loss": 1.5315, "grad_norm": 0.6077824234962463, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 4800}, {"loss": 1.5314, "grad_norm": 0.5785858631134033, "learning_rate": 0.0002, "epoch": 3.2227805695142377, "step": 4810}, {"loss": 1.4041, "grad_norm": 0.6425958275794983, "learning_rate": 0.0002, "epoch": 3.2294807370184255, "step": 4820}, {"loss": 1.4751, "grad_norm": 0.6607080698013306, "learning_rate": 0.0002, "epoch": 3.236180904522613, "step": 4830}, {"loss": 1.5267, "grad_norm": 0.5385788679122925, "learning_rate": 0.0002, "epoch": 3.2428810720268006, "step": 4840}, {"loss": 1.4673, "grad_norm": 0.5630403757095337, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 4850}, {"loss": 1.5257, "grad_norm": 0.6340779662132263, "learning_rate": 0.0002, "epoch": 3.2562814070351758, "step": 4860}, {"loss": 1.5148, "grad_norm": 0.5305342674255371, "learning_rate": 0.0002, "epoch": 3.2629815745393635, "step": 4870}, {"loss": 1.5162, "grad_norm": 0.597670316696167, "learning_rate": 0.0002, "epoch": 3.2696817420435513, "step": 4880}, {"loss": 1.5429, "grad_norm": 0.665553867816925, "learning_rate": 0.0002, "epoch": 3.2763819095477387, "step": 4890}, {"loss": 1.4607, "grad_norm": 0.579767644405365, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 4900}, {"loss": 1.4999, "grad_norm": 0.5512481331825256, "learning_rate": 0.0002, "epoch": 3.289782244556114, "step": 4910}, {"loss": 1.5022, "grad_norm": 0.5916532278060913, "learning_rate": 0.0002, "epoch": 3.2964824120603016, "step": 4920}, {"loss": 1.4889, "grad_norm": 0.7521726489067078, "learning_rate": 0.0002, "epoch": 3.3031825795644894, "step": 4930}, {"loss": 1.4223, "grad_norm": 0.5352797508239746, "learning_rate": 0.0002, "epoch": 3.3098827470686767, "step": 4940}, {"loss": 1.5122, "grad_norm": 0.5950371623039246, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 4950}, {"loss": 1.5072, "grad_norm": 0.8020477890968323, "learning_rate": 0.0002, "epoch": 3.323283082077052, "step": 4960}, {"loss": 1.5422, "grad_norm": 0.6790024638175964, "learning_rate": 0.0002, "epoch": 3.3299832495812396, "step": 4970}, {"loss": 1.5363, "grad_norm": 0.687627375125885, "learning_rate": 0.0002, "epoch": 3.3366834170854274, "step": 4980}, {"loss": 1.5276, "grad_norm": 0.6094385385513306, "learning_rate": 0.0002, "epoch": 3.3433835845896147, "step": 4990}, {"loss": 1.549, "grad_norm": 0.6541242003440857, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 5000}, {"loss": 1.6067, "grad_norm": 0.5560880303382874, "learning_rate": 0.0002, "epoch": 3.35678391959799, "step": 5010}, {"loss": 1.5769, "grad_norm": 0.5440094470977783, "learning_rate": 0.0002, "epoch": 3.3634840871021776, "step": 5020}, {"loss": 1.6183, "grad_norm": 0.5749301314353943, "learning_rate": 0.0002, "epoch": 3.3701842546063654, "step": 5030}, {"loss": 1.4801, "grad_norm": 0.5919716954231262, "learning_rate": 0.0002, "epoch": 3.3768844221105527, "step": 5040}, {"loss": 1.5957, "grad_norm": 0.6331481337547302, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 5050}, {"loss": 1.5188, "grad_norm": 0.5687161684036255, "learning_rate": 0.0002, "epoch": 3.390284757118928, "step": 5060}, {"loss": 1.5702, "grad_norm": 0.6718577742576599, "learning_rate": 0.0002, "epoch": 3.3969849246231156, "step": 5070}, {"loss": 1.5577, "grad_norm": 0.5089324116706848, "learning_rate": 0.0002, "epoch": 3.4036850921273034, "step": 5080}, {"loss": 1.512, "grad_norm": 0.5710174441337585, "learning_rate": 0.0002, "epoch": 3.4103852596314908, "step": 5090}, {"loss": 1.5492, "grad_norm": 0.6670721173286438, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 5100}, {"loss": 1.5227, "grad_norm": 0.6875665187835693, "learning_rate": 0.0002, "epoch": 3.423785594639866, "step": 5110}, {"loss": 1.4496, "grad_norm": 0.5375880599021912, "learning_rate": 0.0002, "epoch": 3.4304857621440537, "step": 5120}, {"loss": 1.5527, "grad_norm": 0.6550399661064148, "learning_rate": 0.0002, "epoch": 3.4371859296482414, "step": 5130}, {"loss": 1.5687, "grad_norm": 0.5948067903518677, "learning_rate": 0.0002, "epoch": 3.4438860971524288, "step": 5140}, {"loss": 1.4813, "grad_norm": 0.6134477257728577, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 5150}, {"loss": 1.5069, "grad_norm": 0.6506398320198059, "learning_rate": 0.0002, "epoch": 3.457286432160804, "step": 5160}, {"loss": 1.4422, "grad_norm": 0.6060147881507874, "learning_rate": 0.0002, "epoch": 3.4639865996649917, "step": 5170}, {"loss": 1.5093, "grad_norm": 0.6173806190490723, "learning_rate": 0.0002, "epoch": 3.4706867671691795, "step": 5180}, {"loss": 1.4975, "grad_norm": 0.6032607555389404, "learning_rate": 0.0002, "epoch": 3.477386934673367, "step": 5190}, {"loss": 1.4979, "grad_norm": 0.5652492046356201, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 5200}, {"loss": 1.4883, "grad_norm": 0.6168607473373413, "learning_rate": 0.0002, "epoch": 3.490787269681742, "step": 5210}, {"loss": 1.5164, "grad_norm": 0.6170629262924194, "learning_rate": 0.0002, "epoch": 3.4974874371859297, "step": 5220}, {"loss": 1.4879, "grad_norm": 0.6926297545433044, "learning_rate": 0.0002, "epoch": 3.5041876046901175, "step": 5230}, {"loss": 1.4982, "grad_norm": 0.6702437996864319, "learning_rate": 0.0002, "epoch": 3.510887772194305, "step": 5240}, {"loss": 1.4986, "grad_norm": 0.5421436429023743, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 5250}, {"loss": 1.4673, "grad_norm": 0.5726765990257263, "learning_rate": 0.0002, "epoch": 3.52428810720268, "step": 5260}, {"loss": 1.5423, "grad_norm": 0.5685455203056335, "learning_rate": 0.0002, "epoch": 3.5309882747068677, "step": 5270}, {"loss": 1.4715, "grad_norm": 0.6018396019935608, "learning_rate": 0.0002, "epoch": 3.5376884422110555, "step": 5280}, {"loss": 1.5451, "grad_norm": 0.5731932520866394, "learning_rate": 0.0002, "epoch": 3.544388609715243, "step": 5290}, {"loss": 1.4752, "grad_norm": 0.6601519584655762, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 5300}, {"loss": 1.5434, "grad_norm": 0.5545530319213867, "learning_rate": 0.0002, "epoch": 3.557788944723618, "step": 5310}, {"loss": 1.5438, "grad_norm": 0.5998541116714478, "learning_rate": 0.0002, "epoch": 3.5644891122278057, "step": 5320}, {"loss": 1.56, "grad_norm": 0.5651767253875732, "learning_rate": 0.0002, "epoch": 3.5711892797319935, "step": 5330}, {"loss": 1.4829, "grad_norm": 0.7425084114074707, "learning_rate": 0.0002, "epoch": 3.577889447236181, "step": 5340}, {"loss": 1.5571, "grad_norm": 0.5770602226257324, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 5350}, {"loss": 1.458, "grad_norm": 0.54723060131073, "learning_rate": 0.0002, "epoch": 3.591289782244556, "step": 5360}, {"loss": 1.497, "grad_norm": 0.6658238172531128, "learning_rate": 0.0002, "epoch": 3.5979899497487438, "step": 5370}, {"loss": 1.5456, "grad_norm": 0.5787645578384399, "learning_rate": 0.0002, "epoch": 3.6046901172529315, "step": 5380}, {"loss": 1.5343, "grad_norm": 0.594913125038147, "learning_rate": 0.0002, "epoch": 3.611390284757119, "step": 5390}, {"loss": 1.4727, "grad_norm": 0.4964977502822876, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 5400}, {"loss": 1.5062, "grad_norm": 0.6087527275085449, "learning_rate": 0.0002, "epoch": 3.624790619765494, "step": 5410}, {"loss": 1.5098, "grad_norm": 0.6315323710441589, "learning_rate": 0.0002, "epoch": 3.6314907872696818, "step": 5420}, {"loss": 1.4855, "grad_norm": 0.574799120426178, "learning_rate": 0.0002, "epoch": 3.6381909547738696, "step": 5430}, {"loss": 1.4595, "grad_norm": 0.5949277877807617, "learning_rate": 0.0002, "epoch": 3.644891122278057, "step": 5440}, {"loss": 1.4816, "grad_norm": 0.5640677213668823, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 5450}, {"loss": 1.525, "grad_norm": 0.6198237538337708, "learning_rate": 0.0002, "epoch": 3.658291457286432, "step": 5460}, {"loss": 1.5676, "grad_norm": 0.6902034878730774, "learning_rate": 0.0002, "epoch": 3.66499162479062, "step": 5470}, {"loss": 1.544, "grad_norm": 0.5686674118041992, "learning_rate": 0.0002, "epoch": 3.6716917922948076, "step": 5480}, {"loss": 1.5255, "grad_norm": 0.6532107591629028, "learning_rate": 0.0002, "epoch": 3.678391959798995, "step": 5490}, {"loss": 1.5767, "grad_norm": 0.5790849924087524, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 5500}, {"loss": 1.5507, "grad_norm": 0.6055065393447876, "learning_rate": 0.0002, "epoch": 3.69179229480737, "step": 5510}, {"loss": 1.4656, "grad_norm": 0.5630605816841125, "learning_rate": 0.0002, "epoch": 3.698492462311558, "step": 5520}, {"loss": 1.537, "grad_norm": 0.6005825996398926, "learning_rate": 0.0002, "epoch": 3.7051926298157456, "step": 5530}, {"loss": 1.5313, "grad_norm": 0.6553038954734802, "learning_rate": 0.0002, "epoch": 3.711892797319933, "step": 5540}, {"loss": 1.4943, "grad_norm": 0.5601094961166382, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 5550}, {"loss": 1.5161, "grad_norm": 0.6598808169364929, "learning_rate": 0.0002, "epoch": 3.725293132328308, "step": 5560}, {"loss": 1.5345, "grad_norm": 0.5506255626678467, "learning_rate": 0.0002, "epoch": 3.731993299832496, "step": 5570}, {"loss": 1.4805, "grad_norm": 0.6001223921775818, "learning_rate": 0.0002, "epoch": 3.7386934673366836, "step": 5580}, {"loss": 1.4652, "grad_norm": 0.6287297606468201, "learning_rate": 0.0002, "epoch": 3.745393634840871, "step": 5590}, {"loss": 1.5246, "grad_norm": 0.6253238916397095, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 5600}, {"loss": 1.5691, "grad_norm": 0.5713174939155579, "learning_rate": 0.0002, "epoch": 3.758793969849246, "step": 5610}, {"loss": 1.5661, "grad_norm": 0.6198310852050781, "learning_rate": 0.0002, "epoch": 3.765494137353434, "step": 5620}, {"loss": 1.5448, "grad_norm": 0.5941224098205566, "learning_rate": 0.0002, "epoch": 3.7721943048576216, "step": 5630}, {"loss": 1.4925, "grad_norm": 0.606002151966095, "learning_rate": 0.0002, "epoch": 3.778894472361809, "step": 5640}, {"loss": 1.5182, "grad_norm": 0.6540704965591431, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 5650}, {"loss": 1.5903, "grad_norm": 0.6147415041923523, "learning_rate": 0.0002, "epoch": 3.792294807370184, "step": 5660}, {"loss": 1.5329, "grad_norm": 0.5649605393409729, "learning_rate": 0.0002, "epoch": 3.798994974874372, "step": 5670}, {"loss": 1.5747, "grad_norm": 0.6788773536682129, "learning_rate": 0.0002, "epoch": 3.8056951423785597, "step": 5680}, {"loss": 1.535, "grad_norm": 0.6581860780715942, "learning_rate": 0.0002, "epoch": 3.812395309882747, "step": 5690}, {"loss": 1.4587, "grad_norm": 0.5529348850250244, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 5700}, {"loss": 1.5594, "grad_norm": 0.6320232152938843, "learning_rate": 0.0002, "epoch": 3.825795644891122, "step": 5710}, {"loss": 1.4696, "grad_norm": 0.6529698371887207, "learning_rate": 0.0002, "epoch": 3.83249581239531, "step": 5720}, {"loss": 1.5854, "grad_norm": 0.5983362793922424, "learning_rate": 0.0002, "epoch": 3.8391959798994977, "step": 5730}, {"loss": 1.465, "grad_norm": 0.6335684061050415, "learning_rate": 0.0002, "epoch": 3.845896147403685, "step": 5740}, {"loss": 1.5545, "grad_norm": 0.700446605682373, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 5750}, {"loss": 1.5707, "grad_norm": 0.6092597842216492, "learning_rate": 0.0002, "epoch": 3.85929648241206, "step": 5760}, {"loss": 1.5729, "grad_norm": 0.564146101474762, "learning_rate": 0.0002, "epoch": 3.865996649916248, "step": 5770}, {"loss": 1.5872, "grad_norm": 0.615275502204895, "learning_rate": 0.0002, "epoch": 3.8726968174204357, "step": 5780}, {"loss": 1.5142, "grad_norm": 0.6685376763343811, "learning_rate": 0.0002, "epoch": 3.879396984924623, "step": 5790}, {"loss": 1.4752, "grad_norm": 0.6116922497749329, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 5800}, {"loss": 1.5179, "grad_norm": 0.5486813187599182, "learning_rate": 0.0002, "epoch": 3.892797319932998, "step": 5810}, {"loss": 1.5167, "grad_norm": 0.6208204030990601, "learning_rate": 0.0002, "epoch": 3.899497487437186, "step": 5820}, {"loss": 1.5334, "grad_norm": 0.6500625014305115, "learning_rate": 0.0002, "epoch": 3.9061976549413737, "step": 5830}, {"loss": 1.4716, "grad_norm": 0.5948089361190796, "learning_rate": 0.0002, "epoch": 3.912897822445561, "step": 5840}, {"loss": 1.6011, "grad_norm": 0.7210732698440552, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 5850}, {"loss": 1.5519, "grad_norm": 0.6662322878837585, "learning_rate": 0.0002, "epoch": 3.926298157453936, "step": 5860}, {"loss": 1.5656, "grad_norm": 0.5613839626312256, "learning_rate": 0.0002, "epoch": 3.932998324958124, "step": 5870}, {"loss": 1.544, "grad_norm": 0.6069002151489258, "learning_rate": 0.0002, "epoch": 3.9396984924623117, "step": 5880}, {"loss": 1.6745, "grad_norm": 0.7075562477111816, "learning_rate": 0.0002, "epoch": 3.946398659966499, "step": 5890}, {"loss": 1.5391, "grad_norm": 0.6316173076629639, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 5900}, {"loss": 1.6314, "grad_norm": 0.5716308355331421, "learning_rate": 0.0002, "epoch": 3.959798994974874, "step": 5910}, {"loss": 1.5947, "grad_norm": 0.6800096035003662, "learning_rate": 0.0002, "epoch": 3.966499162479062, "step": 5920}, {"loss": 1.5189, "grad_norm": 0.6057983040809631, "learning_rate": 0.0002, "epoch": 3.9731993299832498, "step": 5930}, {"loss": 1.5431, "grad_norm": 0.5938987731933594, "learning_rate": 0.0002, "epoch": 3.979899497487437, "step": 5940}, {"loss": 1.5111, "grad_norm": 0.6963576674461365, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 5950}, {"loss": 1.5521, "grad_norm": 0.6279940009117126, "learning_rate": 0.0002, "epoch": 3.993299832495812, "step": 5960}, {"loss": 1.5974, "grad_norm": 0.7161159515380859, "learning_rate": 0.0002, "epoch": 4.0, "step": 5970}, {"eval_loss": 1.8655421733856201, "eval_runtime": 37.9276, "eval_samples_per_second": 13.579, "eval_steps_per_second": 1.714, "epoch": 4.0, "step": 5970}, {"loss": 1.3666, "grad_norm": 0.7380476593971252, "learning_rate": 0.0002, "epoch": 4.006700167504188, "step": 5980}, {"loss": 1.3913, "grad_norm": 0.7148947715759277, "learning_rate": 0.0002, "epoch": 4.013400335008376, "step": 5990}, {"loss": 1.4204, "grad_norm": 0.6177082657814026, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 6000}, {"loss": 1.4421, "grad_norm": 0.8552946448326111, "learning_rate": 0.0002, "epoch": 4.02680067001675, "step": 6010}, {"loss": 1.4342, "grad_norm": 0.8033416271209717, "learning_rate": 0.0002, "epoch": 4.033500837520938, "step": 6020}, {"loss": 1.4092, "grad_norm": 0.8501318097114563, "learning_rate": 0.0002, "epoch": 4.040201005025126, "step": 6030}, {"loss": 1.3367, "grad_norm": 0.6981393098831177, "learning_rate": 0.0002, "epoch": 4.046901172529314, "step": 6040}, {"loss": 1.3925, "grad_norm": 0.7227180600166321, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 6050}, {"loss": 1.4007, "grad_norm": 0.6923989653587341, "learning_rate": 0.0002, "epoch": 4.060301507537688, "step": 6060}, {"loss": 1.3837, "grad_norm": 0.879779040813446, "learning_rate": 0.0002, "epoch": 4.067001675041876, "step": 6070}, {"loss": 1.4383, "grad_norm": 0.8184754848480225, "learning_rate": 0.0002, "epoch": 4.073701842546064, "step": 6080}, {"loss": 1.3128, "grad_norm": 0.8211342692375183, "learning_rate": 0.0002, "epoch": 4.080402010050252, "step": 6090}, {"loss": 1.3892, "grad_norm": 0.7542396783828735, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 6100}, {"loss": 1.3607, "grad_norm": 0.6631066799163818, "learning_rate": 0.0002, "epoch": 4.093802345058626, "step": 6110}, {"loss": 1.3275, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.100502512562814, "step": 6120}, {"loss": 1.3443, "grad_norm": 0.681851863861084, "learning_rate": 0.0002, "epoch": 4.107202680067002, "step": 6130}, {"loss": 1.3486, "grad_norm": 0.8757794499397278, "learning_rate": 0.0002, "epoch": 4.11390284757119, "step": 6140}, {"loss": 1.351, "grad_norm": 0.6567301750183105, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 6150}, {"loss": 1.3824, "grad_norm": 0.7950329184532166, "learning_rate": 0.0002, "epoch": 4.127303182579564, "step": 6160}, {"loss": 1.3738, "grad_norm": 0.7545644044876099, "learning_rate": 0.0002, "epoch": 4.134003350083752, "step": 6170}, {"loss": 1.4214, "grad_norm": 0.7172710299491882, "learning_rate": 0.0002, "epoch": 4.14070351758794, "step": 6180}, {"loss": 1.4091, "grad_norm": 0.7040584087371826, "learning_rate": 0.0002, "epoch": 4.147403685092128, "step": 6190}, {"loss": 1.4149, "grad_norm": 0.7482913732528687, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 6200}, {"loss": 1.3227, "grad_norm": 0.8523276448249817, "learning_rate": 0.0002, "epoch": 4.160804020100502, "step": 6210}, {"loss": 1.4194, "grad_norm": 0.6672041416168213, "learning_rate": 0.0002, "epoch": 4.16750418760469, "step": 6220}, {"loss": 1.3953, "grad_norm": 0.7523500919342041, "learning_rate": 0.0002, "epoch": 4.174204355108878, "step": 6230}, {"loss": 1.371, "grad_norm": 0.8085253834724426, "learning_rate": 0.0002, "epoch": 4.180904522613066, "step": 6240}, {"loss": 1.3293, "grad_norm": 0.789450466632843, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 6250}, {"loss": 1.3539, "grad_norm": 0.7502310872077942, "learning_rate": 0.0002, "epoch": 4.19430485762144, "step": 6260}, {"loss": 1.3415, "grad_norm": 0.7397456765174866, "learning_rate": 0.0002, "epoch": 4.201005025125628, "step": 6270}, {"loss": 1.3963, "grad_norm": 0.6921947002410889, "learning_rate": 0.0002, "epoch": 4.207705192629816, "step": 6280}, {"loss": 1.3125, "grad_norm": 0.9334571957588196, "learning_rate": 0.0002, "epoch": 4.214405360134004, "step": 6290}, {"loss": 1.3612, "grad_norm": 0.725799024105072, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 6300}, {"loss": 1.4217, "grad_norm": 0.8290495872497559, "learning_rate": 0.0002, "epoch": 4.227805695142378, "step": 6310}, {"loss": 1.4135, "grad_norm": 0.688983678817749, "learning_rate": 0.0002, "epoch": 4.234505862646566, "step": 6320}, {"loss": 1.3807, "grad_norm": 0.8620913028717041, "learning_rate": 0.0002, "epoch": 4.241206030150754, "step": 6330}, {"loss": 1.3738, "grad_norm": 0.8008657693862915, "learning_rate": 0.0002, "epoch": 4.247906197654942, "step": 6340}, {"loss": 1.4005, "grad_norm": 0.7379199266433716, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 6350}, {"loss": 1.426, "grad_norm": 0.7842815518379211, "learning_rate": 0.0002, "epoch": 4.261306532663316, "step": 6360}, {"loss": 1.4262, "grad_norm": 0.812600314617157, "learning_rate": 0.0002, "epoch": 4.268006700167504, "step": 6370}, {"loss": 1.4028, "grad_norm": 0.7852841019630432, "learning_rate": 0.0002, "epoch": 4.274706867671692, "step": 6380}, {"loss": 1.3722, "grad_norm": 1.0377534627914429, "learning_rate": 0.0002, "epoch": 4.28140703517588, "step": 6390}, {"loss": 1.3755, "grad_norm": 1.03935706615448, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 6400}, {"loss": 1.3961, "grad_norm": 0.7244732975959778, "learning_rate": 0.0002, "epoch": 4.294807370184254, "step": 6410}, {"loss": 1.4608, "grad_norm": 0.7137406468391418, "learning_rate": 0.0002, "epoch": 4.301507537688442, "step": 6420}, {"loss": 1.4461, "grad_norm": 0.7492543458938599, "learning_rate": 0.0002, "epoch": 4.30820770519263, "step": 6430}, {"loss": 1.4562, "grad_norm": 0.7065439224243164, "learning_rate": 0.0002, "epoch": 4.314907872696818, "step": 6440}, {"loss": 1.4246, "grad_norm": 0.7786989808082581, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 6450}, {"loss": 1.3098, "grad_norm": 0.7369208335876465, "learning_rate": 0.0002, "epoch": 4.328308207705192, "step": 6460}, {"loss": 1.3686, "grad_norm": 0.7412346005439758, "learning_rate": 0.0002, "epoch": 4.33500837520938, "step": 6470}, {"loss": 1.4087, "grad_norm": 0.780927300453186, "learning_rate": 0.0002, "epoch": 4.341708542713568, "step": 6480}, {"loss": 1.3628, "grad_norm": 0.8320930600166321, "learning_rate": 0.0002, "epoch": 4.348408710217756, "step": 6490}, {"loss": 1.3715, "grad_norm": 0.6871094703674316, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 6500}, {"loss": 1.3257, "grad_norm": 0.6751559972763062, "learning_rate": 0.0002, "epoch": 4.36180904522613, "step": 6510}, {"loss": 1.4311, "grad_norm": 0.7723976969718933, "learning_rate": 0.0002, "epoch": 4.368509212730318, "step": 6520}, {"loss": 1.4086, "grad_norm": 0.7915401458740234, "learning_rate": 0.0002, "epoch": 4.375209380234506, "step": 6530}, {"loss": 1.3973, "grad_norm": 0.7329102754592896, "learning_rate": 0.0002, "epoch": 4.381909547738694, "step": 6540}, {"loss": 1.447, "grad_norm": 0.7388760447502136, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 6550}, {"loss": 1.4378, "grad_norm": 0.8282579183578491, "learning_rate": 0.0002, "epoch": 4.3953098827470685, "step": 6560}, {"loss": 1.3923, "grad_norm": 0.7192724347114563, "learning_rate": 0.0002, "epoch": 4.402010050251256, "step": 6570}, {"loss": 1.4141, "grad_norm": 0.746526837348938, "learning_rate": 0.0002, "epoch": 4.408710217755444, "step": 6580}, {"loss": 1.33, "grad_norm": 0.8738046288490295, "learning_rate": 0.0002, "epoch": 4.415410385259632, "step": 6590}, {"loss": 1.3995, "grad_norm": 0.8408458828926086, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 6600}, {"loss": 1.4148, "grad_norm": 0.8110666275024414, "learning_rate": 0.0002, "epoch": 4.4288107202680065, "step": 6610}, {"loss": 1.441, "grad_norm": 0.8602406978607178, "learning_rate": 0.0002, "epoch": 4.435510887772194, "step": 6620}, {"loss": 1.4319, "grad_norm": 0.7549102902412415, "learning_rate": 0.0002, "epoch": 4.442211055276382, "step": 6630}, {"loss": 1.388, "grad_norm": 0.7831804156303406, "learning_rate": 0.0002, "epoch": 4.44891122278057, "step": 6640}, {"loss": 1.4283, "grad_norm": 0.7269673943519592, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 6650}, {"loss": 1.4132, "grad_norm": 0.7397838830947876, "learning_rate": 0.0002, "epoch": 4.4623115577889445, "step": 6660}, {"loss": 1.3174, "grad_norm": 0.713707447052002, "learning_rate": 0.0002, "epoch": 4.469011725293132, "step": 6670}, {"loss": 1.3406, "grad_norm": 0.7525581121444702, "learning_rate": 0.0002, "epoch": 4.47571189279732, "step": 6680}, {"loss": 1.4283, "grad_norm": 0.8030191659927368, "learning_rate": 0.0002, "epoch": 4.482412060301508, "step": 6690}, {"loss": 1.4586, "grad_norm": 0.7469439506530762, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 6700}, {"loss": 1.367, "grad_norm": 0.7743868231773376, "learning_rate": 0.0002, "epoch": 4.4958123953098825, "step": 6710}, {"loss": 1.3439, "grad_norm": 0.6539737582206726, "learning_rate": 0.0002, "epoch": 4.50251256281407, "step": 6720}, {"loss": 1.4513, "grad_norm": 0.825818657875061, "learning_rate": 0.0002, "epoch": 4.509212730318258, "step": 6730}, {"loss": 1.3984, "grad_norm": 0.8048575520515442, "learning_rate": 0.0002, "epoch": 4.515912897822446, "step": 6740}, {"loss": 1.3923, "grad_norm": 0.7828766107559204, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 6750}, {"loss": 1.3886, "grad_norm": 0.7406010031700134, "learning_rate": 0.0002, "epoch": 4.5293132328308205, "step": 6760}, {"loss": 1.3109, "grad_norm": 0.840345561504364, "learning_rate": 0.0002, "epoch": 4.536013400335008, "step": 6770}, {"loss": 1.4808, "grad_norm": 0.8492622971534729, "learning_rate": 0.0002, "epoch": 4.542713567839196, "step": 6780}, {"loss": 1.4384, "grad_norm": 0.7130163908004761, "learning_rate": 0.0002, "epoch": 4.549413735343384, "step": 6790}, {"loss": 1.4531, "grad_norm": 0.8454728126525879, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 6800}, {"loss": 1.3239, "grad_norm": 0.7847645282745361, "learning_rate": 0.0002, "epoch": 4.562814070351759, "step": 6810}, {"loss": 1.4181, "grad_norm": 0.7245864272117615, "learning_rate": 0.0002, "epoch": 4.569514237855946, "step": 6820}, {"loss": 1.3233, "grad_norm": 0.768893301486969, "learning_rate": 0.0002, "epoch": 4.576214405360134, "step": 6830}, {"loss": 1.3932, "grad_norm": 0.8028400540351868, "learning_rate": 0.0002, "epoch": 4.582914572864322, "step": 6840}, {"loss": 1.3745, "grad_norm": 0.763945460319519, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 6850}, {"loss": 1.4797, "grad_norm": 0.7417685389518738, "learning_rate": 0.0002, "epoch": 4.596314907872697, "step": 6860}, {"loss": 1.4468, "grad_norm": 0.7603038549423218, "learning_rate": 0.0002, "epoch": 4.603015075376884, "step": 6870}, {"loss": 1.4095, "grad_norm": 0.7981528043746948, "learning_rate": 0.0002, "epoch": 4.609715242881072, "step": 6880}, {"loss": 1.3963, "grad_norm": 0.8077111840248108, "learning_rate": 0.0002, "epoch": 4.61641541038526, "step": 6890}, {"loss": 1.4721, "grad_norm": 0.8778454065322876, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 6900}, {"loss": 1.3758, "grad_norm": 0.8620710372924805, "learning_rate": 0.0002, "epoch": 4.629815745393635, "step": 6910}, {"loss": 1.344, "grad_norm": 0.7486072778701782, "learning_rate": 0.0002, "epoch": 4.636515912897822, "step": 6920}, {"loss": 1.3913, "grad_norm": 0.7493042945861816, "learning_rate": 0.0002, "epoch": 4.64321608040201, "step": 6930}, {"loss": 1.397, "grad_norm": 0.7388978600502014, "learning_rate": 0.0002, "epoch": 4.649916247906198, "step": 6940}, {"loss": 1.3593, "grad_norm": 0.798530638217926, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 6950}, {"loss": 1.3982, "grad_norm": 0.7929500937461853, "learning_rate": 0.0002, "epoch": 4.663316582914573, "step": 6960}, {"loss": 1.4183, "grad_norm": 0.9186785221099854, "learning_rate": 0.0002, "epoch": 4.67001675041876, "step": 6970}, {"loss": 1.3955, "grad_norm": 1.1103485822677612, "learning_rate": 0.0002, "epoch": 4.676716917922948, "step": 6980}, {"loss": 1.3941, "grad_norm": 0.8000466823577881, "learning_rate": 0.0002, "epoch": 4.683417085427136, "step": 6990}, {"loss": 1.371, "grad_norm": 0.7520599961280823, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 7000}, {"loss": 1.4582, "grad_norm": 0.7971973419189453, "learning_rate": 0.0002, "epoch": 4.696817420435511, "step": 7010}, {"loss": 1.3682, "grad_norm": 0.7363343834877014, "learning_rate": 0.0002, "epoch": 4.703517587939698, "step": 7020}, {"loss": 1.3889, "grad_norm": 0.8268865942955017, "learning_rate": 0.0002, "epoch": 4.710217755443886, "step": 7030}, {"loss": 1.4382, "grad_norm": 0.7054963111877441, "learning_rate": 0.0002, "epoch": 4.716917922948074, "step": 7040}, {"loss": 1.4578, "grad_norm": 0.8196262121200562, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 7050}, {"loss": 1.365, "grad_norm": 0.8276031017303467, "learning_rate": 0.0002, "epoch": 4.730318257956449, "step": 7060}, {"loss": 1.3887, "grad_norm": 0.8248157501220703, "learning_rate": 0.0002, "epoch": 4.7370184254606365, "step": 7070}, {"loss": 1.4193, "grad_norm": 0.8937979936599731, "learning_rate": 0.0002, "epoch": 4.743718592964824, "step": 7080}, {"loss": 1.4334, "grad_norm": 0.7984238862991333, "learning_rate": 0.0002, "epoch": 4.750418760469012, "step": 7090}, {"loss": 1.4385, "grad_norm": 0.9495313763618469, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 7100}, {"loss": 1.4504, "grad_norm": 0.8598204255104065, "learning_rate": 0.0002, "epoch": 4.763819095477387, "step": 7110}, {"loss": 1.3969, "grad_norm": 0.8951472640037537, "learning_rate": 0.0002, "epoch": 4.7705192629815745, "step": 7120}, {"loss": 1.4339, "grad_norm": 0.9110309481620789, "learning_rate": 0.0002, "epoch": 4.777219430485762, "step": 7130}, {"loss": 1.4001, "grad_norm": 0.7929584980010986, "learning_rate": 0.0002, "epoch": 4.78391959798995, "step": 7140}, {"loss": 1.467, "grad_norm": 0.7415322661399841, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 7150}, {"loss": 1.5107, "grad_norm": 0.7504757046699524, "learning_rate": 0.0002, "epoch": 4.797319932998325, "step": 7160}, {"loss": 1.3736, "grad_norm": 0.7166924476623535, "learning_rate": 0.0002, "epoch": 4.8040201005025125, "step": 7170}, {"loss": 1.4088, "grad_norm": 0.7728400826454163, "learning_rate": 0.0002, "epoch": 4.8107202680067, "step": 7180}, {"loss": 1.3814, "grad_norm": 0.7992154955863953, "learning_rate": 0.0002, "epoch": 4.817420435510888, "step": 7190}, {"loss": 1.3958, "grad_norm": 0.8655321002006531, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 7200}, {"loss": 1.3837, "grad_norm": 0.7672632336616516, "learning_rate": 0.0002, "epoch": 4.830820770519263, "step": 7210}, {"loss": 1.4578, "grad_norm": 0.708416223526001, "learning_rate": 0.0002, "epoch": 4.8375209380234505, "step": 7220}, {"loss": 1.5413, "grad_norm": 0.8914081454277039, "learning_rate": 0.0002, "epoch": 4.844221105527638, "step": 7230}, {"loss": 1.3569, "grad_norm": 0.7141931653022766, "learning_rate": 0.0002, "epoch": 4.850921273031826, "step": 7240}, {"loss": 1.4532, "grad_norm": 0.6913040280342102, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 7250}, {"loss": 1.3912, "grad_norm": 0.7871233820915222, "learning_rate": 0.0002, "epoch": 4.864321608040201, "step": 7260}, {"loss": 1.3688, "grad_norm": 0.8466277122497559, "learning_rate": 0.0002, "epoch": 4.8710217755443885, "step": 7270}, {"loss": 1.33, "grad_norm": 0.8492183685302734, "learning_rate": 0.0002, "epoch": 4.877721943048576, "step": 7280}, {"loss": 1.3744, "grad_norm": 0.8339574933052063, "learning_rate": 0.0002, "epoch": 4.884422110552764, "step": 7290}, {"loss": 1.4157, "grad_norm": 0.787022590637207, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 7300}, {"loss": 1.3725, "grad_norm": 0.8877332806587219, "learning_rate": 0.0002, "epoch": 4.897822445561139, "step": 7310}, {"loss": 1.3968, "grad_norm": 0.744989812374115, "learning_rate": 0.0002, "epoch": 4.9045226130653266, "step": 7320}, {"loss": 1.4421, "grad_norm": 0.8027268648147583, "learning_rate": 0.0002, "epoch": 4.911222780569514, "step": 7330}, {"loss": 1.425, "grad_norm": 0.6437455415725708, "learning_rate": 0.0002, "epoch": 4.917922948073702, "step": 7340}, {"loss": 1.4829, "grad_norm": 0.685999870300293, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 7350}, {"loss": 1.4352, "grad_norm": 0.9086187481880188, "learning_rate": 0.0002, "epoch": 4.931323283082077, "step": 7360}, {"loss": 1.4245, "grad_norm": 0.8272411227226257, "learning_rate": 0.0002, "epoch": 4.938023450586265, "step": 7370}, {"loss": 1.4226, "grad_norm": 0.9227852821350098, "learning_rate": 0.0002, "epoch": 4.944723618090452, "step": 7380}, {"loss": 1.3643, "grad_norm": 0.7688441276550293, "learning_rate": 0.0002, "epoch": 4.95142378559464, "step": 7390}, {"loss": 1.4491, "grad_norm": 0.8662643432617188, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 7400}, {"loss": 1.4194, "grad_norm": 0.9234127998352051, "learning_rate": 0.0002, "epoch": 4.964824120603015, "step": 7410}, {"loss": 1.4009, "grad_norm": 0.9131470918655396, "learning_rate": 0.0002, "epoch": 4.971524288107203, "step": 7420}, {"loss": 1.4544, "grad_norm": 0.7377504110336304, "learning_rate": 0.0002, "epoch": 4.97822445561139, "step": 7430}, {"loss": 1.4008, "grad_norm": 0.8762801289558411, "learning_rate": 0.0002, "epoch": 4.984924623115578, "step": 7440}, {"loss": 1.4304, "grad_norm": 0.7919872999191284, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 7450}, {"loss": 1.3817, "grad_norm": 0.7144299149513245, "learning_rate": 0.0002, "epoch": 4.998324958123953, "step": 7460}, {"eval_loss": 1.9291157722473145, "eval_runtime": 37.9831, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.99966499162479, "step": 7462}, {"loss": 1.2753, "grad_norm": 0.7860151529312134, "learning_rate": 0.0002, "epoch": 5.005025125628141, "step": 7470}, {"loss": 1.2149, "grad_norm": 0.9418314695358276, "learning_rate": 0.0002, "epoch": 5.011725293132328, "step": 7480}, {"loss": 1.1966, "grad_norm": 0.8474572896957397, "learning_rate": 0.0002, "epoch": 5.018425460636516, "step": 7490}, {"loss": 1.2111, "grad_norm": 1.0724040269851685, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 7500}, {"loss": 1.2228, "grad_norm": 0.9109148979187012, "learning_rate": 0.0002, "epoch": 5.031825795644891, "step": 7510}, {"loss": 1.2239, "grad_norm": 1.0088659524917603, "learning_rate": 0.0002, "epoch": 5.038525963149079, "step": 7520}, {"loss": 1.2156, "grad_norm": 1.1421623229980469, "learning_rate": 0.0002, "epoch": 5.045226130653266, "step": 7530}, {"loss": 1.1739, "grad_norm": 0.9219902157783508, "learning_rate": 0.0002, "epoch": 5.051926298157454, "step": 7540}, {"loss": 1.2686, "grad_norm": 0.9150987863540649, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 7550}, {"loss": 1.2068, "grad_norm": 0.8889328241348267, "learning_rate": 0.0002, "epoch": 5.065326633165829, "step": 7560}, {"loss": 1.276, "grad_norm": 0.9751363396644592, "learning_rate": 0.0002, "epoch": 5.072026800670017, "step": 7570}, {"loss": 1.2078, "grad_norm": 0.8603123426437378, "learning_rate": 0.0002, "epoch": 5.078726968174204, "step": 7580}, {"loss": 1.2175, "grad_norm": 0.8910616636276245, "learning_rate": 0.0002, "epoch": 5.085427135678392, "step": 7590}, {"loss": 1.2475, "grad_norm": 1.1128392219543457, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 7600}, {"loss": 1.3065, "grad_norm": 0.9480258822441101, "learning_rate": 0.0002, "epoch": 5.098827470686767, "step": 7610}, {"loss": 1.193, "grad_norm": 0.906958818435669, "learning_rate": 0.0002, "epoch": 5.105527638190955, "step": 7620}, {"loss": 1.2223, "grad_norm": 0.8741167187690735, "learning_rate": 0.0002, "epoch": 5.1122278056951425, "step": 7630}, {"loss": 1.2126, "grad_norm": 0.966268002986908, "learning_rate": 0.0002, "epoch": 5.11892797319933, "step": 7640}, {"loss": 1.2782, "grad_norm": 0.9124358892440796, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 7650}, {"loss": 1.3004, "grad_norm": 1.0436606407165527, "learning_rate": 0.0002, "epoch": 5.132328308207705, "step": 7660}, {"loss": 1.2675, "grad_norm": 0.9217309355735779, "learning_rate": 0.0002, "epoch": 5.139028475711893, "step": 7670}, {"loss": 1.2502, "grad_norm": 1.344765543937683, "learning_rate": 0.0002, "epoch": 5.1457286432160805, "step": 7680}, {"loss": 1.2416, "grad_norm": 1.0730723142623901, "learning_rate": 0.0002, "epoch": 5.152428810720268, "step": 7690}, {"loss": 1.1888, "grad_norm": 0.9321247339248657, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 7700}, {"loss": 1.1941, "grad_norm": 0.8482614755630493, "learning_rate": 0.0002, "epoch": 5.165829145728643, "step": 7710}, {"loss": 1.2668, "grad_norm": 0.8274452686309814, "learning_rate": 0.0002, "epoch": 5.172529313232831, "step": 7720}, {"loss": 1.1972, "grad_norm": 0.9120376706123352, "learning_rate": 0.0002, "epoch": 5.1792294807370185, "step": 7730}, {"loss": 1.1648, "grad_norm": 1.0062892436981201, "learning_rate": 0.0002, "epoch": 5.185929648241206, "step": 7740}, {"loss": 1.2199, "grad_norm": 0.9521504640579224, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 7750}, {"loss": 1.2855, "grad_norm": 0.8800198435783386, "learning_rate": 0.0002, "epoch": 5.199329983249581, "step": 7760}, {"loss": 1.2535, "grad_norm": 0.9749179482460022, "learning_rate": 0.0002, "epoch": 5.206030150753769, "step": 7770}, {"loss": 1.2975, "grad_norm": 0.9441686868667603, "learning_rate": 0.0002, "epoch": 5.2127303182579565, "step": 7780}, {"loss": 1.256, "grad_norm": 0.9114066362380981, "learning_rate": 0.0002, "epoch": 5.219430485762144, "step": 7790}, {"loss": 1.2621, "grad_norm": 0.9851446151733398, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 7800}, {"loss": 1.2502, "grad_norm": 0.9526297450065613, "learning_rate": 0.0002, "epoch": 5.232830820770519, "step": 7810}, {"loss": 1.1502, "grad_norm": 1.05986487865448, "learning_rate": 0.0002, "epoch": 5.239530988274707, "step": 7820}, {"loss": 1.2517, "grad_norm": 0.8956538438796997, "learning_rate": 0.0002, "epoch": 5.2462311557788945, "step": 7830}, {"loss": 1.2556, "grad_norm": 0.9568153619766235, "learning_rate": 0.0002, "epoch": 5.252931323283082, "step": 7840}, {"loss": 1.2442, "grad_norm": 1.0035018920898438, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 7850}, {"loss": 1.2605, "grad_norm": 0.8554368615150452, "learning_rate": 0.0002, "epoch": 5.266331658291457, "step": 7860}, {"loss": 1.2799, "grad_norm": 0.9677708148956299, "learning_rate": 0.0002, "epoch": 5.273031825795645, "step": 7870}, {"loss": 1.275, "grad_norm": 0.943606436252594, "learning_rate": 0.0002, "epoch": 5.279731993299833, "step": 7880}, {"loss": 1.2335, "grad_norm": 1.0029335021972656, "learning_rate": 0.0002, "epoch": 5.28643216080402, "step": 7890}, {"loss": 1.2494, "grad_norm": 1.0164015293121338, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 7900}, {"loss": 1.3117, "grad_norm": 0.8908365368843079, "learning_rate": 0.0002, "epoch": 5.299832495812395, "step": 7910}, {"loss": 1.2832, "grad_norm": 0.9307826161384583, "learning_rate": 0.0002, "epoch": 5.306532663316583, "step": 7920}, {"loss": 1.242, "grad_norm": 1.0730371475219727, "learning_rate": 0.0002, "epoch": 5.313232830820771, "step": 7930}, {"loss": 1.2003, "grad_norm": 0.844739556312561, "learning_rate": 0.0002, "epoch": 5.319932998324958, "step": 7940}, {"loss": 1.2688, "grad_norm": 1.275833010673523, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 7950}, {"loss": 1.2957, "grad_norm": 0.9042661190032959, "learning_rate": 0.0002, "epoch": 5.333333333333333, "step": 7960}, {"loss": 1.2912, "grad_norm": 0.9374269247055054, "learning_rate": 0.0002, "epoch": 5.340033500837521, "step": 7970}, {"loss": 1.2721, "grad_norm": 1.033098578453064, "learning_rate": 0.0002, "epoch": 5.346733668341709, "step": 7980}, {"loss": 1.3208, "grad_norm": 1.062775731086731, "learning_rate": 0.0002, "epoch": 5.353433835845896, "step": 7990}, {"loss": 1.3065, "grad_norm": 1.1064317226409912, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 8000}, {"loss": 1.2341, "grad_norm": 1.1114039421081543, "learning_rate": 0.0002, "epoch": 5.366834170854271, "step": 8010}, {"loss": 1.2255, "grad_norm": 1.0198014974594116, "learning_rate": 0.0002, "epoch": 5.373534338358459, "step": 8020}, {"loss": 1.2433, "grad_norm": 0.8443173170089722, "learning_rate": 0.0002, "epoch": 5.380234505862647, "step": 8030}, {"loss": 1.206, "grad_norm": 1.000881314277649, "learning_rate": 0.0002, "epoch": 5.386934673366834, "step": 8040}, {"loss": 1.2982, "grad_norm": 0.9874443411827087, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 8050}, {"loss": 1.2289, "grad_norm": 0.9895344972610474, "learning_rate": 0.0002, "epoch": 5.400335008375209, "step": 8060}, {"loss": 1.249, "grad_norm": 0.8595236539840698, "learning_rate": 0.0002, "epoch": 5.407035175879397, "step": 8070}, {"loss": 1.2308, "grad_norm": 0.9523849487304688, "learning_rate": 0.0002, "epoch": 5.413735343383585, "step": 8080}, {"loss": 1.2343, "grad_norm": 1.0560476779937744, "learning_rate": 0.0002, "epoch": 5.420435510887772, "step": 8090}, {"loss": 1.2956, "grad_norm": 1.0893689393997192, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 8100}, {"loss": 1.2846, "grad_norm": 0.9395513534545898, "learning_rate": 0.0002, "epoch": 5.433835845896147, "step": 8110}, {"loss": 1.3444, "grad_norm": 0.9364215135574341, "learning_rate": 0.0002, "epoch": 5.440536013400335, "step": 8120}, {"loss": 1.2944, "grad_norm": 0.9502208232879639, "learning_rate": 0.0002, "epoch": 5.447236180904523, "step": 8130}, {"loss": 1.2971, "grad_norm": 0.9559208154678345, "learning_rate": 0.0002, "epoch": 5.45393634840871, "step": 8140}, {"loss": 1.2495, "grad_norm": 0.9261730313301086, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 8150}, {"loss": 1.2599, "grad_norm": 0.9832326173782349, "learning_rate": 0.0002, "epoch": 5.467336683417085, "step": 8160}, {"loss": 1.2771, "grad_norm": 1.065953016281128, "learning_rate": 0.0002, "epoch": 5.474036850921273, "step": 8170}, {"loss": 1.3617, "grad_norm": 0.9139469861984253, "learning_rate": 0.0002, "epoch": 5.480737018425461, "step": 8180}, {"loss": 1.2197, "grad_norm": 1.2322484254837036, "learning_rate": 0.0002, "epoch": 5.4874371859296485, "step": 8190}, {"loss": 1.2879, "grad_norm": 0.9722974896430969, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 8200}, {"loss": 1.2664, "grad_norm": 0.9338926076889038, "learning_rate": 0.0002, "epoch": 5.500837520938023, "step": 8210}, {"loss": 1.2128, "grad_norm": 0.9283728003501892, "learning_rate": 0.0002, "epoch": 5.507537688442211, "step": 8220}, {"loss": 1.2141, "grad_norm": 1.0489585399627686, "learning_rate": 0.0002, "epoch": 5.514237855946399, "step": 8230}, {"loss": 1.3257, "grad_norm": 0.9881814122200012, "learning_rate": 0.0002, "epoch": 5.5209380234505865, "step": 8240}, {"loss": 1.2221, "grad_norm": 0.9274460077285767, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 8250}, {"loss": 1.2241, "grad_norm": 0.8650718331336975, "learning_rate": 0.0002, "epoch": 5.534338358458961, "step": 8260}, {"loss": 1.2462, "grad_norm": 1.014069676399231, "learning_rate": 0.0002, "epoch": 5.541038525963149, "step": 8270}, {"loss": 1.3502, "grad_norm": 0.9212974905967712, "learning_rate": 0.0002, "epoch": 5.547738693467337, "step": 8280}, {"loss": 1.2779, "grad_norm": 1.1235398054122925, "learning_rate": 0.0002, "epoch": 5.5544388609715245, "step": 8290}, {"loss": 1.306, "grad_norm": 0.961954653263092, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 8300}, {"loss": 1.2946, "grad_norm": 0.9386700391769409, "learning_rate": 0.0002, "epoch": 5.567839195979899, "step": 8310}, {"loss": 1.313, "grad_norm": 1.01912522315979, "learning_rate": 0.0002, "epoch": 5.574539363484087, "step": 8320}, {"loss": 1.3121, "grad_norm": 0.9851216077804565, "learning_rate": 0.0002, "epoch": 5.581239530988275, "step": 8330}, {"loss": 1.3071, "grad_norm": 1.0138001441955566, "learning_rate": 0.0002, "epoch": 5.5879396984924625, "step": 8340}, {"loss": 1.2257, "grad_norm": 0.9262447357177734, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 8350}, {"loss": 1.2473, "grad_norm": 1.1322970390319824, "learning_rate": 0.0002, "epoch": 5.601340033500837, "step": 8360}, {"loss": 1.3098, "grad_norm": 1.1429349184036255, "learning_rate": 0.0002, "epoch": 5.608040201005025, "step": 8370}, {"loss": 1.2686, "grad_norm": 0.9130118489265442, "learning_rate": 0.0002, "epoch": 5.614740368509213, "step": 8380}, {"loss": 1.2541, "grad_norm": 0.9651545882225037, "learning_rate": 0.0002, "epoch": 5.6214405360134005, "step": 8390}, {"loss": 1.2799, "grad_norm": 0.9595398306846619, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 8400}, {"loss": 1.3429, "grad_norm": 1.0049372911453247, "learning_rate": 0.0002, "epoch": 5.634840871021775, "step": 8410}, {"loss": 1.3224, "grad_norm": 1.082804560661316, "learning_rate": 0.0002, "epoch": 5.641541038525963, "step": 8420}, {"loss": 1.297, "grad_norm": 0.9489204287528992, "learning_rate": 0.0002, "epoch": 5.648241206030151, "step": 8430}, {"loss": 1.3424, "grad_norm": 0.9470235109329224, "learning_rate": 0.0002, "epoch": 5.654941373534339, "step": 8440}, {"loss": 1.3358, "grad_norm": 1.0662927627563477, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 8450}, {"loss": 1.2973, "grad_norm": 0.9097877740859985, "learning_rate": 0.0002, "epoch": 5.668341708542713, "step": 8460}, {"loss": 1.3072, "grad_norm": 0.9740368127822876, "learning_rate": 0.0002, "epoch": 5.675041876046901, "step": 8470}, {"loss": 1.286, "grad_norm": 0.9878810048103333, "learning_rate": 0.0002, "epoch": 5.681742043551089, "step": 8480}, {"loss": 1.208, "grad_norm": 1.148260474205017, "learning_rate": 0.0002, "epoch": 5.688442211055277, "step": 8490}, {"loss": 1.2842, "grad_norm": 0.9632558822631836, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 8500}, {"loss": 1.2787, "grad_norm": 0.876812756061554, "learning_rate": 0.0002, "epoch": 5.701842546063651, "step": 8510}, {"loss": 1.3186, "grad_norm": 1.0730829238891602, "learning_rate": 0.0002, "epoch": 5.708542713567839, "step": 8520}, {"loss": 1.2856, "grad_norm": 1.2239218950271606, "learning_rate": 0.0002, "epoch": 5.715242881072027, "step": 8530}, {"loss": 1.2717, "grad_norm": 0.9460835456848145, "learning_rate": 0.0002, "epoch": 5.721943048576215, "step": 8540}, {"loss": 1.3509, "grad_norm": 0.9086270928382874, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 8550}, {"loss": 1.2971, "grad_norm": 1.0258867740631104, "learning_rate": 0.0002, "epoch": 5.735343383584589, "step": 8560}, {"loss": 1.3581, "grad_norm": 1.0543923377990723, "learning_rate": 0.0002, "epoch": 5.742043551088777, "step": 8570}, {"loss": 1.2988, "grad_norm": 0.9063900113105774, "learning_rate": 0.0002, "epoch": 5.748743718592965, "step": 8580}, {"loss": 1.3535, "grad_norm": 1.1838830709457397, "learning_rate": 0.0002, "epoch": 5.755443886097153, "step": 8590}, {"loss": 1.2655, "grad_norm": 0.9631859064102173, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 8600}, {"loss": 1.276, "grad_norm": 0.9702655673027039, "learning_rate": 0.0002, "epoch": 5.768844221105527, "step": 8610}, {"loss": 1.3196, "grad_norm": 1.0591435432434082, "learning_rate": 0.0002, "epoch": 5.775544388609715, "step": 8620}, {"loss": 1.267, "grad_norm": 0.9989570379257202, "learning_rate": 0.0002, "epoch": 5.782244556113903, "step": 8630}, {"loss": 1.3227, "grad_norm": 1.0836435556411743, "learning_rate": 0.0002, "epoch": 5.788944723618091, "step": 8640}, {"loss": 1.3334, "grad_norm": 0.8832896947860718, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 8650}, {"loss": 1.3214, "grad_norm": 1.0104607343673706, "learning_rate": 0.0002, "epoch": 5.802345058626465, "step": 8660}, {"loss": 1.2703, "grad_norm": 0.8375084400177002, "learning_rate": 0.0002, "epoch": 5.809045226130653, "step": 8670}, {"loss": 1.3554, "grad_norm": 1.1300716400146484, "learning_rate": 0.0002, "epoch": 5.815745393634841, "step": 8680}, {"loss": 1.3468, "grad_norm": 0.9311910271644592, "learning_rate": 0.0002, "epoch": 5.822445561139029, "step": 8690}, {"loss": 1.2749, "grad_norm": 0.9488391876220703, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 8700}, {"loss": 1.2281, "grad_norm": 0.9747629761695862, "learning_rate": 0.0002, "epoch": 5.835845896147403, "step": 8710}, {"loss": 1.2923, "grad_norm": 1.1029598712921143, "learning_rate": 0.0002, "epoch": 5.842546063651591, "step": 8720}, {"loss": 1.3613, "grad_norm": 1.0396875143051147, "learning_rate": 0.0002, "epoch": 5.849246231155779, "step": 8730}, {"loss": 1.3272, "grad_norm": 0.9259780645370483, "learning_rate": 0.0002, "epoch": 5.855946398659967, "step": 8740}, {"loss": 1.3236, "grad_norm": 1.020033597946167, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 8750}, {"loss": 1.3453, "grad_norm": 0.9191218614578247, "learning_rate": 0.0002, "epoch": 5.869346733668341, "step": 8760}, {"loss": 1.3012, "grad_norm": 1.1093107461929321, "learning_rate": 0.0002, "epoch": 5.876046901172529, "step": 8770}, {"loss": 1.2718, "grad_norm": 1.1626793146133423, "learning_rate": 0.0002, "epoch": 5.882747068676717, "step": 8780}, {"loss": 1.2969, "grad_norm": 0.9542945027351379, "learning_rate": 0.0002, "epoch": 5.889447236180905, "step": 8790}, {"loss": 1.3134, "grad_norm": 0.9086058139801025, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 8800}, {"loss": 1.2731, "grad_norm": 0.9249639511108398, "learning_rate": 0.0002, "epoch": 5.902847571189279, "step": 8810}, {"loss": 1.337, "grad_norm": 0.9414396286010742, "learning_rate": 0.0002, "epoch": 5.909547738693467, "step": 8820}, {"loss": 1.2865, "grad_norm": 0.9086037874221802, "learning_rate": 0.0002, "epoch": 5.916247906197655, "step": 8830}, {"loss": 1.2756, "grad_norm": 0.8685907125473022, "learning_rate": 0.0002, "epoch": 5.922948073701843, "step": 8840}, {"loss": 1.297, "grad_norm": 1.036419153213501, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 8850}, {"loss": 1.3207, "grad_norm": 1.0183674097061157, "learning_rate": 0.0002, "epoch": 5.936348408710217, "step": 8860}, {"loss": 1.3922, "grad_norm": 0.966444194316864, "learning_rate": 0.0002, "epoch": 5.943048576214405, "step": 8870}, {"loss": 1.333, "grad_norm": 1.125693917274475, "learning_rate": 0.0002, "epoch": 5.949748743718593, "step": 8880}, {"loss": 1.3116, "grad_norm": 0.9857436418533325, "learning_rate": 0.0002, "epoch": 5.956448911222781, "step": 8890}, {"loss": 1.2526, "grad_norm": 0.9377069473266602, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 8900}, {"loss": 1.3221, "grad_norm": 0.9493814706802368, "learning_rate": 0.0002, "epoch": 5.969849246231155, "step": 8910}, {"loss": 1.2516, "grad_norm": 0.8806208372116089, "learning_rate": 0.0002, "epoch": 5.976549413735343, "step": 8920}, {"loss": 1.2558, "grad_norm": 0.8727600574493408, "learning_rate": 0.0002, "epoch": 5.983249581239531, "step": 8930}, {"loss": 1.3538, "grad_norm": 0.9799810647964478, "learning_rate": 0.0002, "epoch": 5.989949748743719, "step": 8940}, {"loss": 1.3323, "grad_norm": 0.9866513609886169, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 8950}, {"eval_loss": 2.0282373428344727, "eval_runtime": 38.0375, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 6.0, "step": 8955}, {"loss": 1.1768, "grad_norm": 0.8747885227203369, "learning_rate": 0.0002, "epoch": 6.0033500837520934, "step": 8960}, {"loss": 1.0677, "grad_norm": 1.2512741088867188, "learning_rate": 0.0002, "epoch": 6.010050251256281, "step": 8970}, {"loss": 1.1128, "grad_norm": 1.06855309009552, "learning_rate": 0.0002, "epoch": 6.016750418760469, "step": 8980}, {"loss": 1.1382, "grad_norm": 1.1868711709976196, "learning_rate": 0.0002, "epoch": 6.023450586264657, "step": 8990}, {"loss": 1.1377, "grad_norm": 1.2984495162963867, "learning_rate": 0.0002, "epoch": 6.030150753768845, "step": 9000}, {"loss": 1.0803, "grad_norm": 1.1147589683532715, "learning_rate": 0.0002, "epoch": 6.0368509212730315, "step": 9010}, {"loss": 1.1244, "grad_norm": 1.3128414154052734, "learning_rate": 0.0002, "epoch": 6.043551088777219, "step": 9020}, {"loss": 1.097, "grad_norm": 1.068290114402771, "learning_rate": 0.0002, "epoch": 6.050251256281407, "step": 9030}, {"loss": 1.1764, "grad_norm": 1.1890562772750854, "learning_rate": 0.0002, "epoch": 6.056951423785595, "step": 9040}, {"loss": 1.1239, "grad_norm": 1.2121573686599731, "learning_rate": 0.0002, "epoch": 6.063651591289783, "step": 9050}, {"loss": 1.0683, "grad_norm": 1.0860483646392822, "learning_rate": 0.0002, "epoch": 6.0703517587939695, "step": 9060}, {"loss": 1.1613, "grad_norm": 1.1214599609375, "learning_rate": 0.0002, "epoch": 6.077051926298157, "step": 9070}, {"loss": 1.1001, "grad_norm": 1.147580862045288, "learning_rate": 0.0002, "epoch": 6.083752093802345, "step": 9080}, {"loss": 1.1154, "grad_norm": 1.3233155012130737, "learning_rate": 0.0002, "epoch": 6.090452261306533, "step": 9090}, {"loss": 1.1017, "grad_norm": 1.1869080066680908, "learning_rate": 0.0002, "epoch": 6.097152428810721, "step": 9100}, {"loss": 1.1532, "grad_norm": 1.1695014238357544, "learning_rate": 0.0002, "epoch": 6.1038525963149075, "step": 9110}, {"loss": 1.1366, "grad_norm": 1.1982251405715942, "learning_rate": 0.0002, "epoch": 6.110552763819095, "step": 9120}, {"loss": 1.0805, "grad_norm": 1.1426950693130493, "learning_rate": 0.0002, "epoch": 6.117252931323283, "step": 9130}, {"loss": 1.0801, "grad_norm": 1.2257394790649414, "learning_rate": 0.0002, "epoch": 6.123953098827471, "step": 9140}, {"loss": 1.1209, "grad_norm": 1.2932263612747192, "learning_rate": 0.0002, "epoch": 6.130653266331659, "step": 9150}, {"loss": 1.0934, "grad_norm": 1.2617030143737793, "learning_rate": 0.0002, "epoch": 6.1373534338358455, "step": 9160}, {"loss": 1.0551, "grad_norm": 1.1201422214508057, "learning_rate": 0.0002, "epoch": 6.144053601340033, "step": 9170}, {"loss": 1.1059, "grad_norm": 0.9625319838523865, "learning_rate": 0.0002, "epoch": 6.150753768844221, "step": 9180}, {"loss": 1.1397, "grad_norm": 1.0290048122406006, "learning_rate": 0.0002, "epoch": 6.157453936348409, "step": 9190}, {"loss": 1.1257, "grad_norm": 1.1137803792953491, "learning_rate": 0.0002, "epoch": 6.164154103852597, "step": 9200}, {"loss": 1.1211, "grad_norm": 1.3674522638320923, "learning_rate": 0.0002, "epoch": 6.1708542713567835, "step": 9210}, {"loss": 1.0947, "grad_norm": 1.182207703590393, "learning_rate": 0.0002, "epoch": 6.177554438860971, "step": 9220}, {"loss": 1.0838, "grad_norm": 1.0496711730957031, "learning_rate": 0.0002, "epoch": 6.184254606365159, "step": 9230}, {"loss": 1.0666, "grad_norm": 1.1899489164352417, "learning_rate": 0.0002, "epoch": 6.190954773869347, "step": 9240}, {"loss": 1.1633, "grad_norm": 1.2666147947311401, "learning_rate": 0.0002, "epoch": 6.197654941373535, "step": 9250}, {"loss": 1.1532, "grad_norm": 1.2013030052185059, "learning_rate": 0.0002, "epoch": 6.204355108877722, "step": 9260}, {"loss": 1.151, "grad_norm": 1.3049768209457397, "learning_rate": 0.0002, "epoch": 6.211055276381909, "step": 9270}, {"loss": 1.0741, "grad_norm": 1.1733006238937378, "learning_rate": 0.0002, "epoch": 6.217755443886097, "step": 9280}, {"loss": 1.0933, "grad_norm": 1.2742516994476318, "learning_rate": 0.0002, "epoch": 6.224455611390285, "step": 9290}, {"loss": 1.1028, "grad_norm": 1.110198974609375, "learning_rate": 0.0002, "epoch": 6.231155778894473, "step": 9300}, {"loss": 1.1619, "grad_norm": 1.159963607788086, "learning_rate": 0.0002, "epoch": 6.23785594639866, "step": 9310}, {"loss": 1.0716, "grad_norm": 1.302216649055481, "learning_rate": 0.0002, "epoch": 6.244556113902847, "step": 9320}, {"loss": 1.0694, "grad_norm": 1.2134063243865967, "learning_rate": 0.0002, "epoch": 6.251256281407035, "step": 9330}, {"loss": 1.2151, "grad_norm": 1.062682867050171, "learning_rate": 0.0002, "epoch": 6.257956448911223, "step": 9340}, {"loss": 1.148, "grad_norm": 1.1568971872329712, "learning_rate": 0.0002, "epoch": 6.264656616415411, "step": 9350}, {"loss": 1.094, "grad_norm": 0.9914957880973816, "learning_rate": 0.0002, "epoch": 6.271356783919598, "step": 9360}, {"loss": 1.125, "grad_norm": 1.017250895500183, "learning_rate": 0.0002, "epoch": 6.278056951423785, "step": 9370}, {"loss": 1.2177, "grad_norm": 1.1862834692001343, "learning_rate": 0.0002, "epoch": 6.284757118927973, "step": 9380}, {"loss": 0.9994, "grad_norm": 1.2834911346435547, "learning_rate": 0.0002, "epoch": 6.291457286432161, "step": 9390}, {"loss": 1.0922, "grad_norm": 1.3306856155395508, "learning_rate": 0.0002, "epoch": 6.298157453936349, "step": 9400}, {"loss": 1.1136, "grad_norm": 1.12908136844635, "learning_rate": 0.0002, "epoch": 6.304857621440536, "step": 9410}, {"loss": 1.1406, "grad_norm": 1.2157351970672607, "learning_rate": 0.0002, "epoch": 6.311557788944723, "step": 9420}, {"loss": 1.1388, "grad_norm": 1.121882677078247, "learning_rate": 0.0002, "epoch": 6.318257956448911, "step": 9430}, {"loss": 1.1648, "grad_norm": 1.3144481182098389, "learning_rate": 0.0002, "epoch": 6.324958123953099, "step": 9440}, {"loss": 1.1228, "grad_norm": 1.1946896314620972, "learning_rate": 0.0002, "epoch": 6.331658291457287, "step": 9450}, {"loss": 1.1613, "grad_norm": 1.1289668083190918, "learning_rate": 0.0002, "epoch": 6.338358458961474, "step": 9460}, {"loss": 1.1059, "grad_norm": 1.1065658330917358, "learning_rate": 0.0002, "epoch": 6.345058626465661, "step": 9470}, {"loss": 1.1431, "grad_norm": 1.0881422758102417, "learning_rate": 0.0002, "epoch": 6.351758793969849, "step": 9480}, {"loss": 1.223, "grad_norm": 1.242676854133606, "learning_rate": 0.0002, "epoch": 6.358458961474037, "step": 9490}, {"loss": 1.1379, "grad_norm": 0.9650855660438538, "learning_rate": 0.0002, "epoch": 6.365159128978225, "step": 9500}, {"loss": 1.0763, "grad_norm": 1.2845722436904907, "learning_rate": 0.0002, "epoch": 6.371859296482412, "step": 9510}, {"loss": 1.1351, "grad_norm": 1.0327043533325195, "learning_rate": 0.0002, "epoch": 6.3785594639865995, "step": 9520}, {"loss": 1.114, "grad_norm": 1.0780898332595825, "learning_rate": 0.0002, "epoch": 6.385259631490787, "step": 9530}, {"loss": 1.1579, "grad_norm": 1.4934027194976807, "learning_rate": 0.0002, "epoch": 6.391959798994975, "step": 9540}, {"loss": 1.1546, "grad_norm": 0.9882908463478088, "learning_rate": 0.0002, "epoch": 6.398659966499163, "step": 9550}, {"loss": 1.1145, "grad_norm": 1.3250664472579956, "learning_rate": 0.0002, "epoch": 6.40536013400335, "step": 9560}, {"loss": 1.2333, "grad_norm": 1.1888482570648193, "learning_rate": 0.0002, "epoch": 6.4120603015075375, "step": 9570}, {"loss": 1.0892, "grad_norm": 1.136496901512146, "learning_rate": 0.0002, "epoch": 6.418760469011725, "step": 9580}, {"loss": 1.1674, "grad_norm": 1.161360502243042, "learning_rate": 0.0002, "epoch": 6.425460636515913, "step": 9590}, {"loss": 1.1293, "grad_norm": 1.2034236192703247, "learning_rate": 0.0002, "epoch": 6.432160804020101, "step": 9600}, {"loss": 1.1059, "grad_norm": 1.0268361568450928, "learning_rate": 0.0002, "epoch": 6.438860971524288, "step": 9610}, {"loss": 1.1732, "grad_norm": 1.2132930755615234, "learning_rate": 0.0002, "epoch": 6.4455611390284755, "step": 9620}, {"loss": 1.1329, "grad_norm": 1.0773013830184937, "learning_rate": 0.0002, "epoch": 6.452261306532663, "step": 9630}, {"loss": 1.0822, "grad_norm": 1.3848375082015991, "learning_rate": 0.0002, "epoch": 6.458961474036851, "step": 9640}, {"loss": 1.1778, "grad_norm": 1.110495924949646, "learning_rate": 0.0002, "epoch": 6.465661641541039, "step": 9650}, {"loss": 1.2022, "grad_norm": 1.118093729019165, "learning_rate": 0.0002, "epoch": 6.472361809045226, "step": 9660}, {"loss": 1.1222, "grad_norm": 1.2611900568008423, "learning_rate": 0.0002, "epoch": 6.4790619765494135, "step": 9670}, {"loss": 1.2138, "grad_norm": 0.971754252910614, "learning_rate": 0.0002, "epoch": 6.485762144053601, "step": 9680}, {"loss": 1.1641, "grad_norm": 1.2615419626235962, "learning_rate": 0.0002, "epoch": 6.492462311557789, "step": 9690}, {"loss": 1.1412, "grad_norm": 1.1370900869369507, "learning_rate": 0.0002, "epoch": 6.499162479061977, "step": 9700}, {"loss": 1.186, "grad_norm": 1.1815906763076782, "learning_rate": 0.0002, "epoch": 6.505862646566165, "step": 9710}, {"loss": 1.167, "grad_norm": 1.3424339294433594, "learning_rate": 0.0002, "epoch": 6.5125628140703515, "step": 9720}, {"loss": 1.1602, "grad_norm": 1.2858397960662842, "learning_rate": 0.0002, "epoch": 6.519262981574539, "step": 9730}, {"loss": 1.178, "grad_norm": 0.9578179121017456, "learning_rate": 0.0002, "epoch": 6.525963149078727, "step": 9740}, {"loss": 1.1805, "grad_norm": 1.3105167150497437, "learning_rate": 0.0002, "epoch": 6.532663316582915, "step": 9750}, {"loss": 1.1899, "grad_norm": 1.0586575269699097, "learning_rate": 0.0002, "epoch": 6.539363484087103, "step": 9760}, {"loss": 1.095, "grad_norm": 1.2122068405151367, "learning_rate": 0.0002, "epoch": 6.54606365159129, "step": 9770}, {"loss": 1.1471, "grad_norm": 1.3088626861572266, "learning_rate": 0.0002, "epoch": 6.552763819095477, "step": 9780}, {"loss": 1.1067, "grad_norm": 1.194122076034546, "learning_rate": 0.0002, "epoch": 6.559463986599665, "step": 9790}, {"loss": 1.0967, "grad_norm": 1.1508387327194214, "learning_rate": 0.0002, "epoch": 6.566164154103853, "step": 9800}, {"loss": 1.1694, "grad_norm": 1.109228253364563, "learning_rate": 0.0002, "epoch": 6.572864321608041, "step": 9810}, {"loss": 1.1378, "grad_norm": 1.1607427597045898, "learning_rate": 0.0002, "epoch": 6.579564489112228, "step": 9820}, {"loss": 1.1585, "grad_norm": 1.174089789390564, "learning_rate": 0.0002, "epoch": 6.586264656616415, "step": 9830}, {"loss": 1.1385, "grad_norm": 1.1739521026611328, "learning_rate": 0.0002, "epoch": 6.592964824120603, "step": 9840}, {"loss": 1.155, "grad_norm": 1.098528504371643, "learning_rate": 0.0002, "epoch": 6.599664991624791, "step": 9850}, {"loss": 1.1359, "grad_norm": 1.0397740602493286, "learning_rate": 0.0002, "epoch": 6.606365159128979, "step": 9860}, {"loss": 1.1433, "grad_norm": 1.1087969541549683, "learning_rate": 0.0002, "epoch": 6.613065326633166, "step": 9870}, {"loss": 1.2356, "grad_norm": 1.2070481777191162, "learning_rate": 0.0002, "epoch": 6.619765494137353, "step": 9880}, {"loss": 1.1161, "grad_norm": 1.1115655899047852, "learning_rate": 0.0002, "epoch": 6.626465661641541, "step": 9890}, {"loss": 1.2163, "grad_norm": 1.2486097812652588, "learning_rate": 0.0002, "epoch": 6.633165829145729, "step": 9900}, {"loss": 1.0984, "grad_norm": 1.230380654335022, "learning_rate": 0.0002, "epoch": 6.639865996649917, "step": 9910}, {"loss": 1.1862, "grad_norm": 1.1479365825653076, "learning_rate": 0.0002, "epoch": 6.646566164154104, "step": 9920}, {"loss": 1.1139, "grad_norm": 1.0790960788726807, "learning_rate": 0.0002, "epoch": 6.653266331658291, "step": 9930}, {"loss": 1.2001, "grad_norm": 1.1157397031784058, "learning_rate": 0.0002, "epoch": 6.659966499162479, "step": 9940}, {"loss": 1.1085, "grad_norm": 1.3104028701782227, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 9950}, {"loss": 1.2209, "grad_norm": 1.1727646589279175, "learning_rate": 0.0002, "epoch": 6.673366834170855, "step": 9960}, {"loss": 1.1671, "grad_norm": 1.2104284763336182, "learning_rate": 0.0002, "epoch": 6.680067001675042, "step": 9970}, {"loss": 1.1952, "grad_norm": 1.2023727893829346, "learning_rate": 0.0002, "epoch": 6.686767169179229, "step": 9980}, {"loss": 1.1385, "grad_norm": 1.0088225603103638, "learning_rate": 0.0002, "epoch": 6.693467336683417, "step": 9990}, {"loss": 1.1314, "grad_norm": 1.298015832901001, "learning_rate": 0.0002, "epoch": 6.700167504187605, "step": 10000}, {"loss": 1.1983, "grad_norm": 1.1315910816192627, "learning_rate": 0.0002, "epoch": 6.706867671691793, "step": 10010}, {"loss": 1.1679, "grad_norm": 1.1283273696899414, "learning_rate": 0.0002, "epoch": 6.71356783919598, "step": 10020}, {"loss": 1.1917, "grad_norm": 1.2564418315887451, "learning_rate": 0.0002, "epoch": 6.720268006700167, "step": 10030}, {"loss": 1.1788, "grad_norm": 1.0451353788375854, "learning_rate": 0.0002, "epoch": 6.726968174204355, "step": 10040}, {"loss": 1.1905, "grad_norm": 1.054793357849121, "learning_rate": 0.0002, "epoch": 6.733668341708543, "step": 10050}, {"loss": 1.1814, "grad_norm": 1.2741243839263916, "learning_rate": 0.0002, "epoch": 6.740368509212731, "step": 10060}, {"loss": 1.2015, "grad_norm": 1.1342514753341675, "learning_rate": 0.0002, "epoch": 6.747068676716918, "step": 10070}, {"loss": 1.2587, "grad_norm": 1.0081498622894287, "learning_rate": 0.0002, "epoch": 6.7537688442211055, "step": 10080}, {"loss": 1.1226, "grad_norm": 1.2164603471755981, "learning_rate": 0.0002, "epoch": 6.760469011725293, "step": 10090}, {"loss": 1.1353, "grad_norm": 1.2062463760375977, "learning_rate": 0.0002, "epoch": 6.767169179229481, "step": 10100}, {"loss": 1.2143, "grad_norm": 1.2255526781082153, "learning_rate": 0.0002, "epoch": 6.773869346733669, "step": 10110}, {"loss": 1.1544, "grad_norm": 1.08175790309906, "learning_rate": 0.0002, "epoch": 6.780569514237856, "step": 10120}, {"loss": 1.1983, "grad_norm": 1.5781128406524658, "learning_rate": 0.0002, "epoch": 6.7872696817420435, "step": 10130}, {"loss": 1.0994, "grad_norm": 1.0622451305389404, "learning_rate": 0.0002, "epoch": 6.793969849246231, "step": 10140}, {"loss": 1.2084, "grad_norm": 1.1591497659683228, "learning_rate": 0.0002, "epoch": 6.800670016750419, "step": 10150}, {"loss": 1.2203, "grad_norm": 1.0398483276367188, "learning_rate": 0.0002, "epoch": 6.807370184254607, "step": 10160}, {"loss": 1.2249, "grad_norm": 1.229132056236267, "learning_rate": 0.0002, "epoch": 6.814070351758794, "step": 10170}, {"loss": 1.1789, "grad_norm": 1.0918090343475342, "learning_rate": 0.0002, "epoch": 6.8207705192629815, "step": 10180}, {"loss": 1.1639, "grad_norm": 1.1543749570846558, "learning_rate": 0.0002, "epoch": 6.827470686767169, "step": 10190}, {"loss": 1.1353, "grad_norm": 1.1831817626953125, "learning_rate": 0.0002, "epoch": 6.834170854271357, "step": 10200}, {"loss": 1.2565, "grad_norm": 1.305327296257019, "learning_rate": 0.0002, "epoch": 6.840871021775545, "step": 10210}, {"loss": 1.2037, "grad_norm": 1.136720061302185, "learning_rate": 0.0002, "epoch": 6.847571189279732, "step": 10220}, {"loss": 1.2256, "grad_norm": 1.2282346487045288, "learning_rate": 0.0002, "epoch": 6.8542713567839195, "step": 10230}, {"loss": 1.1281, "grad_norm": 1.2457010746002197, "learning_rate": 0.0002, "epoch": 6.860971524288107, "step": 10240}, {"loss": 1.2762, "grad_norm": 1.2808631658554077, "learning_rate": 0.0002, "epoch": 6.867671691792295, "step": 10250}, {"loss": 1.2213, "grad_norm": 1.089066743850708, "learning_rate": 0.0002, "epoch": 6.874371859296483, "step": 10260}, {"loss": 1.2627, "grad_norm": 0.9543178081512451, "learning_rate": 0.0002, "epoch": 6.88107202680067, "step": 10270}, {"loss": 1.1617, "grad_norm": 1.1149744987487793, "learning_rate": 0.0002, "epoch": 6.8877721943048575, "step": 10280}, {"loss": 1.1134, "grad_norm": 1.0185538530349731, "learning_rate": 0.0002, "epoch": 6.894472361809045, "step": 10290}, {"loss": 1.217, "grad_norm": 0.9954617619514465, "learning_rate": 0.0002, "epoch": 6.901172529313233, "step": 10300}, {"loss": 1.1524, "grad_norm": 1.2581418752670288, "learning_rate": 0.0002, "epoch": 6.907872696817421, "step": 10310}, {"loss": 1.1942, "grad_norm": 1.2430983781814575, "learning_rate": 0.0002, "epoch": 6.914572864321608, "step": 10320}, {"loss": 1.1254, "grad_norm": 1.4937270879745483, "learning_rate": 0.0002, "epoch": 6.921273031825796, "step": 10330}, {"loss": 1.1519, "grad_norm": 1.1257144212722778, "learning_rate": 0.0002, "epoch": 6.927973199329983, "step": 10340}, {"loss": 1.2622, "grad_norm": 1.2068904638290405, "learning_rate": 0.0002, "epoch": 6.934673366834171, "step": 10350}, {"loss": 1.1349, "grad_norm": 1.0290757417678833, "learning_rate": 0.0002, "epoch": 6.941373534338359, "step": 10360}, {"loss": 1.1752, "grad_norm": 1.0070724487304688, "learning_rate": 0.0002, "epoch": 6.948073701842546, "step": 10370}, {"loss": 1.1838, "grad_norm": 0.9936357140541077, "learning_rate": 0.0002, "epoch": 6.954773869346734, "step": 10380}, {"loss": 1.2305, "grad_norm": 1.1063416004180908, "learning_rate": 0.0002, "epoch": 6.961474036850921, "step": 10390}, {"loss": 1.154, "grad_norm": 1.5199986696243286, "learning_rate": 0.0002, "epoch": 6.968174204355109, "step": 10400}, {"loss": 1.1143, "grad_norm": 1.160731554031372, "learning_rate": 0.0002, "epoch": 6.974874371859297, "step": 10410}, {"loss": 1.2132, "grad_norm": 1.084697961807251, "learning_rate": 0.0002, "epoch": 6.981574539363484, "step": 10420}, {"loss": 1.1995, "grad_norm": 1.1257576942443848, "learning_rate": 0.0002, "epoch": 6.988274706867672, "step": 10430}, {"loss": 1.1197, "grad_norm": 1.310616135597229, "learning_rate": 0.0002, "epoch": 6.994974874371859, "step": 10440}, {"eval_loss": 2.1203012466430664, "eval_runtime": 37.936, "eval_samples_per_second": 13.576, "eval_steps_per_second": 1.713, "epoch": 6.99966499162479, "step": 10447}, {"loss": 1.1118, "grad_norm": 1.0176491737365723, "learning_rate": 0.0002, "epoch": 7.001675041876047, "step": 10450}, {"loss": 0.9932, "grad_norm": 1.602665662765503, "learning_rate": 0.0002, "epoch": 7.008375209380235, "step": 10460}, {"loss": 0.9818, "grad_norm": 1.2909572124481201, "learning_rate": 0.0002, "epoch": 7.015075376884422, "step": 10470}, {"loss": 0.9398, "grad_norm": 1.2601855993270874, "learning_rate": 0.0002, "epoch": 7.02177554438861, "step": 10480}, {"loss": 0.9241, "grad_norm": 1.3437587022781372, "learning_rate": 0.0002, "epoch": 7.028475711892797, "step": 10490}, {"loss": 1.0031, "grad_norm": 1.2220089435577393, "learning_rate": 0.0002, "epoch": 7.035175879396985, "step": 10500}, {"loss": 0.9428, "grad_norm": 1.3392685651779175, "learning_rate": 0.0002, "epoch": 7.041876046901173, "step": 10510}, {"loss": 0.9566, "grad_norm": 1.3902767896652222, "learning_rate": 0.0002, "epoch": 7.04857621440536, "step": 10520}, {"loss": 0.9993, "grad_norm": 1.4098035097122192, "learning_rate": 0.0002, "epoch": 7.055276381909548, "step": 10530}, {"loss": 0.9683, "grad_norm": 1.38866126537323, "learning_rate": 0.0002, "epoch": 7.061976549413735, "step": 10540}, {"loss": 0.9961, "grad_norm": 1.3638999462127686, "learning_rate": 0.0002, "epoch": 7.068676716917923, "step": 10550}, {"loss": 0.9698, "grad_norm": 1.3181349039077759, "learning_rate": 0.0002, "epoch": 7.075376884422111, "step": 10560}, {"loss": 0.9963, "grad_norm": 1.2528657913208008, "learning_rate": 0.0002, "epoch": 7.082077051926298, "step": 10570}, {"loss": 0.9624, "grad_norm": 1.4326589107513428, "learning_rate": 0.0002, "epoch": 7.088777219430486, "step": 10580}, {"loss": 1.0432, "grad_norm": 1.4312337636947632, "learning_rate": 0.0002, "epoch": 7.0954773869346734, "step": 10590}, {"loss": 0.9704, "grad_norm": 1.7286990880966187, "learning_rate": 0.0002, "epoch": 7.102177554438861, "step": 10600}, {"loss": 0.9954, "grad_norm": 1.1248762607574463, "learning_rate": 0.0002, "epoch": 7.108877721943049, "step": 10610}, {"loss": 0.9736, "grad_norm": 1.278731346130371, "learning_rate": 0.0002, "epoch": 7.115577889447236, "step": 10620}, {"loss": 0.9885, "grad_norm": 1.53670072555542, "learning_rate": 0.0002, "epoch": 7.122278056951424, "step": 10630}, {"loss": 0.9573, "grad_norm": 1.268069863319397, "learning_rate": 0.0002, "epoch": 7.1289782244556115, "step": 10640}, {"loss": 0.9716, "grad_norm": 1.5072290897369385, "learning_rate": 0.0002, "epoch": 7.135678391959799, "step": 10650}, {"loss": 0.9871, "grad_norm": 1.5552845001220703, "learning_rate": 0.0002, "epoch": 7.142378559463987, "step": 10660}, {"loss": 1.0029, "grad_norm": 1.2643769979476929, "learning_rate": 0.0002, "epoch": 7.149078726968174, "step": 10670}, {"loss": 0.9848, "grad_norm": 1.347589373588562, "learning_rate": 0.0002, "epoch": 7.155778894472362, "step": 10680}, {"loss": 1.0193, "grad_norm": 1.364700436592102, "learning_rate": 0.0002, "epoch": 7.1624790619765495, "step": 10690}, {"loss": 1.0057, "grad_norm": 1.4375768899917603, "learning_rate": 0.0002, "epoch": 7.169179229480737, "step": 10700}, {"loss": 1.0108, "grad_norm": 1.426047444343567, "learning_rate": 0.0002, "epoch": 7.175879396984925, "step": 10710}, {"loss": 0.9985, "grad_norm": 1.5208740234375, "learning_rate": 0.0002, "epoch": 7.182579564489112, "step": 10720}, {"loss": 1.0535, "grad_norm": 1.4713337421417236, "learning_rate": 0.0002, "epoch": 7.1892797319933, "step": 10730}, {"loss": 0.9481, "grad_norm": 1.3042446374893188, "learning_rate": 0.0002, "epoch": 7.1959798994974875, "step": 10740}, {"loss": 0.9813, "grad_norm": 1.2290682792663574, "learning_rate": 0.0002, "epoch": 7.202680067001675, "step": 10750}, {"loss": 1.0437, "grad_norm": 1.1152390241622925, "learning_rate": 0.0002, "epoch": 7.209380234505863, "step": 10760}, {"loss": 0.9557, "grad_norm": 1.3632242679595947, "learning_rate": 0.0002, "epoch": 7.21608040201005, "step": 10770}, {"loss": 0.9915, "grad_norm": 1.2406541109085083, "learning_rate": 0.0002, "epoch": 7.222780569514238, "step": 10780}, {"loss": 1.0138, "grad_norm": 1.1965205669403076, "learning_rate": 0.0002, "epoch": 7.2294807370184255, "step": 10790}, {"loss": 1.0186, "grad_norm": 1.2895352840423584, "learning_rate": 0.0002, "epoch": 7.236180904522613, "step": 10800}, {"loss": 0.9616, "grad_norm": 1.388366937637329, "learning_rate": 0.0002, "epoch": 7.242881072026801, "step": 10810}, {"loss": 0.992, "grad_norm": 1.1411796808242798, "learning_rate": 0.0002, "epoch": 7.249581239530988, "step": 10820}, {"loss": 0.942, "grad_norm": 1.2220646142959595, "learning_rate": 0.0002, "epoch": 7.256281407035176, "step": 10830}, {"loss": 1.0136, "grad_norm": 1.495492696762085, "learning_rate": 0.0002, "epoch": 7.2629815745393635, "step": 10840}, {"loss": 1.0475, "grad_norm": 1.1395213603973389, "learning_rate": 0.0002, "epoch": 7.269681742043551, "step": 10850}, {"loss": 1.0134, "grad_norm": 1.3826487064361572, "learning_rate": 0.0002, "epoch": 7.276381909547739, "step": 10860}, {"loss": 0.944, "grad_norm": 1.4356474876403809, "learning_rate": 0.0002, "epoch": 7.283082077051926, "step": 10870}, {"loss": 0.971, "grad_norm": 1.3617557287216187, "learning_rate": 0.0002, "epoch": 7.289782244556114, "step": 10880}, {"loss": 0.9772, "grad_norm": 1.585394024848938, "learning_rate": 0.0002, "epoch": 7.296482412060302, "step": 10890}, {"loss": 1.0204, "grad_norm": 1.1442821025848389, "learning_rate": 0.0002, "epoch": 7.303182579564489, "step": 10900}, {"loss": 0.9608, "grad_norm": 1.4712985754013062, "learning_rate": 0.0002, "epoch": 7.309882747068677, "step": 10910}, {"loss": 1.0027, "grad_norm": 1.2864325046539307, "learning_rate": 0.0002, "epoch": 7.316582914572864, "step": 10920}, {"loss": 1.0341, "grad_norm": 1.308010458946228, "learning_rate": 0.0002, "epoch": 7.323283082077052, "step": 10930}, {"loss": 1.0096, "grad_norm": 1.4371414184570312, "learning_rate": 0.0002, "epoch": 7.32998324958124, "step": 10940}, {"loss": 0.9999, "grad_norm": 1.5968806743621826, "learning_rate": 0.0002, "epoch": 7.336683417085427, "step": 10950}, {"loss": 0.9611, "grad_norm": 1.3592339754104614, "learning_rate": 0.0002, "epoch": 7.343383584589615, "step": 10960}, {"loss": 1.0505, "grad_norm": 1.225520133972168, "learning_rate": 0.0002, "epoch": 7.350083752093802, "step": 10970}, {"loss": 0.9702, "grad_norm": 1.3138031959533691, "learning_rate": 0.0002, "epoch": 7.35678391959799, "step": 10980}, {"loss": 1.0062, "grad_norm": 1.2601540088653564, "learning_rate": 0.0002, "epoch": 7.363484087102178, "step": 10990}, {"loss": 1.0508, "grad_norm": 1.3437113761901855, "learning_rate": 0.0002, "epoch": 7.370184254606365, "step": 11000}, {"loss": 0.9649, "grad_norm": 1.0681836605072021, "learning_rate": 0.0002, "epoch": 7.376884422110553, "step": 11010}, {"loss": 1.0025, "grad_norm": 1.415852665901184, "learning_rate": 0.0002, "epoch": 7.38358458961474, "step": 11020}, {"loss": 1.0532, "grad_norm": 1.3147039413452148, "learning_rate": 0.0002, "epoch": 7.390284757118928, "step": 11030}, {"loss": 1.1188, "grad_norm": 1.4778614044189453, "learning_rate": 0.0002, "epoch": 7.396984924623116, "step": 11040}, {"loss": 0.9917, "grad_norm": 1.3137797117233276, "learning_rate": 0.0002, "epoch": 7.403685092127303, "step": 11050}, {"loss": 1.0115, "grad_norm": 1.1917701959609985, "learning_rate": 0.0002, "epoch": 7.410385259631491, "step": 11060}, {"loss": 1.0668, "grad_norm": 1.3999699354171753, "learning_rate": 0.0002, "epoch": 7.417085427135678, "step": 11070}, {"loss": 1.0311, "grad_norm": 1.3817322254180908, "learning_rate": 0.0002, "epoch": 7.423785594639866, "step": 11080}, {"loss": 1.0086, "grad_norm": 1.2086812257766724, "learning_rate": 0.0002, "epoch": 7.430485762144054, "step": 11090}, {"loss": 1.0002, "grad_norm": 1.1938024759292603, "learning_rate": 0.0002, "epoch": 7.437185929648241, "step": 11100}, {"loss": 1.0463, "grad_norm": 1.1543669700622559, "learning_rate": 0.0002, "epoch": 7.443886097152429, "step": 11110}, {"loss": 1.0814, "grad_norm": 1.158841848373413, "learning_rate": 0.0002, "epoch": 7.450586264656616, "step": 11120}, {"loss": 1.0022, "grad_norm": 1.2777763605117798, "learning_rate": 0.0002, "epoch": 7.457286432160804, "step": 11130}, {"loss": 1.0162, "grad_norm": 1.3375903367996216, "learning_rate": 0.0002, "epoch": 7.463986599664992, "step": 11140}, {"loss": 0.9995, "grad_norm": 1.5573794841766357, "learning_rate": 0.0002, "epoch": 7.4706867671691795, "step": 11150}, {"loss": 1.0698, "grad_norm": 1.3869640827178955, "learning_rate": 0.0002, "epoch": 7.477386934673367, "step": 11160}, {"loss": 1.0217, "grad_norm": 1.310341238975525, "learning_rate": 0.0002, "epoch": 7.484087102177554, "step": 11170}, {"loss": 1.0365, "grad_norm": 1.4249892234802246, "learning_rate": 0.0002, "epoch": 7.490787269681742, "step": 11180}, {"loss": 1.0346, "grad_norm": 1.4025905132293701, "learning_rate": 0.0002, "epoch": 7.49748743718593, "step": 11190}, {"loss": 1.0328, "grad_norm": 1.4361881017684937, "learning_rate": 0.0002, "epoch": 7.5041876046901175, "step": 11200}, {"loss": 1.0316, "grad_norm": 1.1791380643844604, "learning_rate": 0.0002, "epoch": 7.510887772194305, "step": 11210}, {"loss": 1.0621, "grad_norm": 1.415075421333313, "learning_rate": 0.0002, "epoch": 7.517587939698492, "step": 11220}, {"loss": 1.0265, "grad_norm": 1.3633697032928467, "learning_rate": 0.0002, "epoch": 7.52428810720268, "step": 11230}, {"loss": 0.983, "grad_norm": 1.2803648710250854, "learning_rate": 0.0002, "epoch": 7.530988274706868, "step": 11240}, {"loss": 1.0121, "grad_norm": 1.4032878875732422, "learning_rate": 0.0002, "epoch": 7.5376884422110555, "step": 11250}, {"loss": 1.032, "grad_norm": 1.4507378339767456, "learning_rate": 0.0002, "epoch": 7.544388609715243, "step": 11260}, {"loss": 1.0221, "grad_norm": 1.227613925933838, "learning_rate": 0.0002, "epoch": 7.55108877721943, "step": 11270}, {"loss": 1.0288, "grad_norm": 1.2620965242385864, "learning_rate": 0.0002, "epoch": 7.557788944723618, "step": 11280}, {"loss": 1.1263, "grad_norm": 1.3917821645736694, "learning_rate": 0.0002, "epoch": 7.564489112227806, "step": 11290}, {"loss": 1.0628, "grad_norm": 1.5720019340515137, "learning_rate": 0.0002, "epoch": 7.5711892797319935, "step": 11300}, {"loss": 1.0537, "grad_norm": 1.376694917678833, "learning_rate": 0.0002, "epoch": 7.577889447236181, "step": 11310}, {"loss": 1.012, "grad_norm": 1.4403680562973022, "learning_rate": 0.0002, "epoch": 7.584589614740368, "step": 11320}, {"loss": 1.0318, "grad_norm": 1.6306934356689453, "learning_rate": 0.0002, "epoch": 7.591289782244556, "step": 11330}, {"loss": 1.0297, "grad_norm": 1.2361583709716797, "learning_rate": 0.0002, "epoch": 7.597989949748744, "step": 11340}, {"loss": 1.022, "grad_norm": 1.2658058404922485, "learning_rate": 0.0002, "epoch": 7.6046901172529315, "step": 11350}, {"loss": 1.0357, "grad_norm": 1.5335173606872559, "learning_rate": 0.0002, "epoch": 7.611390284757119, "step": 11360}, {"loss": 1.0689, "grad_norm": 1.3432948589324951, "learning_rate": 0.0002, "epoch": 7.618090452261306, "step": 11370}, {"loss": 1.0613, "grad_norm": 1.374617338180542, "learning_rate": 0.0002, "epoch": 7.624790619765494, "step": 11380}, {"loss": 1.0722, "grad_norm": 1.3790302276611328, "learning_rate": 0.0002, "epoch": 7.631490787269682, "step": 11390}, {"loss": 1.0078, "grad_norm": 1.4256713390350342, "learning_rate": 0.0002, "epoch": 7.63819095477387, "step": 11400}, {"loss": 1.0314, "grad_norm": 1.271228313446045, "learning_rate": 0.0002, "epoch": 7.644891122278057, "step": 11410}, {"loss": 1.0678, "grad_norm": 1.432146668434143, "learning_rate": 0.0002, "epoch": 7.651591289782244, "step": 11420}, {"loss": 1.0496, "grad_norm": 1.2698006629943848, "learning_rate": 0.0002, "epoch": 7.658291457286432, "step": 11430}, {"loss": 1.0678, "grad_norm": 1.439496636390686, "learning_rate": 0.0002, "epoch": 7.66499162479062, "step": 11440}, {"loss": 1.0344, "grad_norm": 1.2079370021820068, "learning_rate": 0.0002, "epoch": 7.671691792294808, "step": 11450}, {"loss": 1.047, "grad_norm": 1.310412049293518, "learning_rate": 0.0002, "epoch": 7.678391959798995, "step": 11460}, {"loss": 1.0524, "grad_norm": 1.413438320159912, "learning_rate": 0.0002, "epoch": 7.685092127303182, "step": 11470}, {"loss": 0.9965, "grad_norm": 1.2390344142913818, "learning_rate": 0.0002, "epoch": 7.69179229480737, "step": 11480}, {"loss": 1.0477, "grad_norm": 1.3902971744537354, "learning_rate": 0.0002, "epoch": 7.698492462311558, "step": 11490}, {"loss": 0.9811, "grad_norm": 1.1194908618927002, "learning_rate": 0.0002, "epoch": 7.705192629815746, "step": 11500}, {"loss": 1.0959, "grad_norm": 1.625697374343872, "learning_rate": 0.0002, "epoch": 7.711892797319933, "step": 11510}, {"loss": 1.0285, "grad_norm": 1.339687466621399, "learning_rate": 0.0002, "epoch": 7.71859296482412, "step": 11520}, {"loss": 1.032, "grad_norm": 1.336680293083191, "learning_rate": 0.0002, "epoch": 7.725293132328308, "step": 11530}, {"loss": 1.0915, "grad_norm": 1.3316529989242554, "learning_rate": 0.0002, "epoch": 7.731993299832496, "step": 11540}, {"loss": 1.0031, "grad_norm": 1.2593837976455688, "learning_rate": 0.0002, "epoch": 7.738693467336684, "step": 11550}, {"loss": 1.019, "grad_norm": 1.2159652709960938, "learning_rate": 0.0002, "epoch": 7.745393634840871, "step": 11560}, {"loss": 1.0554, "grad_norm": 1.2919223308563232, "learning_rate": 0.0002, "epoch": 7.752093802345058, "step": 11570}, {"loss": 1.0072, "grad_norm": 1.2574092149734497, "learning_rate": 0.0002, "epoch": 7.758793969849246, "step": 11580}, {"loss": 1.0379, "grad_norm": 1.228236436843872, "learning_rate": 0.0002, "epoch": 7.765494137353434, "step": 11590}, {"loss": 1.0494, "grad_norm": 1.3790936470031738, "learning_rate": 0.0002, "epoch": 7.772194304857622, "step": 11600}, {"loss": 1.0256, "grad_norm": 1.419376015663147, "learning_rate": 0.0002, "epoch": 7.778894472361809, "step": 11610}, {"loss": 1.0985, "grad_norm": 1.3336344957351685, "learning_rate": 0.0002, "epoch": 7.785594639865996, "step": 11620}, {"loss": 1.107, "grad_norm": 1.5279520750045776, "learning_rate": 0.0002, "epoch": 7.792294807370184, "step": 11630}, {"loss": 1.1197, "grad_norm": 1.4296824932098389, "learning_rate": 0.0002, "epoch": 7.798994974874372, "step": 11640}, {"loss": 1.128, "grad_norm": 1.279316782951355, "learning_rate": 0.0002, "epoch": 7.80569514237856, "step": 11650}, {"loss": 1.0738, "grad_norm": 1.2764557600021362, "learning_rate": 0.0002, "epoch": 7.812395309882747, "step": 11660}, {"loss": 1.1157, "grad_norm": 1.1745330095291138, "learning_rate": 0.0002, "epoch": 7.819095477386934, "step": 11670}, {"loss": 1.1071, "grad_norm": 1.3440991640090942, "learning_rate": 0.0002, "epoch": 7.825795644891122, "step": 11680}, {"loss": 1.0751, "grad_norm": 1.1923167705535889, "learning_rate": 0.0002, "epoch": 7.83249581239531, "step": 11690}, {"loss": 1.0656, "grad_norm": 1.2679530382156372, "learning_rate": 0.0002, "epoch": 7.839195979899498, "step": 11700}, {"loss": 1.0504, "grad_norm": 1.3027020692825317, "learning_rate": 0.0002, "epoch": 7.8458961474036855, "step": 11710}, {"loss": 1.0496, "grad_norm": 1.4565616846084595, "learning_rate": 0.0002, "epoch": 7.852596314907872, "step": 11720}, {"loss": 1.0487, "grad_norm": 1.3157920837402344, "learning_rate": 0.0002, "epoch": 7.85929648241206, "step": 11730}, {"loss": 1.0876, "grad_norm": 1.3120285272598267, "learning_rate": 0.0002, "epoch": 7.865996649916248, "step": 11740}, {"loss": 1.0248, "grad_norm": 1.2625858783721924, "learning_rate": 0.0002, "epoch": 7.872696817420436, "step": 11750}, {"loss": 1.0047, "grad_norm": 1.3911317586898804, "learning_rate": 0.0002, "epoch": 7.8793969849246235, "step": 11760}, {"loss": 1.0436, "grad_norm": 1.2151232957839966, "learning_rate": 0.0002, "epoch": 7.88609715242881, "step": 11770}, {"loss": 1.0526, "grad_norm": 1.5171650648117065, "learning_rate": 0.0002, "epoch": 7.892797319932998, "step": 11780}, {"loss": 1.0957, "grad_norm": 1.4308419227600098, "learning_rate": 0.0002, "epoch": 7.899497487437186, "step": 11790}, {"loss": 1.0346, "grad_norm": 1.2683558464050293, "learning_rate": 0.0002, "epoch": 7.906197654941374, "step": 11800}, {"loss": 1.1134, "grad_norm": 1.6872674226760864, "learning_rate": 0.0002, "epoch": 7.9128978224455615, "step": 11810}, {"loss": 1.0077, "grad_norm": 1.3805692195892334, "learning_rate": 0.0002, "epoch": 7.919597989949748, "step": 11820}, {"loss": 1.1098, "grad_norm": 1.3342738151550293, "learning_rate": 0.0002, "epoch": 7.926298157453936, "step": 11830}, {"loss": 1.0446, "grad_norm": 1.308137059211731, "learning_rate": 0.0002, "epoch": 7.932998324958124, "step": 11840}, {"loss": 1.1032, "grad_norm": 1.3725523948669434, "learning_rate": 0.0002, "epoch": 7.939698492462312, "step": 11850}, {"loss": 1.0492, "grad_norm": 1.1458995342254639, "learning_rate": 0.0002, "epoch": 7.9463986599664995, "step": 11860}, {"loss": 1.0968, "grad_norm": 1.4715759754180908, "learning_rate": 0.0002, "epoch": 7.953098827470686, "step": 11870}, {"loss": 1.1408, "grad_norm": 1.1897743940353394, "learning_rate": 0.0002, "epoch": 7.959798994974874, "step": 11880}, {"loss": 1.0628, "grad_norm": 1.3386842012405396, "learning_rate": 0.0002, "epoch": 7.966499162479062, "step": 11890}, {"loss": 1.0463, "grad_norm": 1.3611114025115967, "learning_rate": 0.0002, "epoch": 7.97319932998325, "step": 11900}, {"loss": 1.1283, "grad_norm": 1.1429232358932495, "learning_rate": 0.0002, "epoch": 7.9798994974874375, "step": 11910}, {"loss": 1.1153, "grad_norm": 1.4848406314849854, "learning_rate": 0.0002, "epoch": 7.986599664991624, "step": 11920}, {"loss": 1.1159, "grad_norm": 1.3205432891845703, "learning_rate": 0.0002, "epoch": 7.993299832495812, "step": 11930}]}