diff --git a/.gitattributes b/.gitattributes index 33736f02c744a5faccc58f9e7e578bc77b8fca4e..306d15af5e77e2dd162b433825f1a0edc089143d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4522,3 +4522,12 @@ Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_ Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-0/checkpoint-6088/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-0/checkpoint-761/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..964125ab9fd48536bbe987c48ca9e996e5223801 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deffe23be2cae0be54e805f07b2006c881109d56dcdd7ea094daa5e4104a6166 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..964125ab9fd48536bbe987c48ca9e996e5223801 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deffe23be2cae0be54e805f07b2006c881109d56dcdd7ea094daa5e4104a6166 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4cc5164d2824cc1b5adb231cfd133903bd134581 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9128601b1dd4f51e15837ab80efc0df1aa48d229d6814175d61814abb8d2ab3a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..034f4a7091ed7da31368f43c194f34e37f4de50f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865e3a66c19eeaa7806ebffa8e8ab75dbe90f022d68806907afee9e68de72b83 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bba2ffdbf1afae84b09ca25865a1063b1b54a634 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7849289b953e46b8a593b8a68c5290cd1cb1d956d606e9dd6f3a91f537dd7d22 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..47d405655e45010dc1287a673cb6cda39e7b053c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/trainer_state.json @@ -0,0 +1,1092 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1497, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.927780808124006e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3a2bc9e3e10b99ab356fffee3ffefe177b4879f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc369814feac393d3ace2c6780b443da8e22f0aab9d9abebeb252b5c0de4273 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..031899978def968de0182cd1d377b3faffa9bc58 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5119c7bc18a0e8aa2d70ba4ae20bf571aff516d7dccfa15fb010c8e94ea9f47 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..21dee325a9613e8ae441cc919ec4b3971ea7784d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00e5bd0b522557bfdfbff732fff6e8df84a6a18784c9cc40d6cafab496f2033 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3260ae5b15b8abf75f8091264d9585cc281cc957 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cc8f3029874061e84dd272cb445cc335b8a37c374c04aff0d84b61cef30cb0c +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d2963f60946f7ada17c9a51136b80265031ec56f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/trainer_state.json @@ -0,0 +1,1625 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 2.9993319973279893, + "eval_steps": 10, + "global_step": 2245, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.039167121218601e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2245/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d9c47b281dd363276b5935cdd914f602fb5b097e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50145fdba72c154193c72a354fd9ed8a9384f2d380d2d052fff0853733b999b9 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c54f518264268b081d234b966559f7d6206e0061 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d67409eb615b81e95854e8d76155238d3b72eba70e87a1c57213652fb80c51 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..df8f7a931ab5382a60f213cbc64774cb39fd8340 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbf4809ab99e0a3867e853a0003750622809b7d78cab5822d0da5dd6eef01db1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b46a4e0b733f29f1695fec8b5c3c466f9cd7944 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8fa4ddd949eedb49a4d50d157f1003d3c620937763ed48cf89d88bbe557bcd6 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21498cc9e6bfdb5fd5c874cd8c31cd9880f711e9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/trainer_state.json @@ -0,0 +1,2158 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 2994, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + }, + { + "epoch": 3.006012024048096, + "grad_norm": 0.428824245929718, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 2250 + }, + { + "epoch": 3.01937207748831, + "grad_norm": 0.4870174825191498, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 2260 + }, + { + "epoch": 3.032732130928524, + "grad_norm": 0.4684266149997711, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 2270 + }, + { + "epoch": 3.0460921843687374, + "grad_norm": 0.581604540348053, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 2280 + }, + { + "epoch": 3.059452237808951, + "grad_norm": 0.5561677813529968, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 2290 + }, + { + "epoch": 3.072812291249165, + "grad_norm": 0.5750220417976379, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 2300 + }, + { + "epoch": 3.0861723446893787, + "grad_norm": 0.5704626441001892, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2310 + }, + { + "epoch": 3.0995323981295924, + "grad_norm": 0.6242083311080933, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 2320 + }, + { + "epoch": 3.1128924515698064, + "grad_norm": 0.5174121260643005, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 2330 + }, + { + "epoch": 3.12625250501002, + "grad_norm": 0.5697633028030396, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 2340 + }, + { + "epoch": 3.1396125584502337, + "grad_norm": 0.5969541072845459, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 2350 + }, + { + "epoch": 3.1529726118904478, + "grad_norm": 0.6244304180145264, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 2360 + }, + { + "epoch": 3.1663326653306614, + "grad_norm": 0.5561705827713013, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 2370 + }, + { + "epoch": 3.179692718770875, + "grad_norm": 0.5401188135147095, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2380 + }, + { + "epoch": 3.1930527722110886, + "grad_norm": 0.6450421810150146, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2390 + }, + { + "epoch": 3.2064128256513027, + "grad_norm": 0.5741903185844421, + "learning_rate": 0.0002, + "loss": 1.4839, + "step": 2400 + }, + { + "epoch": 3.2197728790915163, + "grad_norm": 0.6337407231330872, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2410 + }, + { + "epoch": 3.23313293253173, + "grad_norm": 0.6493517160415649, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 2420 + }, + { + "epoch": 3.246492985971944, + "grad_norm": 0.6230176091194153, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2430 + }, + { + "epoch": 3.2598530394121576, + "grad_norm": 0.680704653263092, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2440 + }, + { + "epoch": 3.2732130928523713, + "grad_norm": 0.5279417037963867, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2450 + }, + { + "epoch": 3.2865731462925853, + "grad_norm": 0.5601515173912048, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2460 + }, + { + "epoch": 3.299933199732799, + "grad_norm": 0.5591090321540833, + "learning_rate": 0.0002, + "loss": 1.4949, + "step": 2470 + }, + { + "epoch": 3.3132932531730126, + "grad_norm": 0.6596529483795166, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 2480 + }, + { + "epoch": 3.3266533066132267, + "grad_norm": 0.6115918755531311, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 2490 + }, + { + "epoch": 3.3400133600534403, + "grad_norm": 0.6443548202514648, + "learning_rate": 0.0002, + "loss": 1.5344, + "step": 2500 + }, + { + "epoch": 3.353373413493654, + "grad_norm": 0.5504242181777954, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 2510 + }, + { + "epoch": 3.3667334669338675, + "grad_norm": 0.6104483604431152, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 2520 + }, + { + "epoch": 3.3800935203740816, + "grad_norm": 0.8387531638145447, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2530 + }, + { + "epoch": 3.3934535738142952, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2540 + }, + { + "epoch": 3.406813627254509, + "grad_norm": 0.6261265873908997, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 2550 + }, + { + "epoch": 3.420173680694723, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0002, + "loss": 1.5233, + "step": 2560 + }, + { + "epoch": 3.4335337341349366, + "grad_norm": 0.5291280746459961, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 2570 + }, + { + "epoch": 3.44689378757515, + "grad_norm": 0.6133161783218384, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2580 + }, + { + "epoch": 3.460253841015364, + "grad_norm": 0.623573362827301, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2590 + }, + { + "epoch": 3.473613894455578, + "grad_norm": 0.5959834456443787, + "learning_rate": 0.0002, + "loss": 1.4935, + "step": 2600 + }, + { + "epoch": 3.4869739478957915, + "grad_norm": 0.583332359790802, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2610 + }, + { + "epoch": 3.5003340013360056, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2620 + }, + { + "epoch": 3.513694054776219, + "grad_norm": 0.5832992196083069, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2630 + }, + { + "epoch": 3.527054108216433, + "grad_norm": 0.5942609906196594, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 2640 + }, + { + "epoch": 3.5404141616566465, + "grad_norm": 0.6087163686752319, + "learning_rate": 0.0002, + "loss": 1.5213, + "step": 2650 + }, + { + "epoch": 3.5537742150968605, + "grad_norm": 0.631948709487915, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2660 + }, + { + "epoch": 3.567134268537074, + "grad_norm": 0.6450803279876709, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2670 + }, + { + "epoch": 3.580494321977288, + "grad_norm": 0.6507797837257385, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2680 + }, + { + "epoch": 3.593854375417502, + "grad_norm": 0.5778017044067383, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2690 + }, + { + "epoch": 3.6072144288577155, + "grad_norm": 0.6214032173156738, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 2700 + }, + { + "epoch": 3.620574482297929, + "grad_norm": 0.5681133270263672, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 2710 + }, + { + "epoch": 3.6339345357381427, + "grad_norm": 0.6074244976043701, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 2720 + }, + { + "epoch": 3.647294589178357, + "grad_norm": 0.5900560617446899, + "learning_rate": 0.0002, + "loss": 1.5243, + "step": 2730 + }, + { + "epoch": 3.6606546426185704, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2740 + }, + { + "epoch": 3.6740146960587845, + "grad_norm": 0.6095547676086426, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2750 + }, + { + "epoch": 3.687374749498998, + "grad_norm": 0.612790584564209, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2760 + }, + { + "epoch": 3.7007348029392118, + "grad_norm": 0.6574140787124634, + "learning_rate": 0.0002, + "loss": 1.4976, + "step": 2770 + }, + { + "epoch": 3.7140948563794254, + "grad_norm": 0.5643761157989502, + "learning_rate": 0.0002, + "loss": 1.5306, + "step": 2780 + }, + { + "epoch": 3.727454909819639, + "grad_norm": 0.5652621388435364, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2790 + }, + { + "epoch": 3.740814963259853, + "grad_norm": 0.5604206323623657, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2800 + }, + { + "epoch": 3.7541750167000667, + "grad_norm": 3.911022663116455, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2810 + }, + { + "epoch": 3.7675350701402808, + "grad_norm": 0.6148333549499512, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2820 + }, + { + "epoch": 3.7808951235804944, + "grad_norm": 0.5605677962303162, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 2830 + }, + { + "epoch": 3.794255177020708, + "grad_norm": 0.6101965308189392, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 2840 + }, + { + "epoch": 3.8076152304609217, + "grad_norm": 0.5387342572212219, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2850 + }, + { + "epoch": 3.8209752839011357, + "grad_norm": 0.5733087062835693, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 2860 + }, + { + "epoch": 3.8343353373413493, + "grad_norm": 0.6538485884666443, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2870 + }, + { + "epoch": 3.847695390781563, + "grad_norm": 0.6247632503509521, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 2880 + }, + { + "epoch": 3.861055444221777, + "grad_norm": 0.5745735764503479, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2890 + }, + { + "epoch": 3.8744154976619907, + "grad_norm": 0.5942763686180115, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2900 + }, + { + "epoch": 3.8877755511022043, + "grad_norm": 0.7086281776428223, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2910 + }, + { + "epoch": 3.901135604542418, + "grad_norm": 0.8825129866600037, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 2920 + }, + { + "epoch": 3.914495657982632, + "grad_norm": 0.6260842680931091, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2930 + }, + { + "epoch": 3.9278557114228456, + "grad_norm": 0.6015968322753906, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 2940 + }, + { + "epoch": 3.9412157648630597, + "grad_norm": 0.7042809128761292, + "learning_rate": 0.0002, + "loss": 1.4931, + "step": 2950 + }, + { + "epoch": 3.9545758183032733, + "grad_norm": 0.5860083699226379, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2960 + }, + { + "epoch": 3.967935871743487, + "grad_norm": 0.5939757823944092, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2970 + }, + { + "epoch": 3.9812959251837006, + "grad_norm": 0.5523964166641235, + "learning_rate": 0.0002, + "loss": 1.408, + "step": 2980 + }, + { + "epoch": 3.9946559786239146, + "grad_norm": 0.6380264759063721, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2990 + }, + { + "epoch": 4.0, + "eval_loss": 1.8875294923782349, + "eval_runtime": 38.5837, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.685, + "step": 2994 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3855561616248013e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-2994/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23c3d054d1bcb21dc734680243dd8e58f0349c53 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0234f046adbb732e82d46b9c17830ca358b6f13dd3813abc98021c87c81720a +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..330afd31aeb9ffc16dad948e238b713a67db8121 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63365542cce7e88bcd49021c067c91257e90ae815e16060fa11bb928f9845cf2 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bb95309cd0eef0bb2b5b08b07f02149b35250e82 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c405372416e5b315715bd99e32cc1f94f9f3960af6a536965472b554f320c098 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a7d93751d92907fd672b134139791f8f7198d2c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:368c92ac0b2b6d0ba315c25de97abc6927c0b1af9ed72d1d47ac3cfdc9ec46be +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dac1384a5d0bc26351f3cefc189777b5b3b8c39e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/trainer_state.json @@ -0,0 +1,2691 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 4.999331997327989, + "eval_steps": 10, + "global_step": 3742, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + }, + { + "epoch": 3.006012024048096, + "grad_norm": 0.428824245929718, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 2250 + }, + { + "epoch": 3.01937207748831, + "grad_norm": 0.4870174825191498, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 2260 + }, + { + "epoch": 3.032732130928524, + "grad_norm": 0.4684266149997711, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 2270 + }, + { + "epoch": 3.0460921843687374, + "grad_norm": 0.581604540348053, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 2280 + }, + { + "epoch": 3.059452237808951, + "grad_norm": 0.5561677813529968, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 2290 + }, + { + "epoch": 3.072812291249165, + "grad_norm": 0.5750220417976379, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 2300 + }, + { + "epoch": 3.0861723446893787, + "grad_norm": 0.5704626441001892, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2310 + }, + { + "epoch": 3.0995323981295924, + "grad_norm": 0.6242083311080933, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 2320 + }, + { + "epoch": 3.1128924515698064, + "grad_norm": 0.5174121260643005, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 2330 + }, + { + "epoch": 3.12625250501002, + "grad_norm": 0.5697633028030396, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 2340 + }, + { + "epoch": 3.1396125584502337, + "grad_norm": 0.5969541072845459, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 2350 + }, + { + "epoch": 3.1529726118904478, + "grad_norm": 0.6244304180145264, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 2360 + }, + { + "epoch": 3.1663326653306614, + "grad_norm": 0.5561705827713013, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 2370 + }, + { + "epoch": 3.179692718770875, + "grad_norm": 0.5401188135147095, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2380 + }, + { + "epoch": 3.1930527722110886, + "grad_norm": 0.6450421810150146, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2390 + }, + { + "epoch": 3.2064128256513027, + "grad_norm": 0.5741903185844421, + "learning_rate": 0.0002, + "loss": 1.4839, + "step": 2400 + }, + { + "epoch": 3.2197728790915163, + "grad_norm": 0.6337407231330872, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2410 + }, + { + "epoch": 3.23313293253173, + "grad_norm": 0.6493517160415649, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 2420 + }, + { + "epoch": 3.246492985971944, + "grad_norm": 0.6230176091194153, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2430 + }, + { + "epoch": 3.2598530394121576, + "grad_norm": 0.680704653263092, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2440 + }, + { + "epoch": 3.2732130928523713, + "grad_norm": 0.5279417037963867, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2450 + }, + { + "epoch": 3.2865731462925853, + "grad_norm": 0.5601515173912048, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2460 + }, + { + "epoch": 3.299933199732799, + "grad_norm": 0.5591090321540833, + "learning_rate": 0.0002, + "loss": 1.4949, + "step": 2470 + }, + { + "epoch": 3.3132932531730126, + "grad_norm": 0.6596529483795166, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 2480 + }, + { + "epoch": 3.3266533066132267, + "grad_norm": 0.6115918755531311, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 2490 + }, + { + "epoch": 3.3400133600534403, + "grad_norm": 0.6443548202514648, + "learning_rate": 0.0002, + "loss": 1.5344, + "step": 2500 + }, + { + "epoch": 3.353373413493654, + "grad_norm": 0.5504242181777954, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 2510 + }, + { + "epoch": 3.3667334669338675, + "grad_norm": 0.6104483604431152, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 2520 + }, + { + "epoch": 3.3800935203740816, + "grad_norm": 0.8387531638145447, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2530 + }, + { + "epoch": 3.3934535738142952, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2540 + }, + { + "epoch": 3.406813627254509, + "grad_norm": 0.6261265873908997, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 2550 + }, + { + "epoch": 3.420173680694723, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0002, + "loss": 1.5233, + "step": 2560 + }, + { + "epoch": 3.4335337341349366, + "grad_norm": 0.5291280746459961, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 2570 + }, + { + "epoch": 3.44689378757515, + "grad_norm": 0.6133161783218384, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2580 + }, + { + "epoch": 3.460253841015364, + "grad_norm": 0.623573362827301, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2590 + }, + { + "epoch": 3.473613894455578, + "grad_norm": 0.5959834456443787, + "learning_rate": 0.0002, + "loss": 1.4935, + "step": 2600 + }, + { + "epoch": 3.4869739478957915, + "grad_norm": 0.583332359790802, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2610 + }, + { + "epoch": 3.5003340013360056, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2620 + }, + { + "epoch": 3.513694054776219, + "grad_norm": 0.5832992196083069, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2630 + }, + { + "epoch": 3.527054108216433, + "grad_norm": 0.5942609906196594, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 2640 + }, + { + "epoch": 3.5404141616566465, + "grad_norm": 0.6087163686752319, + "learning_rate": 0.0002, + "loss": 1.5213, + "step": 2650 + }, + { + "epoch": 3.5537742150968605, + "grad_norm": 0.631948709487915, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2660 + }, + { + "epoch": 3.567134268537074, + "grad_norm": 0.6450803279876709, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2670 + }, + { + "epoch": 3.580494321977288, + "grad_norm": 0.6507797837257385, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2680 + }, + { + "epoch": 3.593854375417502, + "grad_norm": 0.5778017044067383, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2690 + }, + { + "epoch": 3.6072144288577155, + "grad_norm": 0.6214032173156738, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 2700 + }, + { + "epoch": 3.620574482297929, + "grad_norm": 0.5681133270263672, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 2710 + }, + { + "epoch": 3.6339345357381427, + "grad_norm": 0.6074244976043701, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 2720 + }, + { + "epoch": 3.647294589178357, + "grad_norm": 0.5900560617446899, + "learning_rate": 0.0002, + "loss": 1.5243, + "step": 2730 + }, + { + "epoch": 3.6606546426185704, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2740 + }, + { + "epoch": 3.6740146960587845, + "grad_norm": 0.6095547676086426, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2750 + }, + { + "epoch": 3.687374749498998, + "grad_norm": 0.612790584564209, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2760 + }, + { + "epoch": 3.7007348029392118, + "grad_norm": 0.6574140787124634, + "learning_rate": 0.0002, + "loss": 1.4976, + "step": 2770 + }, + { + "epoch": 3.7140948563794254, + "grad_norm": 0.5643761157989502, + "learning_rate": 0.0002, + "loss": 1.5306, + "step": 2780 + }, + { + "epoch": 3.727454909819639, + "grad_norm": 0.5652621388435364, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2790 + }, + { + "epoch": 3.740814963259853, + "grad_norm": 0.5604206323623657, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2800 + }, + { + "epoch": 3.7541750167000667, + "grad_norm": 3.911022663116455, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2810 + }, + { + "epoch": 3.7675350701402808, + "grad_norm": 0.6148333549499512, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2820 + }, + { + "epoch": 3.7808951235804944, + "grad_norm": 0.5605677962303162, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 2830 + }, + { + "epoch": 3.794255177020708, + "grad_norm": 0.6101965308189392, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 2840 + }, + { + "epoch": 3.8076152304609217, + "grad_norm": 0.5387342572212219, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2850 + }, + { + "epoch": 3.8209752839011357, + "grad_norm": 0.5733087062835693, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 2860 + }, + { + "epoch": 3.8343353373413493, + "grad_norm": 0.6538485884666443, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2870 + }, + { + "epoch": 3.847695390781563, + "grad_norm": 0.6247632503509521, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 2880 + }, + { + "epoch": 3.861055444221777, + "grad_norm": 0.5745735764503479, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2890 + }, + { + "epoch": 3.8744154976619907, + "grad_norm": 0.5942763686180115, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2900 + }, + { + "epoch": 3.8877755511022043, + "grad_norm": 0.7086281776428223, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2910 + }, + { + "epoch": 3.901135604542418, + "grad_norm": 0.8825129866600037, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 2920 + }, + { + "epoch": 3.914495657982632, + "grad_norm": 0.6260842680931091, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2930 + }, + { + "epoch": 3.9278557114228456, + "grad_norm": 0.6015968322753906, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 2940 + }, + { + "epoch": 3.9412157648630597, + "grad_norm": 0.7042809128761292, + "learning_rate": 0.0002, + "loss": 1.4931, + "step": 2950 + }, + { + "epoch": 3.9545758183032733, + "grad_norm": 0.5860083699226379, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2960 + }, + { + "epoch": 3.967935871743487, + "grad_norm": 0.5939757823944092, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2970 + }, + { + "epoch": 3.9812959251837006, + "grad_norm": 0.5523964166641235, + "learning_rate": 0.0002, + "loss": 1.408, + "step": 2980 + }, + { + "epoch": 3.9946559786239146, + "grad_norm": 0.6380264759063721, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2990 + }, + { + "epoch": 4.0, + "eval_loss": 1.8875294923782349, + "eval_runtime": 38.5837, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.685, + "step": 2994 + }, + { + "epoch": 4.008016032064128, + "grad_norm": 0.5478564500808716, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3000 + }, + { + "epoch": 4.021376085504342, + "grad_norm": 0.9384379982948303, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 3010 + }, + { + "epoch": 4.034736138944556, + "grad_norm": 0.7819344401359558, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 3020 + }, + { + "epoch": 4.04809619238477, + "grad_norm": 0.7737417817115784, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 3030 + }, + { + "epoch": 4.061456245824983, + "grad_norm": 0.8893805742263794, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 3040 + }, + { + "epoch": 4.074816299265197, + "grad_norm": 0.7759843468666077, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 3050 + }, + { + "epoch": 4.0881763527054105, + "grad_norm": 0.642654538154602, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3060 + }, + { + "epoch": 4.101536406145625, + "grad_norm": 0.8515549302101135, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 3070 + }, + { + "epoch": 4.114896459585839, + "grad_norm": 0.7033658623695374, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 3080 + }, + { + "epoch": 4.128256513026052, + "grad_norm": 0.7063882946968079, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 3090 + }, + { + "epoch": 4.141616566466266, + "grad_norm": 0.6946853995323181, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3100 + }, + { + "epoch": 4.1549766199064795, + "grad_norm": 0.7286741137504578, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 3110 + }, + { + "epoch": 4.168336673346693, + "grad_norm": 0.7894193530082703, + "learning_rate": 0.0002, + "loss": 1.3061, + "step": 3120 + }, + { + "epoch": 4.181696726786907, + "grad_norm": 0.7005895376205444, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 3130 + }, + { + "epoch": 4.195056780227121, + "grad_norm": 0.799567461013794, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 3140 + }, + { + "epoch": 4.208416833667335, + "grad_norm": 0.7010157108306885, + "learning_rate": 0.0002, + "loss": 1.3813, + "step": 3150 + }, + { + "epoch": 4.2217768871075485, + "grad_norm": 0.7489650249481201, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3160 + }, + { + "epoch": 4.235136940547762, + "grad_norm": 0.7908048629760742, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 3170 + }, + { + "epoch": 4.248496993987976, + "grad_norm": 0.7002180814743042, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 3180 + }, + { + "epoch": 4.261857047428189, + "grad_norm": 0.8339495062828064, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 3190 + }, + { + "epoch": 4.275217100868403, + "grad_norm": 0.7884618043899536, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 3200 + }, + { + "epoch": 4.2885771543086175, + "grad_norm": 0.7964122295379639, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 3210 + }, + { + "epoch": 4.301937207748831, + "grad_norm": 0.838646650314331, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 3220 + }, + { + "epoch": 4.315297261189045, + "grad_norm": 0.8063107132911682, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 3230 + }, + { + "epoch": 4.328657314629258, + "grad_norm": 0.8147385120391846, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 3240 + }, + { + "epoch": 4.342017368069472, + "grad_norm": 0.7636798620223999, + "learning_rate": 0.0002, + "loss": 1.4118, + "step": 3250 + }, + { + "epoch": 4.355377421509686, + "grad_norm": 0.7530609965324402, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 3260 + }, + { + "epoch": 4.3687374749499, + "grad_norm": 0.8853573799133301, + "learning_rate": 0.0002, + "loss": 1.3507, + "step": 3270 + }, + { + "epoch": 4.382097528390114, + "grad_norm": 0.7180975675582886, + "learning_rate": 0.0002, + "loss": 1.3614, + "step": 3280 + }, + { + "epoch": 4.395457581830327, + "grad_norm": 0.837150514125824, + "learning_rate": 0.0002, + "loss": 1.4119, + "step": 3290 + }, + { + "epoch": 4.408817635270541, + "grad_norm": 0.8370638489723206, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3300 + }, + { + "epoch": 4.422177688710755, + "grad_norm": 0.7738229036331177, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 3310 + }, + { + "epoch": 4.435537742150968, + "grad_norm": 0.7665290832519531, + "learning_rate": 0.0002, + "loss": 1.4195, + "step": 3320 + }, + { + "epoch": 4.448897795591183, + "grad_norm": 0.7547745704650879, + "learning_rate": 0.0002, + "loss": 1.3308, + "step": 3330 + }, + { + "epoch": 4.462257849031396, + "grad_norm": 0.7421861290931702, + "learning_rate": 0.0002, + "loss": 1.4165, + "step": 3340 + }, + { + "epoch": 4.47561790247161, + "grad_norm": 0.8042104244232178, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3350 + }, + { + "epoch": 4.488977955911824, + "grad_norm": 0.8111839890480042, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 3360 + }, + { + "epoch": 4.502338009352037, + "grad_norm": 0.7998340129852295, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 3370 + }, + { + "epoch": 4.515698062792251, + "grad_norm": 0.7668877243995667, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 3380 + }, + { + "epoch": 4.529058116232465, + "grad_norm": 0.7986718416213989, + "learning_rate": 0.0002, + "loss": 1.3972, + "step": 3390 + }, + { + "epoch": 4.542418169672679, + "grad_norm": 0.6806602478027344, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 3400 + }, + { + "epoch": 4.555778223112893, + "grad_norm": 0.8788819909095764, + "learning_rate": 0.0002, + "loss": 1.3942, + "step": 3410 + }, + { + "epoch": 4.569138276553106, + "grad_norm": 0.7499664425849915, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 3420 + }, + { + "epoch": 4.58249832999332, + "grad_norm": 0.7967109084129333, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 3430 + }, + { + "epoch": 4.595858383433534, + "grad_norm": 0.759639322757721, + "learning_rate": 0.0002, + "loss": 1.3531, + "step": 3440 + }, + { + "epoch": 4.609218436873747, + "grad_norm": 0.8327916264533997, + "learning_rate": 0.0002, + "loss": 1.3517, + "step": 3450 + }, + { + "epoch": 4.622578490313961, + "grad_norm": 0.7400892376899719, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 3460 + }, + { + "epoch": 4.635938543754175, + "grad_norm": 0.8116602301597595, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 3470 + }, + { + "epoch": 4.649298597194389, + "grad_norm": 0.7604362368583679, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 3480 + }, + { + "epoch": 4.662658650634603, + "grad_norm": 0.7397996783256531, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 3490 + }, + { + "epoch": 4.676018704074816, + "grad_norm": 0.869293749332428, + "learning_rate": 0.0002, + "loss": 1.4048, + "step": 3500 + }, + { + "epoch": 4.68937875751503, + "grad_norm": 0.6854358315467834, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 3510 + }, + { + "epoch": 4.7027388109552435, + "grad_norm": 0.8326661586761475, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 3520 + }, + { + "epoch": 4.716098864395457, + "grad_norm": 0.6887506246566772, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 3530 + }, + { + "epoch": 4.729458917835672, + "grad_norm": 3.837689161300659, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3540 + }, + { + "epoch": 4.742818971275885, + "grad_norm": 0.6874563694000244, + "learning_rate": 0.0002, + "loss": 1.3775, + "step": 3550 + }, + { + "epoch": 4.756179024716099, + "grad_norm": 0.8340407609939575, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 3560 + }, + { + "epoch": 4.7695390781563125, + "grad_norm": 0.7286418676376343, + "learning_rate": 0.0002, + "loss": 1.3556, + "step": 3570 + }, + { + "epoch": 4.782899131596526, + "grad_norm": 0.7239373326301575, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3580 + }, + { + "epoch": 4.796259185036741, + "grad_norm": 0.831310510635376, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3590 + }, + { + "epoch": 4.809619238476954, + "grad_norm": 0.767715573310852, + "learning_rate": 0.0002, + "loss": 1.4146, + "step": 3600 + }, + { + "epoch": 4.822979291917168, + "grad_norm": 0.9013199210166931, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 3610 + }, + { + "epoch": 4.8363393453573815, + "grad_norm": 0.7543512582778931, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3620 + }, + { + "epoch": 4.849699398797595, + "grad_norm": 0.7626057267189026, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3630 + }, + { + "epoch": 4.863059452237809, + "grad_norm": 0.847079336643219, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 3640 + }, + { + "epoch": 4.876419505678022, + "grad_norm": 0.8273295760154724, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 3650 + }, + { + "epoch": 4.889779559118237, + "grad_norm": 0.7675244808197021, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3660 + }, + { + "epoch": 4.9031396125584505, + "grad_norm": 0.9560356736183167, + "learning_rate": 0.0002, + "loss": 1.4894, + "step": 3670 + }, + { + "epoch": 4.916499665998664, + "grad_norm": 0.7682451605796814, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3680 + }, + { + "epoch": 4.929859719438878, + "grad_norm": 0.8113830089569092, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 3690 + }, + { + "epoch": 4.943219772879091, + "grad_norm": 0.7642542719841003, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 3700 + }, + { + "epoch": 4.956579826319305, + "grad_norm": 0.823863685131073, + "learning_rate": 0.0002, + "loss": 1.403, + "step": 3710 + }, + { + "epoch": 4.969939879759519, + "grad_norm": 0.8287797570228577, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 3720 + }, + { + "epoch": 4.983299933199733, + "grad_norm": 0.778170108795166, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3730 + }, + { + "epoch": 4.996659986639947, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3740 + }, + { + "epoch": 4.999331997327989, + "eval_loss": 1.9638569355010986, + "eval_runtime": 38.5725, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 3742 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7319452020310016e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-3742/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ff2b51d5983169399d252f98c92a13f3bbdc79e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:202ff5a7c06e1be89d42c147ce31dc497efcf6b513dcf3c7a42bee16ae322db5 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d4a83c0d7f5404bd603a5efc2381af337978c7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dd22daa5b07b64b4f46b5455deaff3f2811e9fdfdb6c7feba673d9eff289ea0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf8dcb9191ef22ebeaab53bf82e205ac0258fc75 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71846d766611c4ff35f96afdd2c6f029ba6ba3bb4a869ea9e5a37f2cf03d1c0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5e273c84eb88987519772be274a0a747f74d245 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcae50b846cfe798b14178881a8e1b25456e98e5002e123c0b7e9104d2ee731 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6c9b2eaf7064fedf9183215522c1475fdc37d831 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/trainer_state.json @@ -0,0 +1,3224 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4491, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + }, + { + "epoch": 3.006012024048096, + "grad_norm": 0.428824245929718, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 2250 + }, + { + "epoch": 3.01937207748831, + "grad_norm": 0.4870174825191498, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 2260 + }, + { + "epoch": 3.032732130928524, + "grad_norm": 0.4684266149997711, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 2270 + }, + { + "epoch": 3.0460921843687374, + "grad_norm": 0.581604540348053, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 2280 + }, + { + "epoch": 3.059452237808951, + "grad_norm": 0.5561677813529968, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 2290 + }, + { + "epoch": 3.072812291249165, + "grad_norm": 0.5750220417976379, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 2300 + }, + { + "epoch": 3.0861723446893787, + "grad_norm": 0.5704626441001892, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2310 + }, + { + "epoch": 3.0995323981295924, + "grad_norm": 0.6242083311080933, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 2320 + }, + { + "epoch": 3.1128924515698064, + "grad_norm": 0.5174121260643005, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 2330 + }, + { + "epoch": 3.12625250501002, + "grad_norm": 0.5697633028030396, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 2340 + }, + { + "epoch": 3.1396125584502337, + "grad_norm": 0.5969541072845459, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 2350 + }, + { + "epoch": 3.1529726118904478, + "grad_norm": 0.6244304180145264, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 2360 + }, + { + "epoch": 3.1663326653306614, + "grad_norm": 0.5561705827713013, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 2370 + }, + { + "epoch": 3.179692718770875, + "grad_norm": 0.5401188135147095, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2380 + }, + { + "epoch": 3.1930527722110886, + "grad_norm": 0.6450421810150146, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2390 + }, + { + "epoch": 3.2064128256513027, + "grad_norm": 0.5741903185844421, + "learning_rate": 0.0002, + "loss": 1.4839, + "step": 2400 + }, + { + "epoch": 3.2197728790915163, + "grad_norm": 0.6337407231330872, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2410 + }, + { + "epoch": 3.23313293253173, + "grad_norm": 0.6493517160415649, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 2420 + }, + { + "epoch": 3.246492985971944, + "grad_norm": 0.6230176091194153, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2430 + }, + { + "epoch": 3.2598530394121576, + "grad_norm": 0.680704653263092, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2440 + }, + { + "epoch": 3.2732130928523713, + "grad_norm": 0.5279417037963867, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2450 + }, + { + "epoch": 3.2865731462925853, + "grad_norm": 0.5601515173912048, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2460 + }, + { + "epoch": 3.299933199732799, + "grad_norm": 0.5591090321540833, + "learning_rate": 0.0002, + "loss": 1.4949, + "step": 2470 + }, + { + "epoch": 3.3132932531730126, + "grad_norm": 0.6596529483795166, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 2480 + }, + { + "epoch": 3.3266533066132267, + "grad_norm": 0.6115918755531311, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 2490 + }, + { + "epoch": 3.3400133600534403, + "grad_norm": 0.6443548202514648, + "learning_rate": 0.0002, + "loss": 1.5344, + "step": 2500 + }, + { + "epoch": 3.353373413493654, + "grad_norm": 0.5504242181777954, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 2510 + }, + { + "epoch": 3.3667334669338675, + "grad_norm": 0.6104483604431152, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 2520 + }, + { + "epoch": 3.3800935203740816, + "grad_norm": 0.8387531638145447, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2530 + }, + { + "epoch": 3.3934535738142952, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2540 + }, + { + "epoch": 3.406813627254509, + "grad_norm": 0.6261265873908997, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 2550 + }, + { + "epoch": 3.420173680694723, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0002, + "loss": 1.5233, + "step": 2560 + }, + { + "epoch": 3.4335337341349366, + "grad_norm": 0.5291280746459961, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 2570 + }, + { + "epoch": 3.44689378757515, + "grad_norm": 0.6133161783218384, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2580 + }, + { + "epoch": 3.460253841015364, + "grad_norm": 0.623573362827301, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2590 + }, + { + "epoch": 3.473613894455578, + "grad_norm": 0.5959834456443787, + "learning_rate": 0.0002, + "loss": 1.4935, + "step": 2600 + }, + { + "epoch": 3.4869739478957915, + "grad_norm": 0.583332359790802, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2610 + }, + { + "epoch": 3.5003340013360056, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2620 + }, + { + "epoch": 3.513694054776219, + "grad_norm": 0.5832992196083069, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2630 + }, + { + "epoch": 3.527054108216433, + "grad_norm": 0.5942609906196594, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 2640 + }, + { + "epoch": 3.5404141616566465, + "grad_norm": 0.6087163686752319, + "learning_rate": 0.0002, + "loss": 1.5213, + "step": 2650 + }, + { + "epoch": 3.5537742150968605, + "grad_norm": 0.631948709487915, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2660 + }, + { + "epoch": 3.567134268537074, + "grad_norm": 0.6450803279876709, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2670 + }, + { + "epoch": 3.580494321977288, + "grad_norm": 0.6507797837257385, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2680 + }, + { + "epoch": 3.593854375417502, + "grad_norm": 0.5778017044067383, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2690 + }, + { + "epoch": 3.6072144288577155, + "grad_norm": 0.6214032173156738, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 2700 + }, + { + "epoch": 3.620574482297929, + "grad_norm": 0.5681133270263672, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 2710 + }, + { + "epoch": 3.6339345357381427, + "grad_norm": 0.6074244976043701, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 2720 + }, + { + "epoch": 3.647294589178357, + "grad_norm": 0.5900560617446899, + "learning_rate": 0.0002, + "loss": 1.5243, + "step": 2730 + }, + { + "epoch": 3.6606546426185704, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2740 + }, + { + "epoch": 3.6740146960587845, + "grad_norm": 0.6095547676086426, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2750 + }, + { + "epoch": 3.687374749498998, + "grad_norm": 0.612790584564209, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2760 + }, + { + "epoch": 3.7007348029392118, + "grad_norm": 0.6574140787124634, + "learning_rate": 0.0002, + "loss": 1.4976, + "step": 2770 + }, + { + "epoch": 3.7140948563794254, + "grad_norm": 0.5643761157989502, + "learning_rate": 0.0002, + "loss": 1.5306, + "step": 2780 + }, + { + "epoch": 3.727454909819639, + "grad_norm": 0.5652621388435364, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2790 + }, + { + "epoch": 3.740814963259853, + "grad_norm": 0.5604206323623657, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2800 + }, + { + "epoch": 3.7541750167000667, + "grad_norm": 3.911022663116455, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2810 + }, + { + "epoch": 3.7675350701402808, + "grad_norm": 0.6148333549499512, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2820 + }, + { + "epoch": 3.7808951235804944, + "grad_norm": 0.5605677962303162, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 2830 + }, + { + "epoch": 3.794255177020708, + "grad_norm": 0.6101965308189392, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 2840 + }, + { + "epoch": 3.8076152304609217, + "grad_norm": 0.5387342572212219, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2850 + }, + { + "epoch": 3.8209752839011357, + "grad_norm": 0.5733087062835693, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 2860 + }, + { + "epoch": 3.8343353373413493, + "grad_norm": 0.6538485884666443, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2870 + }, + { + "epoch": 3.847695390781563, + "grad_norm": 0.6247632503509521, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 2880 + }, + { + "epoch": 3.861055444221777, + "grad_norm": 0.5745735764503479, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2890 + }, + { + "epoch": 3.8744154976619907, + "grad_norm": 0.5942763686180115, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2900 + }, + { + "epoch": 3.8877755511022043, + "grad_norm": 0.7086281776428223, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2910 + }, + { + "epoch": 3.901135604542418, + "grad_norm": 0.8825129866600037, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 2920 + }, + { + "epoch": 3.914495657982632, + "grad_norm": 0.6260842680931091, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2930 + }, + { + "epoch": 3.9278557114228456, + "grad_norm": 0.6015968322753906, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 2940 + }, + { + "epoch": 3.9412157648630597, + "grad_norm": 0.7042809128761292, + "learning_rate": 0.0002, + "loss": 1.4931, + "step": 2950 + }, + { + "epoch": 3.9545758183032733, + "grad_norm": 0.5860083699226379, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2960 + }, + { + "epoch": 3.967935871743487, + "grad_norm": 0.5939757823944092, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2970 + }, + { + "epoch": 3.9812959251837006, + "grad_norm": 0.5523964166641235, + "learning_rate": 0.0002, + "loss": 1.408, + "step": 2980 + }, + { + "epoch": 3.9946559786239146, + "grad_norm": 0.6380264759063721, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2990 + }, + { + "epoch": 4.0, + "eval_loss": 1.8875294923782349, + "eval_runtime": 38.5837, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.685, + "step": 2994 + }, + { + "epoch": 4.008016032064128, + "grad_norm": 0.5478564500808716, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3000 + }, + { + "epoch": 4.021376085504342, + "grad_norm": 0.9384379982948303, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 3010 + }, + { + "epoch": 4.034736138944556, + "grad_norm": 0.7819344401359558, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 3020 + }, + { + "epoch": 4.04809619238477, + "grad_norm": 0.7737417817115784, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 3030 + }, + { + "epoch": 4.061456245824983, + "grad_norm": 0.8893805742263794, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 3040 + }, + { + "epoch": 4.074816299265197, + "grad_norm": 0.7759843468666077, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 3050 + }, + { + "epoch": 4.0881763527054105, + "grad_norm": 0.642654538154602, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3060 + }, + { + "epoch": 4.101536406145625, + "grad_norm": 0.8515549302101135, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 3070 + }, + { + "epoch": 4.114896459585839, + "grad_norm": 0.7033658623695374, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 3080 + }, + { + "epoch": 4.128256513026052, + "grad_norm": 0.7063882946968079, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 3090 + }, + { + "epoch": 4.141616566466266, + "grad_norm": 0.6946853995323181, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3100 + }, + { + "epoch": 4.1549766199064795, + "grad_norm": 0.7286741137504578, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 3110 + }, + { + "epoch": 4.168336673346693, + "grad_norm": 0.7894193530082703, + "learning_rate": 0.0002, + "loss": 1.3061, + "step": 3120 + }, + { + "epoch": 4.181696726786907, + "grad_norm": 0.7005895376205444, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 3130 + }, + { + "epoch": 4.195056780227121, + "grad_norm": 0.799567461013794, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 3140 + }, + { + "epoch": 4.208416833667335, + "grad_norm": 0.7010157108306885, + "learning_rate": 0.0002, + "loss": 1.3813, + "step": 3150 + }, + { + "epoch": 4.2217768871075485, + "grad_norm": 0.7489650249481201, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3160 + }, + { + "epoch": 4.235136940547762, + "grad_norm": 0.7908048629760742, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 3170 + }, + { + "epoch": 4.248496993987976, + "grad_norm": 0.7002180814743042, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 3180 + }, + { + "epoch": 4.261857047428189, + "grad_norm": 0.8339495062828064, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 3190 + }, + { + "epoch": 4.275217100868403, + "grad_norm": 0.7884618043899536, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 3200 + }, + { + "epoch": 4.2885771543086175, + "grad_norm": 0.7964122295379639, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 3210 + }, + { + "epoch": 4.301937207748831, + "grad_norm": 0.838646650314331, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 3220 + }, + { + "epoch": 4.315297261189045, + "grad_norm": 0.8063107132911682, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 3230 + }, + { + "epoch": 4.328657314629258, + "grad_norm": 0.8147385120391846, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 3240 + }, + { + "epoch": 4.342017368069472, + "grad_norm": 0.7636798620223999, + "learning_rate": 0.0002, + "loss": 1.4118, + "step": 3250 + }, + { + "epoch": 4.355377421509686, + "grad_norm": 0.7530609965324402, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 3260 + }, + { + "epoch": 4.3687374749499, + "grad_norm": 0.8853573799133301, + "learning_rate": 0.0002, + "loss": 1.3507, + "step": 3270 + }, + { + "epoch": 4.382097528390114, + "grad_norm": 0.7180975675582886, + "learning_rate": 0.0002, + "loss": 1.3614, + "step": 3280 + }, + { + "epoch": 4.395457581830327, + "grad_norm": 0.837150514125824, + "learning_rate": 0.0002, + "loss": 1.4119, + "step": 3290 + }, + { + "epoch": 4.408817635270541, + "grad_norm": 0.8370638489723206, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3300 + }, + { + "epoch": 4.422177688710755, + "grad_norm": 0.7738229036331177, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 3310 + }, + { + "epoch": 4.435537742150968, + "grad_norm": 0.7665290832519531, + "learning_rate": 0.0002, + "loss": 1.4195, + "step": 3320 + }, + { + "epoch": 4.448897795591183, + "grad_norm": 0.7547745704650879, + "learning_rate": 0.0002, + "loss": 1.3308, + "step": 3330 + }, + { + "epoch": 4.462257849031396, + "grad_norm": 0.7421861290931702, + "learning_rate": 0.0002, + "loss": 1.4165, + "step": 3340 + }, + { + "epoch": 4.47561790247161, + "grad_norm": 0.8042104244232178, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3350 + }, + { + "epoch": 4.488977955911824, + "grad_norm": 0.8111839890480042, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 3360 + }, + { + "epoch": 4.502338009352037, + "grad_norm": 0.7998340129852295, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 3370 + }, + { + "epoch": 4.515698062792251, + "grad_norm": 0.7668877243995667, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 3380 + }, + { + "epoch": 4.529058116232465, + "grad_norm": 0.7986718416213989, + "learning_rate": 0.0002, + "loss": 1.3972, + "step": 3390 + }, + { + "epoch": 4.542418169672679, + "grad_norm": 0.6806602478027344, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 3400 + }, + { + "epoch": 4.555778223112893, + "grad_norm": 0.8788819909095764, + "learning_rate": 0.0002, + "loss": 1.3942, + "step": 3410 + }, + { + "epoch": 4.569138276553106, + "grad_norm": 0.7499664425849915, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 3420 + }, + { + "epoch": 4.58249832999332, + "grad_norm": 0.7967109084129333, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 3430 + }, + { + "epoch": 4.595858383433534, + "grad_norm": 0.759639322757721, + "learning_rate": 0.0002, + "loss": 1.3531, + "step": 3440 + }, + { + "epoch": 4.609218436873747, + "grad_norm": 0.8327916264533997, + "learning_rate": 0.0002, + "loss": 1.3517, + "step": 3450 + }, + { + "epoch": 4.622578490313961, + "grad_norm": 0.7400892376899719, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 3460 + }, + { + "epoch": 4.635938543754175, + "grad_norm": 0.8116602301597595, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 3470 + }, + { + "epoch": 4.649298597194389, + "grad_norm": 0.7604362368583679, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 3480 + }, + { + "epoch": 4.662658650634603, + "grad_norm": 0.7397996783256531, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 3490 + }, + { + "epoch": 4.676018704074816, + "grad_norm": 0.869293749332428, + "learning_rate": 0.0002, + "loss": 1.4048, + "step": 3500 + }, + { + "epoch": 4.68937875751503, + "grad_norm": 0.6854358315467834, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 3510 + }, + { + "epoch": 4.7027388109552435, + "grad_norm": 0.8326661586761475, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 3520 + }, + { + "epoch": 4.716098864395457, + "grad_norm": 0.6887506246566772, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 3530 + }, + { + "epoch": 4.729458917835672, + "grad_norm": 3.837689161300659, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3540 + }, + { + "epoch": 4.742818971275885, + "grad_norm": 0.6874563694000244, + "learning_rate": 0.0002, + "loss": 1.3775, + "step": 3550 + }, + { + "epoch": 4.756179024716099, + "grad_norm": 0.8340407609939575, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 3560 + }, + { + "epoch": 4.7695390781563125, + "grad_norm": 0.7286418676376343, + "learning_rate": 0.0002, + "loss": 1.3556, + "step": 3570 + }, + { + "epoch": 4.782899131596526, + "grad_norm": 0.7239373326301575, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3580 + }, + { + "epoch": 4.796259185036741, + "grad_norm": 0.831310510635376, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3590 + }, + { + "epoch": 4.809619238476954, + "grad_norm": 0.767715573310852, + "learning_rate": 0.0002, + "loss": 1.4146, + "step": 3600 + }, + { + "epoch": 4.822979291917168, + "grad_norm": 0.9013199210166931, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 3610 + }, + { + "epoch": 4.8363393453573815, + "grad_norm": 0.7543512582778931, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3620 + }, + { + "epoch": 4.849699398797595, + "grad_norm": 0.7626057267189026, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3630 + }, + { + "epoch": 4.863059452237809, + "grad_norm": 0.847079336643219, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 3640 + }, + { + "epoch": 4.876419505678022, + "grad_norm": 0.8273295760154724, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 3650 + }, + { + "epoch": 4.889779559118237, + "grad_norm": 0.7675244808197021, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3660 + }, + { + "epoch": 4.9031396125584505, + "grad_norm": 0.9560356736183167, + "learning_rate": 0.0002, + "loss": 1.4894, + "step": 3670 + }, + { + "epoch": 4.916499665998664, + "grad_norm": 0.7682451605796814, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3680 + }, + { + "epoch": 4.929859719438878, + "grad_norm": 0.8113830089569092, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 3690 + }, + { + "epoch": 4.943219772879091, + "grad_norm": 0.7642542719841003, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 3700 + }, + { + "epoch": 4.956579826319305, + "grad_norm": 0.823863685131073, + "learning_rate": 0.0002, + "loss": 1.403, + "step": 3710 + }, + { + "epoch": 4.969939879759519, + "grad_norm": 0.8287797570228577, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 3720 + }, + { + "epoch": 4.983299933199733, + "grad_norm": 0.778170108795166, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3730 + }, + { + "epoch": 4.996659986639947, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3740 + }, + { + "epoch": 4.999331997327989, + "eval_loss": 1.9638569355010986, + "eval_runtime": 38.5725, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 3742 + }, + { + "epoch": 5.01002004008016, + "grad_norm": 0.8864085078239441, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 3750 + }, + { + "epoch": 5.023380093520374, + "grad_norm": 0.9191637635231018, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 3760 + }, + { + "epoch": 5.036740146960588, + "grad_norm": 0.749519407749176, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 3770 + }, + { + "epoch": 5.050100200400801, + "grad_norm": 0.7916892170906067, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 3780 + }, + { + "epoch": 5.063460253841015, + "grad_norm": 1.0318909883499146, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 3790 + }, + { + "epoch": 5.0768203072812295, + "grad_norm": 1.028586745262146, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 3800 + }, + { + "epoch": 5.090180360721443, + "grad_norm": 1.0568538904190063, + "learning_rate": 0.0002, + "loss": 1.1769, + "step": 3810 + }, + { + "epoch": 5.103540414161657, + "grad_norm": 0.9780595302581787, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3820 + }, + { + "epoch": 5.11690046760187, + "grad_norm": 1.10311758518219, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 3830 + }, + { + "epoch": 5.130260521042084, + "grad_norm": 0.9497154355049133, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3840 + }, + { + "epoch": 5.143620574482298, + "grad_norm": 0.948279857635498, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 3850 + }, + { + "epoch": 5.156980627922512, + "grad_norm": 0.9497880339622498, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3860 + }, + { + "epoch": 5.170340681362726, + "grad_norm": 1.3213258981704712, + "learning_rate": 0.0002, + "loss": 1.1876, + "step": 3870 + }, + { + "epoch": 5.183700734802939, + "grad_norm": 0.9835752248764038, + "learning_rate": 0.0002, + "loss": 1.2327, + "step": 3880 + }, + { + "epoch": 5.197060788243153, + "grad_norm": 0.8426132202148438, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3890 + }, + { + "epoch": 5.210420841683367, + "grad_norm": 1.0343470573425293, + "learning_rate": 0.0002, + "loss": 1.2066, + "step": 3900 + }, + { + "epoch": 5.22378089512358, + "grad_norm": 1.0771924257278442, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 3910 + }, + { + "epoch": 5.237140948563794, + "grad_norm": 0.8542634844779968, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3920 + }, + { + "epoch": 5.250501002004008, + "grad_norm": 1.1021966934204102, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3930 + }, + { + "epoch": 5.263861055444222, + "grad_norm": 1.170011281967163, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 3940 + }, + { + "epoch": 5.277221108884436, + "grad_norm": 0.9787653684616089, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3950 + }, + { + "epoch": 5.290581162324649, + "grad_norm": 0.914513885974884, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3960 + }, + { + "epoch": 5.303941215764863, + "grad_norm": 1.0831562280654907, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 3970 + }, + { + "epoch": 5.3173012692050765, + "grad_norm": 0.9810112714767456, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 3980 + }, + { + "epoch": 5.330661322645291, + "grad_norm": 0.9624066948890686, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 3990 + }, + { + "epoch": 5.344021376085505, + "grad_norm": 1.2296923398971558, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 4000 + }, + { + "epoch": 5.357381429525718, + "grad_norm": 1.011299967765808, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 4010 + }, + { + "epoch": 5.370741482965932, + "grad_norm": 0.9144132733345032, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 4020 + }, + { + "epoch": 5.3841015364061455, + "grad_norm": 1.0573601722717285, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4030 + }, + { + "epoch": 5.397461589846359, + "grad_norm": 1.1667137145996094, + "learning_rate": 0.0002, + "loss": 1.2295, + "step": 4040 + }, + { + "epoch": 5.410821643286573, + "grad_norm": 1.072070598602295, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 4050 + }, + { + "epoch": 5.424181696726787, + "grad_norm": 1.1005792617797852, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4060 + }, + { + "epoch": 5.437541750167001, + "grad_norm": 1.033581018447876, + "learning_rate": 0.0002, + "loss": 1.2604, + "step": 4070 + }, + { + "epoch": 5.4509018036072145, + "grad_norm": 0.9537439942359924, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 4080 + }, + { + "epoch": 5.464261857047428, + "grad_norm": 1.0502177476882935, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 4090 + }, + { + "epoch": 5.477621910487642, + "grad_norm": 0.9098296761512756, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4100 + }, + { + "epoch": 5.490981963927855, + "grad_norm": 0.9551953077316284, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4110 + }, + { + "epoch": 5.504342017368069, + "grad_norm": 0.9169427156448364, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 4120 + }, + { + "epoch": 5.517702070808284, + "grad_norm": 0.9430235624313354, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 4130 + }, + { + "epoch": 5.531062124248497, + "grad_norm": 0.817259669303894, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 4140 + }, + { + "epoch": 5.544422177688711, + "grad_norm": 1.124152660369873, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 4150 + }, + { + "epoch": 5.557782231128924, + "grad_norm": 0.9250756502151489, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 4160 + }, + { + "epoch": 5.571142284569138, + "grad_norm": 0.9582970142364502, + "learning_rate": 0.0002, + "loss": 1.2492, + "step": 4170 + }, + { + "epoch": 5.584502338009352, + "grad_norm": 1.0078704357147217, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 4180 + }, + { + "epoch": 5.597862391449565, + "grad_norm": 0.9585610032081604, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 4190 + }, + { + "epoch": 5.61122244488978, + "grad_norm": 1.0150971412658691, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 4200 + }, + { + "epoch": 5.6245824983299935, + "grad_norm": 0.9943351149559021, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 4210 + }, + { + "epoch": 5.637942551770207, + "grad_norm": 0.8880936503410339, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 4220 + }, + { + "epoch": 5.651302605210421, + "grad_norm": 0.9873887896537781, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4230 + }, + { + "epoch": 5.664662658650634, + "grad_norm": 0.9185152649879456, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 4240 + }, + { + "epoch": 5.678022712090849, + "grad_norm": 1.0706779956817627, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 4250 + }, + { + "epoch": 5.6913827655310625, + "grad_norm": 0.9660224914550781, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 4260 + }, + { + "epoch": 5.704742818971276, + "grad_norm": 0.8685019612312317, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 4270 + }, + { + "epoch": 5.71810287241149, + "grad_norm": 1.0390565395355225, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 4280 + }, + { + "epoch": 5.731462925851703, + "grad_norm": 0.9290478825569153, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 4290 + }, + { + "epoch": 5.744822979291917, + "grad_norm": 1.0361281633377075, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 4300 + }, + { + "epoch": 5.758183032732131, + "grad_norm": 0.8804615139961243, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 4310 + }, + { + "epoch": 5.771543086172345, + "grad_norm": 1.0051425695419312, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4320 + }, + { + "epoch": 5.784903139612559, + "grad_norm": 1.0051119327545166, + "learning_rate": 0.0002, + "loss": 1.1946, + "step": 4330 + }, + { + "epoch": 5.798263193052772, + "grad_norm": 0.9961661100387573, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 5.811623246492986, + "grad_norm": 1.0229419469833374, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 4350 + }, + { + "epoch": 5.8249832999332, + "grad_norm": 1.1129552125930786, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 4360 + }, + { + "epoch": 5.838343353373413, + "grad_norm": 1.18964421749115, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 4370 + }, + { + "epoch": 5.851703406813627, + "grad_norm": 0.9490230083465576, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4380 + }, + { + "epoch": 5.865063460253841, + "grad_norm": 0.8734540343284607, + "learning_rate": 0.0002, + "loss": 1.3177, + "step": 4390 + }, + { + "epoch": 5.878423513694055, + "grad_norm": 1.0017802715301514, + "learning_rate": 0.0002, + "loss": 1.3131, + "step": 4400 + }, + { + "epoch": 5.891783567134269, + "grad_norm": 0.953556478023529, + "learning_rate": 0.0002, + "loss": 1.2649, + "step": 4410 + }, + { + "epoch": 5.905143620574482, + "grad_norm": 0.8915258646011353, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4420 + }, + { + "epoch": 5.918503674014696, + "grad_norm": 0.9715141654014587, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 4430 + }, + { + "epoch": 5.9318637274549095, + "grad_norm": 0.9432152509689331, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 4440 + }, + { + "epoch": 5.945223780895123, + "grad_norm": 0.9473979473114014, + "learning_rate": 0.0002, + "loss": 1.233, + "step": 4450 + }, + { + "epoch": 5.958583834335338, + "grad_norm": 1.104871392250061, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 4460 + }, + { + "epoch": 5.971943887775551, + "grad_norm": 1.0308905839920044, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 4470 + }, + { + "epoch": 5.985303941215765, + "grad_norm": 0.8895487189292908, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 4480 + }, + { + "epoch": 5.9986639946559785, + "grad_norm": 1.0148485898971558, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 4490 + }, + { + "epoch": 6.0, + "eval_loss": 2.0830726623535156, + "eval_runtime": 38.5442, + "eval_samples_per_second": 13.361, + "eval_steps_per_second": 1.686, + "step": 4491 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.078334242437202e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-4491/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a01b2fc0b54d84f37ddfffdc3279cd1ee50a212 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b119d47c1fb9705c3c5b3c8e56823df542b9f9bc6c68f6f86338f8725b6b1e43 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..99e069f2d50b91e6b7545c4b385daeb0cbf21098 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d21b5883ac12049ec22ad5b60bacc185273486591700a7cc13e07ab7ac28bc +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea52eb077424f886c63d82e99e83a2b8287acf73 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2224c66f4cb4f8ad60fb15486306df8a451cd14b191e6073feaf2b5420feb8 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6f8ad78e16357cf5218d5719bd480e9f376c074 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b994104d1f62e5245f79451223748faec7b00256fca74faf69dded4a46ebdd6 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1cb4185ba24bb16b23dd15f0c1cc5ff6b31d9afd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/trainer_state.json @@ -0,0 +1,3750 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 6.999331997327989, + "eval_steps": 10, + "global_step": 5239, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + }, + { + "epoch": 3.006012024048096, + "grad_norm": 0.428824245929718, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 2250 + }, + { + "epoch": 3.01937207748831, + "grad_norm": 0.4870174825191498, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 2260 + }, + { + "epoch": 3.032732130928524, + "grad_norm": 0.4684266149997711, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 2270 + }, + { + "epoch": 3.0460921843687374, + "grad_norm": 0.581604540348053, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 2280 + }, + { + "epoch": 3.059452237808951, + "grad_norm": 0.5561677813529968, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 2290 + }, + { + "epoch": 3.072812291249165, + "grad_norm": 0.5750220417976379, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 2300 + }, + { + "epoch": 3.0861723446893787, + "grad_norm": 0.5704626441001892, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2310 + }, + { + "epoch": 3.0995323981295924, + "grad_norm": 0.6242083311080933, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 2320 + }, + { + "epoch": 3.1128924515698064, + "grad_norm": 0.5174121260643005, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 2330 + }, + { + "epoch": 3.12625250501002, + "grad_norm": 0.5697633028030396, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 2340 + }, + { + "epoch": 3.1396125584502337, + "grad_norm": 0.5969541072845459, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 2350 + }, + { + "epoch": 3.1529726118904478, + "grad_norm": 0.6244304180145264, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 2360 + }, + { + "epoch": 3.1663326653306614, + "grad_norm": 0.5561705827713013, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 2370 + }, + { + "epoch": 3.179692718770875, + "grad_norm": 0.5401188135147095, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2380 + }, + { + "epoch": 3.1930527722110886, + "grad_norm": 0.6450421810150146, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2390 + }, + { + "epoch": 3.2064128256513027, + "grad_norm": 0.5741903185844421, + "learning_rate": 0.0002, + "loss": 1.4839, + "step": 2400 + }, + { + "epoch": 3.2197728790915163, + "grad_norm": 0.6337407231330872, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2410 + }, + { + "epoch": 3.23313293253173, + "grad_norm": 0.6493517160415649, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 2420 + }, + { + "epoch": 3.246492985971944, + "grad_norm": 0.6230176091194153, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2430 + }, + { + "epoch": 3.2598530394121576, + "grad_norm": 0.680704653263092, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2440 + }, + { + "epoch": 3.2732130928523713, + "grad_norm": 0.5279417037963867, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2450 + }, + { + "epoch": 3.2865731462925853, + "grad_norm": 0.5601515173912048, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2460 + }, + { + "epoch": 3.299933199732799, + "grad_norm": 0.5591090321540833, + "learning_rate": 0.0002, + "loss": 1.4949, + "step": 2470 + }, + { + "epoch": 3.3132932531730126, + "grad_norm": 0.6596529483795166, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 2480 + }, + { + "epoch": 3.3266533066132267, + "grad_norm": 0.6115918755531311, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 2490 + }, + { + "epoch": 3.3400133600534403, + "grad_norm": 0.6443548202514648, + "learning_rate": 0.0002, + "loss": 1.5344, + "step": 2500 + }, + { + "epoch": 3.353373413493654, + "grad_norm": 0.5504242181777954, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 2510 + }, + { + "epoch": 3.3667334669338675, + "grad_norm": 0.6104483604431152, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 2520 + }, + { + "epoch": 3.3800935203740816, + "grad_norm": 0.8387531638145447, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2530 + }, + { + "epoch": 3.3934535738142952, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2540 + }, + { + "epoch": 3.406813627254509, + "grad_norm": 0.6261265873908997, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 2550 + }, + { + "epoch": 3.420173680694723, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0002, + "loss": 1.5233, + "step": 2560 + }, + { + "epoch": 3.4335337341349366, + "grad_norm": 0.5291280746459961, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 2570 + }, + { + "epoch": 3.44689378757515, + "grad_norm": 0.6133161783218384, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2580 + }, + { + "epoch": 3.460253841015364, + "grad_norm": 0.623573362827301, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2590 + }, + { + "epoch": 3.473613894455578, + "grad_norm": 0.5959834456443787, + "learning_rate": 0.0002, + "loss": 1.4935, + "step": 2600 + }, + { + "epoch": 3.4869739478957915, + "grad_norm": 0.583332359790802, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2610 + }, + { + "epoch": 3.5003340013360056, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2620 + }, + { + "epoch": 3.513694054776219, + "grad_norm": 0.5832992196083069, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2630 + }, + { + "epoch": 3.527054108216433, + "grad_norm": 0.5942609906196594, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 2640 + }, + { + "epoch": 3.5404141616566465, + "grad_norm": 0.6087163686752319, + "learning_rate": 0.0002, + "loss": 1.5213, + "step": 2650 + }, + { + "epoch": 3.5537742150968605, + "grad_norm": 0.631948709487915, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2660 + }, + { + "epoch": 3.567134268537074, + "grad_norm": 0.6450803279876709, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2670 + }, + { + "epoch": 3.580494321977288, + "grad_norm": 0.6507797837257385, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2680 + }, + { + "epoch": 3.593854375417502, + "grad_norm": 0.5778017044067383, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2690 + }, + { + "epoch": 3.6072144288577155, + "grad_norm": 0.6214032173156738, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 2700 + }, + { + "epoch": 3.620574482297929, + "grad_norm": 0.5681133270263672, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 2710 + }, + { + "epoch": 3.6339345357381427, + "grad_norm": 0.6074244976043701, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 2720 + }, + { + "epoch": 3.647294589178357, + "grad_norm": 0.5900560617446899, + "learning_rate": 0.0002, + "loss": 1.5243, + "step": 2730 + }, + { + "epoch": 3.6606546426185704, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2740 + }, + { + "epoch": 3.6740146960587845, + "grad_norm": 0.6095547676086426, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2750 + }, + { + "epoch": 3.687374749498998, + "grad_norm": 0.612790584564209, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2760 + }, + { + "epoch": 3.7007348029392118, + "grad_norm": 0.6574140787124634, + "learning_rate": 0.0002, + "loss": 1.4976, + "step": 2770 + }, + { + "epoch": 3.7140948563794254, + "grad_norm": 0.5643761157989502, + "learning_rate": 0.0002, + "loss": 1.5306, + "step": 2780 + }, + { + "epoch": 3.727454909819639, + "grad_norm": 0.5652621388435364, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2790 + }, + { + "epoch": 3.740814963259853, + "grad_norm": 0.5604206323623657, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2800 + }, + { + "epoch": 3.7541750167000667, + "grad_norm": 3.911022663116455, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2810 + }, + { + "epoch": 3.7675350701402808, + "grad_norm": 0.6148333549499512, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2820 + }, + { + "epoch": 3.7808951235804944, + "grad_norm": 0.5605677962303162, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 2830 + }, + { + "epoch": 3.794255177020708, + "grad_norm": 0.6101965308189392, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 2840 + }, + { + "epoch": 3.8076152304609217, + "grad_norm": 0.5387342572212219, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2850 + }, + { + "epoch": 3.8209752839011357, + "grad_norm": 0.5733087062835693, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 2860 + }, + { + "epoch": 3.8343353373413493, + "grad_norm": 0.6538485884666443, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2870 + }, + { + "epoch": 3.847695390781563, + "grad_norm": 0.6247632503509521, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 2880 + }, + { + "epoch": 3.861055444221777, + "grad_norm": 0.5745735764503479, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2890 + }, + { + "epoch": 3.8744154976619907, + "grad_norm": 0.5942763686180115, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2900 + }, + { + "epoch": 3.8877755511022043, + "grad_norm": 0.7086281776428223, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2910 + }, + { + "epoch": 3.901135604542418, + "grad_norm": 0.8825129866600037, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 2920 + }, + { + "epoch": 3.914495657982632, + "grad_norm": 0.6260842680931091, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2930 + }, + { + "epoch": 3.9278557114228456, + "grad_norm": 0.6015968322753906, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 2940 + }, + { + "epoch": 3.9412157648630597, + "grad_norm": 0.7042809128761292, + "learning_rate": 0.0002, + "loss": 1.4931, + "step": 2950 + }, + { + "epoch": 3.9545758183032733, + "grad_norm": 0.5860083699226379, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2960 + }, + { + "epoch": 3.967935871743487, + "grad_norm": 0.5939757823944092, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2970 + }, + { + "epoch": 3.9812959251837006, + "grad_norm": 0.5523964166641235, + "learning_rate": 0.0002, + "loss": 1.408, + "step": 2980 + }, + { + "epoch": 3.9946559786239146, + "grad_norm": 0.6380264759063721, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2990 + }, + { + "epoch": 4.0, + "eval_loss": 1.8875294923782349, + "eval_runtime": 38.5837, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.685, + "step": 2994 + }, + { + "epoch": 4.008016032064128, + "grad_norm": 0.5478564500808716, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3000 + }, + { + "epoch": 4.021376085504342, + "grad_norm": 0.9384379982948303, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 3010 + }, + { + "epoch": 4.034736138944556, + "grad_norm": 0.7819344401359558, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 3020 + }, + { + "epoch": 4.04809619238477, + "grad_norm": 0.7737417817115784, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 3030 + }, + { + "epoch": 4.061456245824983, + "grad_norm": 0.8893805742263794, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 3040 + }, + { + "epoch": 4.074816299265197, + "grad_norm": 0.7759843468666077, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 3050 + }, + { + "epoch": 4.0881763527054105, + "grad_norm": 0.642654538154602, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3060 + }, + { + "epoch": 4.101536406145625, + "grad_norm": 0.8515549302101135, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 3070 + }, + { + "epoch": 4.114896459585839, + "grad_norm": 0.7033658623695374, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 3080 + }, + { + "epoch": 4.128256513026052, + "grad_norm": 0.7063882946968079, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 3090 + }, + { + "epoch": 4.141616566466266, + "grad_norm": 0.6946853995323181, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3100 + }, + { + "epoch": 4.1549766199064795, + "grad_norm": 0.7286741137504578, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 3110 + }, + { + "epoch": 4.168336673346693, + "grad_norm": 0.7894193530082703, + "learning_rate": 0.0002, + "loss": 1.3061, + "step": 3120 + }, + { + "epoch": 4.181696726786907, + "grad_norm": 0.7005895376205444, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 3130 + }, + { + "epoch": 4.195056780227121, + "grad_norm": 0.799567461013794, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 3140 + }, + { + "epoch": 4.208416833667335, + "grad_norm": 0.7010157108306885, + "learning_rate": 0.0002, + "loss": 1.3813, + "step": 3150 + }, + { + "epoch": 4.2217768871075485, + "grad_norm": 0.7489650249481201, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3160 + }, + { + "epoch": 4.235136940547762, + "grad_norm": 0.7908048629760742, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 3170 + }, + { + "epoch": 4.248496993987976, + "grad_norm": 0.7002180814743042, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 3180 + }, + { + "epoch": 4.261857047428189, + "grad_norm": 0.8339495062828064, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 3190 + }, + { + "epoch": 4.275217100868403, + "grad_norm": 0.7884618043899536, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 3200 + }, + { + "epoch": 4.2885771543086175, + "grad_norm": 0.7964122295379639, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 3210 + }, + { + "epoch": 4.301937207748831, + "grad_norm": 0.838646650314331, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 3220 + }, + { + "epoch": 4.315297261189045, + "grad_norm": 0.8063107132911682, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 3230 + }, + { + "epoch": 4.328657314629258, + "grad_norm": 0.8147385120391846, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 3240 + }, + { + "epoch": 4.342017368069472, + "grad_norm": 0.7636798620223999, + "learning_rate": 0.0002, + "loss": 1.4118, + "step": 3250 + }, + { + "epoch": 4.355377421509686, + "grad_norm": 0.7530609965324402, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 3260 + }, + { + "epoch": 4.3687374749499, + "grad_norm": 0.8853573799133301, + "learning_rate": 0.0002, + "loss": 1.3507, + "step": 3270 + }, + { + "epoch": 4.382097528390114, + "grad_norm": 0.7180975675582886, + "learning_rate": 0.0002, + "loss": 1.3614, + "step": 3280 + }, + { + "epoch": 4.395457581830327, + "grad_norm": 0.837150514125824, + "learning_rate": 0.0002, + "loss": 1.4119, + "step": 3290 + }, + { + "epoch": 4.408817635270541, + "grad_norm": 0.8370638489723206, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3300 + }, + { + "epoch": 4.422177688710755, + "grad_norm": 0.7738229036331177, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 3310 + }, + { + "epoch": 4.435537742150968, + "grad_norm": 0.7665290832519531, + "learning_rate": 0.0002, + "loss": 1.4195, + "step": 3320 + }, + { + "epoch": 4.448897795591183, + "grad_norm": 0.7547745704650879, + "learning_rate": 0.0002, + "loss": 1.3308, + "step": 3330 + }, + { + "epoch": 4.462257849031396, + "grad_norm": 0.7421861290931702, + "learning_rate": 0.0002, + "loss": 1.4165, + "step": 3340 + }, + { + "epoch": 4.47561790247161, + "grad_norm": 0.8042104244232178, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3350 + }, + { + "epoch": 4.488977955911824, + "grad_norm": 0.8111839890480042, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 3360 + }, + { + "epoch": 4.502338009352037, + "grad_norm": 0.7998340129852295, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 3370 + }, + { + "epoch": 4.515698062792251, + "grad_norm": 0.7668877243995667, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 3380 + }, + { + "epoch": 4.529058116232465, + "grad_norm": 0.7986718416213989, + "learning_rate": 0.0002, + "loss": 1.3972, + "step": 3390 + }, + { + "epoch": 4.542418169672679, + "grad_norm": 0.6806602478027344, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 3400 + }, + { + "epoch": 4.555778223112893, + "grad_norm": 0.8788819909095764, + "learning_rate": 0.0002, + "loss": 1.3942, + "step": 3410 + }, + { + "epoch": 4.569138276553106, + "grad_norm": 0.7499664425849915, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 3420 + }, + { + "epoch": 4.58249832999332, + "grad_norm": 0.7967109084129333, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 3430 + }, + { + "epoch": 4.595858383433534, + "grad_norm": 0.759639322757721, + "learning_rate": 0.0002, + "loss": 1.3531, + "step": 3440 + }, + { + "epoch": 4.609218436873747, + "grad_norm": 0.8327916264533997, + "learning_rate": 0.0002, + "loss": 1.3517, + "step": 3450 + }, + { + "epoch": 4.622578490313961, + "grad_norm": 0.7400892376899719, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 3460 + }, + { + "epoch": 4.635938543754175, + "grad_norm": 0.8116602301597595, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 3470 + }, + { + "epoch": 4.649298597194389, + "grad_norm": 0.7604362368583679, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 3480 + }, + { + "epoch": 4.662658650634603, + "grad_norm": 0.7397996783256531, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 3490 + }, + { + "epoch": 4.676018704074816, + "grad_norm": 0.869293749332428, + "learning_rate": 0.0002, + "loss": 1.4048, + "step": 3500 + }, + { + "epoch": 4.68937875751503, + "grad_norm": 0.6854358315467834, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 3510 + }, + { + "epoch": 4.7027388109552435, + "grad_norm": 0.8326661586761475, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 3520 + }, + { + "epoch": 4.716098864395457, + "grad_norm": 0.6887506246566772, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 3530 + }, + { + "epoch": 4.729458917835672, + "grad_norm": 3.837689161300659, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3540 + }, + { + "epoch": 4.742818971275885, + "grad_norm": 0.6874563694000244, + "learning_rate": 0.0002, + "loss": 1.3775, + "step": 3550 + }, + { + "epoch": 4.756179024716099, + "grad_norm": 0.8340407609939575, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 3560 + }, + { + "epoch": 4.7695390781563125, + "grad_norm": 0.7286418676376343, + "learning_rate": 0.0002, + "loss": 1.3556, + "step": 3570 + }, + { + "epoch": 4.782899131596526, + "grad_norm": 0.7239373326301575, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3580 + }, + { + "epoch": 4.796259185036741, + "grad_norm": 0.831310510635376, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3590 + }, + { + "epoch": 4.809619238476954, + "grad_norm": 0.767715573310852, + "learning_rate": 0.0002, + "loss": 1.4146, + "step": 3600 + }, + { + "epoch": 4.822979291917168, + "grad_norm": 0.9013199210166931, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 3610 + }, + { + "epoch": 4.8363393453573815, + "grad_norm": 0.7543512582778931, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3620 + }, + { + "epoch": 4.849699398797595, + "grad_norm": 0.7626057267189026, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3630 + }, + { + "epoch": 4.863059452237809, + "grad_norm": 0.847079336643219, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 3640 + }, + { + "epoch": 4.876419505678022, + "grad_norm": 0.8273295760154724, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 3650 + }, + { + "epoch": 4.889779559118237, + "grad_norm": 0.7675244808197021, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3660 + }, + { + "epoch": 4.9031396125584505, + "grad_norm": 0.9560356736183167, + "learning_rate": 0.0002, + "loss": 1.4894, + "step": 3670 + }, + { + "epoch": 4.916499665998664, + "grad_norm": 0.7682451605796814, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3680 + }, + { + "epoch": 4.929859719438878, + "grad_norm": 0.8113830089569092, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 3690 + }, + { + "epoch": 4.943219772879091, + "grad_norm": 0.7642542719841003, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 3700 + }, + { + "epoch": 4.956579826319305, + "grad_norm": 0.823863685131073, + "learning_rate": 0.0002, + "loss": 1.403, + "step": 3710 + }, + { + "epoch": 4.969939879759519, + "grad_norm": 0.8287797570228577, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 3720 + }, + { + "epoch": 4.983299933199733, + "grad_norm": 0.778170108795166, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3730 + }, + { + "epoch": 4.996659986639947, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3740 + }, + { + "epoch": 4.999331997327989, + "eval_loss": 1.9638569355010986, + "eval_runtime": 38.5725, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 3742 + }, + { + "epoch": 5.01002004008016, + "grad_norm": 0.8864085078239441, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 3750 + }, + { + "epoch": 5.023380093520374, + "grad_norm": 0.9191637635231018, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 3760 + }, + { + "epoch": 5.036740146960588, + "grad_norm": 0.749519407749176, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 3770 + }, + { + "epoch": 5.050100200400801, + "grad_norm": 0.7916892170906067, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 3780 + }, + { + "epoch": 5.063460253841015, + "grad_norm": 1.0318909883499146, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 3790 + }, + { + "epoch": 5.0768203072812295, + "grad_norm": 1.028586745262146, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 3800 + }, + { + "epoch": 5.090180360721443, + "grad_norm": 1.0568538904190063, + "learning_rate": 0.0002, + "loss": 1.1769, + "step": 3810 + }, + { + "epoch": 5.103540414161657, + "grad_norm": 0.9780595302581787, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3820 + }, + { + "epoch": 5.11690046760187, + "grad_norm": 1.10311758518219, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 3830 + }, + { + "epoch": 5.130260521042084, + "grad_norm": 0.9497154355049133, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3840 + }, + { + "epoch": 5.143620574482298, + "grad_norm": 0.948279857635498, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 3850 + }, + { + "epoch": 5.156980627922512, + "grad_norm": 0.9497880339622498, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3860 + }, + { + "epoch": 5.170340681362726, + "grad_norm": 1.3213258981704712, + "learning_rate": 0.0002, + "loss": 1.1876, + "step": 3870 + }, + { + "epoch": 5.183700734802939, + "grad_norm": 0.9835752248764038, + "learning_rate": 0.0002, + "loss": 1.2327, + "step": 3880 + }, + { + "epoch": 5.197060788243153, + "grad_norm": 0.8426132202148438, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3890 + }, + { + "epoch": 5.210420841683367, + "grad_norm": 1.0343470573425293, + "learning_rate": 0.0002, + "loss": 1.2066, + "step": 3900 + }, + { + "epoch": 5.22378089512358, + "grad_norm": 1.0771924257278442, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 3910 + }, + { + "epoch": 5.237140948563794, + "grad_norm": 0.8542634844779968, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3920 + }, + { + "epoch": 5.250501002004008, + "grad_norm": 1.1021966934204102, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3930 + }, + { + "epoch": 5.263861055444222, + "grad_norm": 1.170011281967163, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 3940 + }, + { + "epoch": 5.277221108884436, + "grad_norm": 0.9787653684616089, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3950 + }, + { + "epoch": 5.290581162324649, + "grad_norm": 0.914513885974884, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3960 + }, + { + "epoch": 5.303941215764863, + "grad_norm": 1.0831562280654907, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 3970 + }, + { + "epoch": 5.3173012692050765, + "grad_norm": 0.9810112714767456, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 3980 + }, + { + "epoch": 5.330661322645291, + "grad_norm": 0.9624066948890686, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 3990 + }, + { + "epoch": 5.344021376085505, + "grad_norm": 1.2296923398971558, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 4000 + }, + { + "epoch": 5.357381429525718, + "grad_norm": 1.011299967765808, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 4010 + }, + { + "epoch": 5.370741482965932, + "grad_norm": 0.9144132733345032, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 4020 + }, + { + "epoch": 5.3841015364061455, + "grad_norm": 1.0573601722717285, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4030 + }, + { + "epoch": 5.397461589846359, + "grad_norm": 1.1667137145996094, + "learning_rate": 0.0002, + "loss": 1.2295, + "step": 4040 + }, + { + "epoch": 5.410821643286573, + "grad_norm": 1.072070598602295, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 4050 + }, + { + "epoch": 5.424181696726787, + "grad_norm": 1.1005792617797852, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4060 + }, + { + "epoch": 5.437541750167001, + "grad_norm": 1.033581018447876, + "learning_rate": 0.0002, + "loss": 1.2604, + "step": 4070 + }, + { + "epoch": 5.4509018036072145, + "grad_norm": 0.9537439942359924, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 4080 + }, + { + "epoch": 5.464261857047428, + "grad_norm": 1.0502177476882935, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 4090 + }, + { + "epoch": 5.477621910487642, + "grad_norm": 0.9098296761512756, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4100 + }, + { + "epoch": 5.490981963927855, + "grad_norm": 0.9551953077316284, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4110 + }, + { + "epoch": 5.504342017368069, + "grad_norm": 0.9169427156448364, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 4120 + }, + { + "epoch": 5.517702070808284, + "grad_norm": 0.9430235624313354, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 4130 + }, + { + "epoch": 5.531062124248497, + "grad_norm": 0.817259669303894, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 4140 + }, + { + "epoch": 5.544422177688711, + "grad_norm": 1.124152660369873, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 4150 + }, + { + "epoch": 5.557782231128924, + "grad_norm": 0.9250756502151489, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 4160 + }, + { + "epoch": 5.571142284569138, + "grad_norm": 0.9582970142364502, + "learning_rate": 0.0002, + "loss": 1.2492, + "step": 4170 + }, + { + "epoch": 5.584502338009352, + "grad_norm": 1.0078704357147217, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 4180 + }, + { + "epoch": 5.597862391449565, + "grad_norm": 0.9585610032081604, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 4190 + }, + { + "epoch": 5.61122244488978, + "grad_norm": 1.0150971412658691, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 4200 + }, + { + "epoch": 5.6245824983299935, + "grad_norm": 0.9943351149559021, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 4210 + }, + { + "epoch": 5.637942551770207, + "grad_norm": 0.8880936503410339, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 4220 + }, + { + "epoch": 5.651302605210421, + "grad_norm": 0.9873887896537781, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4230 + }, + { + "epoch": 5.664662658650634, + "grad_norm": 0.9185152649879456, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 4240 + }, + { + "epoch": 5.678022712090849, + "grad_norm": 1.0706779956817627, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 4250 + }, + { + "epoch": 5.6913827655310625, + "grad_norm": 0.9660224914550781, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 4260 + }, + { + "epoch": 5.704742818971276, + "grad_norm": 0.8685019612312317, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 4270 + }, + { + "epoch": 5.71810287241149, + "grad_norm": 1.0390565395355225, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 4280 + }, + { + "epoch": 5.731462925851703, + "grad_norm": 0.9290478825569153, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 4290 + }, + { + "epoch": 5.744822979291917, + "grad_norm": 1.0361281633377075, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 4300 + }, + { + "epoch": 5.758183032732131, + "grad_norm": 0.8804615139961243, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 4310 + }, + { + "epoch": 5.771543086172345, + "grad_norm": 1.0051425695419312, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4320 + }, + { + "epoch": 5.784903139612559, + "grad_norm": 1.0051119327545166, + "learning_rate": 0.0002, + "loss": 1.1946, + "step": 4330 + }, + { + "epoch": 5.798263193052772, + "grad_norm": 0.9961661100387573, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 5.811623246492986, + "grad_norm": 1.0229419469833374, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 4350 + }, + { + "epoch": 5.8249832999332, + "grad_norm": 1.1129552125930786, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 4360 + }, + { + "epoch": 5.838343353373413, + "grad_norm": 1.18964421749115, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 4370 + }, + { + "epoch": 5.851703406813627, + "grad_norm": 0.9490230083465576, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4380 + }, + { + "epoch": 5.865063460253841, + "grad_norm": 0.8734540343284607, + "learning_rate": 0.0002, + "loss": 1.3177, + "step": 4390 + }, + { + "epoch": 5.878423513694055, + "grad_norm": 1.0017802715301514, + "learning_rate": 0.0002, + "loss": 1.3131, + "step": 4400 + }, + { + "epoch": 5.891783567134269, + "grad_norm": 0.953556478023529, + "learning_rate": 0.0002, + "loss": 1.2649, + "step": 4410 + }, + { + "epoch": 5.905143620574482, + "grad_norm": 0.8915258646011353, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4420 + }, + { + "epoch": 5.918503674014696, + "grad_norm": 0.9715141654014587, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 4430 + }, + { + "epoch": 5.9318637274549095, + "grad_norm": 0.9432152509689331, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 4440 + }, + { + "epoch": 5.945223780895123, + "grad_norm": 0.9473979473114014, + "learning_rate": 0.0002, + "loss": 1.233, + "step": 4450 + }, + { + "epoch": 5.958583834335338, + "grad_norm": 1.104871392250061, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 4460 + }, + { + "epoch": 5.971943887775551, + "grad_norm": 1.0308905839920044, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 4470 + }, + { + "epoch": 5.985303941215765, + "grad_norm": 0.8895487189292908, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 4480 + }, + { + "epoch": 5.9986639946559785, + "grad_norm": 1.0148485898971558, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 4490 + }, + { + "epoch": 6.0, + "eval_loss": 2.0830726623535156, + "eval_runtime": 38.5442, + "eval_samples_per_second": 13.361, + "eval_steps_per_second": 1.686, + "step": 4491 + }, + { + "epoch": 6.012024048096192, + "grad_norm": 1.1640599966049194, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 4500 + }, + { + "epoch": 6.025384101536406, + "grad_norm": 1.213204264640808, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 4510 + }, + { + "epoch": 6.03874415497662, + "grad_norm": 1.1694388389587402, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 4520 + }, + { + "epoch": 6.052104208416834, + "grad_norm": 1.1044062376022339, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 4530 + }, + { + "epoch": 6.065464261857048, + "grad_norm": 1.0701100826263428, + "learning_rate": 0.0002, + "loss": 1.0552, + "step": 4540 + }, + { + "epoch": 6.078824315297261, + "grad_norm": 1.360065221786499, + "learning_rate": 0.0002, + "loss": 1.0018, + "step": 4550 + }, + { + "epoch": 6.092184368737475, + "grad_norm": 1.0648503303527832, + "learning_rate": 0.0002, + "loss": 1.0189, + "step": 4560 + }, + { + "epoch": 6.1055444221776884, + "grad_norm": 1.066245198249817, + "learning_rate": 0.0002, + "loss": 1.008, + "step": 4570 + }, + { + "epoch": 6.118904475617902, + "grad_norm": 1.1483700275421143, + "learning_rate": 0.0002, + "loss": 1.099, + "step": 4580 + }, + { + "epoch": 6.132264529058117, + "grad_norm": 1.334275722503662, + "learning_rate": 0.0002, + "loss": 1.1043, + "step": 4590 + }, + { + "epoch": 6.14562458249833, + "grad_norm": 1.2141029834747314, + "learning_rate": 0.0002, + "loss": 1.0783, + "step": 4600 + }, + { + "epoch": 6.158984635938544, + "grad_norm": 1.2284387350082397, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 4610 + }, + { + "epoch": 6.1723446893787575, + "grad_norm": 1.2326734066009521, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 4620 + }, + { + "epoch": 6.185704742818971, + "grad_norm": 1.245004653930664, + "learning_rate": 0.0002, + "loss": 1.1069, + "step": 4630 + }, + { + "epoch": 6.199064796259185, + "grad_norm": 0.9685266017913818, + "learning_rate": 0.0002, + "loss": 1.0821, + "step": 4640 + }, + { + "epoch": 6.212424849699399, + "grad_norm": 1.141634464263916, + "learning_rate": 0.0002, + "loss": 1.0659, + "step": 4650 + }, + { + "epoch": 6.225784903139613, + "grad_norm": 1.4279003143310547, + "learning_rate": 0.0002, + "loss": 1.0971, + "step": 4660 + }, + { + "epoch": 6.2391449565798265, + "grad_norm": 1.186668872833252, + "learning_rate": 0.0002, + "loss": 1.093, + "step": 4670 + }, + { + "epoch": 6.25250501002004, + "grad_norm": 1.2656606435775757, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 4680 + }, + { + "epoch": 6.265865063460254, + "grad_norm": 1.1122987270355225, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 4690 + }, + { + "epoch": 6.279225116900467, + "grad_norm": 1.190050482749939, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 4700 + }, + { + "epoch": 6.292585170340681, + "grad_norm": 1.3683340549468994, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 4710 + }, + { + "epoch": 6.3059452237808955, + "grad_norm": 1.1787203550338745, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 4720 + }, + { + "epoch": 6.319305277221109, + "grad_norm": 1.3502576351165771, + "learning_rate": 0.0002, + "loss": 1.0856, + "step": 4730 + }, + { + "epoch": 6.332665330661323, + "grad_norm": 1.1958597898483276, + "learning_rate": 0.0002, + "loss": 1.0999, + "step": 4740 + }, + { + "epoch": 6.346025384101536, + "grad_norm": 1.0918327569961548, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 4750 + }, + { + "epoch": 6.35938543754175, + "grad_norm": 1.2624558210372925, + "learning_rate": 0.0002, + "loss": 1.0484, + "step": 4760 + }, + { + "epoch": 6.372745490981964, + "grad_norm": 1.1390577554702759, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 4770 + }, + { + "epoch": 6.386105544422177, + "grad_norm": 1.041666865348816, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 4780 + }, + { + "epoch": 6.399465597862392, + "grad_norm": 1.4209141731262207, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 4790 + }, + { + "epoch": 6.412825651302605, + "grad_norm": 1.1001079082489014, + "learning_rate": 0.0002, + "loss": 1.119, + "step": 4800 + }, + { + "epoch": 6.426185704742819, + "grad_norm": 1.3324936628341675, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 4810 + }, + { + "epoch": 6.439545758183033, + "grad_norm": 1.1270194053649902, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 4820 + }, + { + "epoch": 6.452905811623246, + "grad_norm": 1.1961387395858765, + "learning_rate": 0.0002, + "loss": 1.1338, + "step": 4830 + }, + { + "epoch": 6.46626586506346, + "grad_norm": 1.255366563796997, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 4840 + }, + { + "epoch": 6.479625918503674, + "grad_norm": 1.343855381011963, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 4850 + }, + { + "epoch": 6.492985971943888, + "grad_norm": 1.3216257095336914, + "learning_rate": 0.0002, + "loss": 1.1118, + "step": 4860 + }, + { + "epoch": 6.506346025384102, + "grad_norm": 1.5244755744934082, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4870 + }, + { + "epoch": 6.519706078824315, + "grad_norm": 1.1585701704025269, + "learning_rate": 0.0002, + "loss": 1.0403, + "step": 4880 + }, + { + "epoch": 6.533066132264529, + "grad_norm": 1.0301100015640259, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 4890 + }, + { + "epoch": 6.5464261857047426, + "grad_norm": 1.5772714614868164, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 4900 + }, + { + "epoch": 6.559786239144957, + "grad_norm": 1.2015259265899658, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 4910 + }, + { + "epoch": 6.573146292585171, + "grad_norm": 1.4365423917770386, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4920 + }, + { + "epoch": 6.586506346025384, + "grad_norm": 1.2534470558166504, + "learning_rate": 0.0002, + "loss": 1.0717, + "step": 4930 + }, + { + "epoch": 6.599866399465598, + "grad_norm": 1.216138482093811, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 4940 + }, + { + "epoch": 6.613226452905812, + "grad_norm": 1.144316554069519, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 4950 + }, + { + "epoch": 6.626586506346025, + "grad_norm": 1.1127740144729614, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 4960 + }, + { + "epoch": 6.639946559786239, + "grad_norm": 1.1925606727600098, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 4970 + }, + { + "epoch": 6.653306613226453, + "grad_norm": 1.2500451803207397, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 4980 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.16154944896698, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 4990 + }, + { + "epoch": 6.680026720106881, + "grad_norm": 1.1921433210372925, + "learning_rate": 0.0002, + "loss": 1.1003, + "step": 5000 + }, + { + "epoch": 6.693386773547094, + "grad_norm": 1.1561170816421509, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 5010 + }, + { + "epoch": 6.706746826987308, + "grad_norm": 1.2988990545272827, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 5020 + }, + { + "epoch": 6.7201068804275215, + "grad_norm": 0.9620341062545776, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 5030 + }, + { + "epoch": 6.733466933867735, + "grad_norm": 1.084228515625, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5040 + }, + { + "epoch": 6.74682698730795, + "grad_norm": 1.1119431257247925, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 5050 + }, + { + "epoch": 6.760187040748163, + "grad_norm": 1.1365628242492676, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 5060 + }, + { + "epoch": 6.773547094188377, + "grad_norm": 1.0989075899124146, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 5070 + }, + { + "epoch": 6.7869071476285905, + "grad_norm": 1.040647268295288, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 5080 + }, + { + "epoch": 6.800267201068804, + "grad_norm": 1.1083087921142578, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 5090 + }, + { + "epoch": 6.813627254509018, + "grad_norm": 1.3434782028198242, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 5100 + }, + { + "epoch": 6.826987307949231, + "grad_norm": 1.2493442296981812, + "learning_rate": 0.0002, + "loss": 1.1243, + "step": 5110 + }, + { + "epoch": 6.840347361389446, + "grad_norm": 1.0672307014465332, + "learning_rate": 0.0002, + "loss": 1.0633, + "step": 5120 + }, + { + "epoch": 6.8537074148296595, + "grad_norm": 1.068350911140442, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 5130 + }, + { + "epoch": 6.867067468269873, + "grad_norm": 1.2880923748016357, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 5140 + }, + { + "epoch": 6.880427521710087, + "grad_norm": 1.0895041227340698, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 5150 + }, + { + "epoch": 6.8937875751503, + "grad_norm": 1.2383300065994263, + "learning_rate": 0.0002, + "loss": 1.1535, + "step": 5160 + }, + { + "epoch": 6.907147628590514, + "grad_norm": 1.5274227857589722, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 5170 + }, + { + "epoch": 6.920507682030728, + "grad_norm": 1.1453371047973633, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 5180 + }, + { + "epoch": 6.933867735470942, + "grad_norm": 1.171336054801941, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 5190 + }, + { + "epoch": 6.947227788911156, + "grad_norm": 1.1946955919265747, + "learning_rate": 0.0002, + "loss": 1.1142, + "step": 5200 + }, + { + "epoch": 6.960587842351369, + "grad_norm": 1.2290117740631104, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 5210 + }, + { + "epoch": 6.973947895791583, + "grad_norm": 1.3134533166885376, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 5220 + }, + { + "epoch": 6.987307949231797, + "grad_norm": 1.1500377655029297, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 5230 + }, + { + "epoch": 6.999331997327989, + "eval_loss": 2.2211341857910156, + "eval_runtime": 38.5729, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 5239 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4247232828434022e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5239/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..87df48bf37c4668928b0c9052a1f7a921a0145c6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e6fb205815c6e30651ef6687417ae7ca7da70367d071aeb119db1f9854b7cc4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..97c5ef6cfca745aa890fb12ad0f328775fdfaadd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56db1f9fd5b7d75aa4034ba3491279663f70673c8560ffef5dbb38972dd6f1a3 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b16818a83e498fe9241d615ae7138a33c5be269 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f56643fe11a17640541d1e5aff6bd7405d89271bab8a14518e86ddcbc2e424 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5b44034653714b5420baa13d6aed5c6a2bd93e5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ffeac63c7827c6d9ea01ea6d72537e7e5a53d6dd03179472c82dbdb62dc0fc +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cf0bee0bbd3820729ac971b52f32308570f5bf23 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/trainer_state.json @@ -0,0 +1,4283 @@ +{ + "best_metric": 1.8132041692733765, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", + "epoch": 7.994655978623914, + "eval_steps": 10, + "global_step": 5984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + }, + { + "epoch": 1.002004008016032, + "grad_norm": 0.32302048802375793, + "learning_rate": 0.0002, + "loss": 1.7786, + "step": 750 + }, + { + "epoch": 1.0153640614562458, + "grad_norm": 0.37585633993148804, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 760 + }, + { + "epoch": 1.0287241148964597, + "grad_norm": 0.33826273679733276, + "learning_rate": 0.0002, + "loss": 1.7008, + "step": 770 + }, + { + "epoch": 1.0420841683366733, + "grad_norm": 0.44682955741882324, + "learning_rate": 0.0002, + "loss": 1.809, + "step": 780 + }, + { + "epoch": 1.0554442217768871, + "grad_norm": 0.422188401222229, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 790 + }, + { + "epoch": 1.0688042752171008, + "grad_norm": 0.3809906244277954, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 800 + }, + { + "epoch": 1.0821643286573146, + "grad_norm": 0.3454349637031555, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 1.0955243820975284, + "grad_norm": 0.3767355978488922, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 820 + }, + { + "epoch": 1.108884435537742, + "grad_norm": 0.3361407518386841, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 830 + }, + { + "epoch": 1.122244488977956, + "grad_norm": 0.3654632568359375, + "learning_rate": 0.0002, + "loss": 1.7509, + "step": 840 + }, + { + "epoch": 1.1356045424181698, + "grad_norm": 0.3822861313819885, + "learning_rate": 0.0002, + "loss": 1.7151, + "step": 850 + }, + { + "epoch": 1.1489645958583834, + "grad_norm": 0.3853831887245178, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 860 + }, + { + "epoch": 1.1623246492985972, + "grad_norm": 0.35521796345710754, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 870 + }, + { + "epoch": 1.1756847027388109, + "grad_norm": 0.4107200503349304, + "learning_rate": 0.0002, + "loss": 1.7735, + "step": 880 + }, + { + "epoch": 1.1890447561790247, + "grad_norm": 0.33219534158706665, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 890 + }, + { + "epoch": 1.2024048096192386, + "grad_norm": 0.3559704124927521, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 900 + }, + { + "epoch": 1.2157648630594522, + "grad_norm": 0.3700537383556366, + "learning_rate": 0.0002, + "loss": 1.7535, + "step": 910 + }, + { + "epoch": 1.229124916499666, + "grad_norm": 0.3771909475326538, + "learning_rate": 0.0002, + "loss": 1.7513, + "step": 920 + }, + { + "epoch": 1.2424849699398797, + "grad_norm": 0.3136613965034485, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 930 + }, + { + "epoch": 1.2558450233800935, + "grad_norm": 0.3952099084854126, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 940 + }, + { + "epoch": 1.2692050768203074, + "grad_norm": 0.36534377932548523, + "learning_rate": 0.0002, + "loss": 1.7691, + "step": 950 + }, + { + "epoch": 1.282565130260521, + "grad_norm": 0.3803492486476898, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 960 + }, + { + "epoch": 1.2959251837007348, + "grad_norm": 0.3992428183555603, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 970 + }, + { + "epoch": 1.3092852371409487, + "grad_norm": 0.3627142906188965, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 980 + }, + { + "epoch": 1.3226452905811623, + "grad_norm": 0.4248180091381073, + "learning_rate": 0.0002, + "loss": 1.7598, + "step": 990 + }, + { + "epoch": 1.3360053440213762, + "grad_norm": 0.4060308039188385, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1000 + }, + { + "epoch": 1.3493653974615898, + "grad_norm": 0.3788969814777374, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1010 + }, + { + "epoch": 1.3627254509018036, + "grad_norm": 0.4174270033836365, + "learning_rate": 0.0002, + "loss": 1.7111, + "step": 1020 + }, + { + "epoch": 1.3760855043420173, + "grad_norm": 0.35500675439834595, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 1030 + }, + { + "epoch": 1.389445557782231, + "grad_norm": 0.3454059362411499, + "learning_rate": 0.0002, + "loss": 1.724, + "step": 1040 + }, + { + "epoch": 1.402805611222445, + "grad_norm": 0.45807570219039917, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 1050 + }, + { + "epoch": 1.4161656646626586, + "grad_norm": 0.39338022470474243, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1060 + }, + { + "epoch": 1.4295257181028724, + "grad_norm": 0.3870709240436554, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1070 + }, + { + "epoch": 1.4428857715430863, + "grad_norm": 0.40996190905570984, + "learning_rate": 0.0002, + "loss": 1.6565, + "step": 1080 + }, + { + "epoch": 1.4562458249833, + "grad_norm": 0.38762837648391724, + "learning_rate": 0.0002, + "loss": 1.7324, + "step": 1090 + }, + { + "epoch": 1.4696058784235138, + "grad_norm": 0.36756977438926697, + "learning_rate": 0.0002, + "loss": 1.7362, + "step": 1100 + }, + { + "epoch": 1.4829659318637274, + "grad_norm": 0.4087235927581787, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 1110 + }, + { + "epoch": 1.4963259853039412, + "grad_norm": 0.3357745110988617, + "learning_rate": 0.0002, + "loss": 1.7114, + "step": 1120 + }, + { + "epoch": 1.5096860387441549, + "grad_norm": 0.37486532330513, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 1130 + }, + { + "epoch": 1.5230460921843687, + "grad_norm": 0.3387809991836548, + "learning_rate": 0.0002, + "loss": 1.7252, + "step": 1140 + }, + { + "epoch": 1.5364061456245826, + "grad_norm": 0.37462118268013, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1150 + }, + { + "epoch": 1.5497661990647962, + "grad_norm": 0.38575324416160583, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 1160 + }, + { + "epoch": 1.56312625250501, + "grad_norm": 0.3515765964984894, + "learning_rate": 0.0002, + "loss": 1.7438, + "step": 1170 + }, + { + "epoch": 1.5764863059452239, + "grad_norm": 0.39308643341064453, + "learning_rate": 0.0002, + "loss": 1.7524, + "step": 1180 + }, + { + "epoch": 1.5898463593854375, + "grad_norm": 0.3308864235877991, + "learning_rate": 0.0002, + "loss": 1.6422, + "step": 1190 + }, + { + "epoch": 1.6032064128256514, + "grad_norm": 0.3397478461265564, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1200 + }, + { + "epoch": 1.6165664662658652, + "grad_norm": 0.3911525309085846, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 1210 + }, + { + "epoch": 1.6299265197060788, + "grad_norm": 0.3771969974040985, + "learning_rate": 0.0002, + "loss": 1.7443, + "step": 1220 + }, + { + "epoch": 1.6432865731462925, + "grad_norm": 0.35346856713294983, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1230 + }, + { + "epoch": 1.6566466265865063, + "grad_norm": 0.41736963391304016, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6700066800267201, + "grad_norm": 0.3375225067138672, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1250 + }, + { + "epoch": 1.6833667334669338, + "grad_norm": 0.3779928982257843, + "learning_rate": 0.0002, + "loss": 1.6916, + "step": 1260 + }, + { + "epoch": 1.6967267869071476, + "grad_norm": 0.35388994216918945, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1270 + }, + { + "epoch": 1.7100868403473615, + "grad_norm": 0.33884134888648987, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1280 + }, + { + "epoch": 1.723446893787575, + "grad_norm": 0.35439756512641907, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 1290 + }, + { + "epoch": 1.736806947227789, + "grad_norm": 0.3766156733036041, + "learning_rate": 0.0002, + "loss": 1.7389, + "step": 1300 + }, + { + "epoch": 1.7501670006680028, + "grad_norm": 0.36148911714553833, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1310 + }, + { + "epoch": 1.7635270541082164, + "grad_norm": 0.39687496423721313, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.77688710754843, + "grad_norm": 0.35639452934265137, + "learning_rate": 0.0002, + "loss": 1.7541, + "step": 1330 + }, + { + "epoch": 1.7902471609886441, + "grad_norm": 0.38781628012657166, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1340 + }, + { + "epoch": 1.8036072144288577, + "grad_norm": 0.42784637212753296, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 1350 + }, + { + "epoch": 1.8169672678690714, + "grad_norm": 0.40258511900901794, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1360 + }, + { + "epoch": 1.8303273213092852, + "grad_norm": 0.36674195528030396, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 1370 + }, + { + "epoch": 1.843687374749499, + "grad_norm": 0.4064558446407318, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1380 + }, + { + "epoch": 1.8570474281897127, + "grad_norm": 0.3669849932193756, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1390 + }, + { + "epoch": 1.8704074816299265, + "grad_norm": 0.37569567561149597, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 1400 + }, + { + "epoch": 1.8837675350701404, + "grad_norm": 0.37307995557785034, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1410 + }, + { + "epoch": 1.897127588510354, + "grad_norm": 0.3772695064544678, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1420 + }, + { + "epoch": 1.9104876419505676, + "grad_norm": 0.36993589997291565, + "learning_rate": 0.0002, + "loss": 1.7682, + "step": 1430 + }, + { + "epoch": 1.9238476953907817, + "grad_norm": 0.3490557372570038, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 1440 + }, + { + "epoch": 1.9372077488309953, + "grad_norm": 0.3716149628162384, + "learning_rate": 0.0002, + "loss": 1.7979, + "step": 1450 + }, + { + "epoch": 1.950567802271209, + "grad_norm": 0.39236098527908325, + "learning_rate": 0.0002, + "loss": 1.6664, + "step": 1460 + }, + { + "epoch": 1.9639278557114228, + "grad_norm": 0.37258651852607727, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 1470 + }, + { + "epoch": 1.9772879091516367, + "grad_norm": 0.36183077096939087, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 1480 + }, + { + "epoch": 1.9906479625918503, + "grad_norm": 0.3956947326660156, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8132041692733765, + "eval_runtime": 38.6287, + "eval_samples_per_second": 13.332, + "eval_steps_per_second": 1.683, + "step": 1497 + }, + { + "epoch": 2.004008016032064, + "grad_norm": 0.34480565786361694, + "learning_rate": 0.0002, + "loss": 1.6791, + "step": 1500 + }, + { + "epoch": 2.017368069472278, + "grad_norm": 0.3418028652667999, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 1510 + }, + { + "epoch": 2.0307281229124916, + "grad_norm": 0.4514467716217041, + "learning_rate": 0.0002, + "loss": 1.5827, + "step": 1520 + }, + { + "epoch": 2.0440881763527052, + "grad_norm": 0.4197506606578827, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1530 + }, + { + "epoch": 2.0574482297929193, + "grad_norm": 0.4134170711040497, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 1540 + }, + { + "epoch": 2.070808283233133, + "grad_norm": 0.43709826469421387, + "learning_rate": 0.0002, + "loss": 1.6876, + "step": 1550 + }, + { + "epoch": 2.0841683366733466, + "grad_norm": 0.4703378677368164, + "learning_rate": 0.0002, + "loss": 1.5779, + "step": 1560 + }, + { + "epoch": 2.0975283901135606, + "grad_norm": 0.4538188576698303, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 1570 + }, + { + "epoch": 2.1108884435537743, + "grad_norm": 0.4649668037891388, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 1580 + }, + { + "epoch": 2.124248496993988, + "grad_norm": 0.42669883370399475, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 1590 + }, + { + "epoch": 2.1376085504342015, + "grad_norm": 0.43162038922309875, + "learning_rate": 0.0002, + "loss": 1.5838, + "step": 1600 + }, + { + "epoch": 2.1509686038744156, + "grad_norm": 0.4294586479663849, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1610 + }, + { + "epoch": 2.164328657314629, + "grad_norm": 0.4669102132320404, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1620 + }, + { + "epoch": 2.177688710754843, + "grad_norm": 0.4188412129878998, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1630 + }, + { + "epoch": 2.191048764195057, + "grad_norm": 0.4662680923938751, + "learning_rate": 0.0002, + "loss": 1.625, + "step": 1640 + }, + { + "epoch": 2.2044088176352705, + "grad_norm": 0.4020286500453949, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1650 + }, + { + "epoch": 2.217768871075484, + "grad_norm": 0.41919606924057007, + "learning_rate": 0.0002, + "loss": 1.6284, + "step": 1660 + }, + { + "epoch": 2.231128924515698, + "grad_norm": 0.4644531309604645, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1670 + }, + { + "epoch": 2.244488977955912, + "grad_norm": 0.4526427984237671, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 1680 + }, + { + "epoch": 2.2578490313961255, + "grad_norm": 0.45953166484832764, + "learning_rate": 0.0002, + "loss": 1.6058, + "step": 1690 + }, + { + "epoch": 2.2712090848363395, + "grad_norm": 0.4701860249042511, + "learning_rate": 0.0002, + "loss": 1.5979, + "step": 1700 + }, + { + "epoch": 2.284569138276553, + "grad_norm": 0.4749310612678528, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 1710 + }, + { + "epoch": 2.297929191716767, + "grad_norm": 0.45026102662086487, + "learning_rate": 0.0002, + "loss": 1.6703, + "step": 1720 + }, + { + "epoch": 2.3112892451569804, + "grad_norm": 0.4755004048347473, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 1730 + }, + { + "epoch": 2.3246492985971945, + "grad_norm": 0.4505726993083954, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1740 + }, + { + "epoch": 2.338009352037408, + "grad_norm": 0.44464054703712463, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 1750 + }, + { + "epoch": 2.3513694054776217, + "grad_norm": 0.4449476897716522, + "learning_rate": 0.0002, + "loss": 1.6139, + "step": 1760 + }, + { + "epoch": 2.364729458917836, + "grad_norm": 0.4216482937335968, + "learning_rate": 0.0002, + "loss": 1.7195, + "step": 1770 + }, + { + "epoch": 2.3780895123580494, + "grad_norm": 0.4379308521747589, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 1780 + }, + { + "epoch": 2.391449565798263, + "grad_norm": 0.41670042276382446, + "learning_rate": 0.0002, + "loss": 1.7024, + "step": 1790 + }, + { + "epoch": 2.404809619238477, + "grad_norm": 0.48089510202407837, + "learning_rate": 0.0002, + "loss": 1.5989, + "step": 1800 + }, + { + "epoch": 2.4181696726786908, + "grad_norm": 0.4389738142490387, + "learning_rate": 0.0002, + "loss": 1.6313, + "step": 1810 + }, + { + "epoch": 2.4315297261189044, + "grad_norm": 0.45293036103248596, + "learning_rate": 0.0002, + "loss": 1.5841, + "step": 1820 + }, + { + "epoch": 2.4448897795591185, + "grad_norm": 0.5211683511734009, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1830 + }, + { + "epoch": 2.458249832999332, + "grad_norm": 0.4631884694099426, + "learning_rate": 0.0002, + "loss": 1.6599, + "step": 1840 + }, + { + "epoch": 2.4716098864395457, + "grad_norm": 0.4276818335056305, + "learning_rate": 0.0002, + "loss": 1.6537, + "step": 1850 + }, + { + "epoch": 2.4849699398797593, + "grad_norm": 0.477524071931839, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 1860 + }, + { + "epoch": 2.4983299933199734, + "grad_norm": 0.44860973954200745, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1870 + }, + { + "epoch": 2.511690046760187, + "grad_norm": 0.46413546800613403, + "learning_rate": 0.0002, + "loss": 1.6308, + "step": 1880 + }, + { + "epoch": 2.5250501002004007, + "grad_norm": 0.42487645149230957, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 1890 + }, + { + "epoch": 2.5384101536406147, + "grad_norm": 0.4778307378292084, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 1900 + }, + { + "epoch": 2.5517702070808284, + "grad_norm": 0.45307061076164246, + "learning_rate": 0.0002, + "loss": 1.6143, + "step": 1910 + }, + { + "epoch": 2.565130260521042, + "grad_norm": 0.47886642813682556, + "learning_rate": 0.0002, + "loss": 1.7279, + "step": 1920 + }, + { + "epoch": 2.5784903139612556, + "grad_norm": 0.4839435815811157, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1930 + }, + { + "epoch": 2.5918503674014697, + "grad_norm": 0.4388359785079956, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 1940 + }, + { + "epoch": 2.6052104208416833, + "grad_norm": 0.47859734296798706, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1950 + }, + { + "epoch": 2.6185704742818974, + "grad_norm": 0.5526517033576965, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 1960 + }, + { + "epoch": 2.631930527722111, + "grad_norm": 0.5449170470237732, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1970 + }, + { + "epoch": 2.6452905811623246, + "grad_norm": 0.48521968722343445, + "learning_rate": 0.0002, + "loss": 1.6481, + "step": 1980 + }, + { + "epoch": 2.6586506346025383, + "grad_norm": 0.4733737111091614, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 1990 + }, + { + "epoch": 2.6720106880427523, + "grad_norm": 0.507118284702301, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2000 + }, + { + "epoch": 2.685370741482966, + "grad_norm": 0.4508971571922302, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2010 + }, + { + "epoch": 2.6987307949231796, + "grad_norm": 0.4657728672027588, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2020 + }, + { + "epoch": 2.7120908483633936, + "grad_norm": 0.48647549748420715, + "learning_rate": 0.0002, + "loss": 1.6261, + "step": 2030 + }, + { + "epoch": 2.7254509018036073, + "grad_norm": 0.49525555968284607, + "learning_rate": 0.0002, + "loss": 1.5638, + "step": 2040 + }, + { + "epoch": 2.738810955243821, + "grad_norm": 0.4712379276752472, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 2050 + }, + { + "epoch": 2.7521710086840345, + "grad_norm": 0.4846591055393219, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 2060 + }, + { + "epoch": 2.7655310621242486, + "grad_norm": 0.4823240041732788, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 2070 + }, + { + "epoch": 2.778891115564462, + "grad_norm": 0.4546685516834259, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 2.7922511690046763, + "grad_norm": 0.45542681217193604, + "learning_rate": 0.0002, + "loss": 1.7015, + "step": 2090 + }, + { + "epoch": 2.80561122244489, + "grad_norm": 0.42137566208839417, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2100 + }, + { + "epoch": 2.8189712758851035, + "grad_norm": 0.6143282055854797, + "learning_rate": 0.0002, + "loss": 1.6526, + "step": 2110 + }, + { + "epoch": 2.832331329325317, + "grad_norm": 0.4828081727027893, + "learning_rate": 0.0002, + "loss": 1.6955, + "step": 2120 + }, + { + "epoch": 2.845691382765531, + "grad_norm": 0.4319005608558655, + "learning_rate": 0.0002, + "loss": 1.744, + "step": 2130 + }, + { + "epoch": 2.859051436205745, + "grad_norm": 0.4297086298465729, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2140 + }, + { + "epoch": 2.8724114896459585, + "grad_norm": 0.5011981129646301, + "learning_rate": 0.0002, + "loss": 1.5968, + "step": 2150 + }, + { + "epoch": 2.8857715430861726, + "grad_norm": 0.4401548504829407, + "learning_rate": 0.0002, + "loss": 1.7181, + "step": 2160 + }, + { + "epoch": 2.899131596526386, + "grad_norm": 0.48090746998786926, + "learning_rate": 0.0002, + "loss": 1.5722, + "step": 2170 + }, + { + "epoch": 2.9124916499666, + "grad_norm": 0.4740385413169861, + "learning_rate": 0.0002, + "loss": 1.6596, + "step": 2180 + }, + { + "epoch": 2.9258517034068134, + "grad_norm": 0.5337260365486145, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 2190 + }, + { + "epoch": 2.9392117568470275, + "grad_norm": 0.4420052766799927, + "learning_rate": 0.0002, + "loss": 1.6802, + "step": 2200 + }, + { + "epoch": 2.952571810287241, + "grad_norm": 0.477512389421463, + "learning_rate": 0.0002, + "loss": 1.5474, + "step": 2210 + }, + { + "epoch": 2.9659318637274548, + "grad_norm": 0.5344052910804749, + "learning_rate": 0.0002, + "loss": 1.6544, + "step": 2220 + }, + { + "epoch": 2.979291917167669, + "grad_norm": 0.4483940303325653, + "learning_rate": 0.0002, + "loss": 1.6866, + "step": 2230 + }, + { + "epoch": 2.9926519706078825, + "grad_norm": 0.4366597831249237, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2240 + }, + { + "epoch": 2.9993319973279893, + "eval_loss": 1.834012746810913, + "eval_runtime": 38.5659, + "eval_samples_per_second": 13.354, + "eval_steps_per_second": 1.685, + "step": 2245 + }, + { + "epoch": 3.006012024048096, + "grad_norm": 0.428824245929718, + "learning_rate": 0.0002, + "loss": 1.5582, + "step": 2250 + }, + { + "epoch": 3.01937207748831, + "grad_norm": 0.4870174825191498, + "learning_rate": 0.0002, + "loss": 1.499, + "step": 2260 + }, + { + "epoch": 3.032732130928524, + "grad_norm": 0.4684266149997711, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 2270 + }, + { + "epoch": 3.0460921843687374, + "grad_norm": 0.581604540348053, + "learning_rate": 0.0002, + "loss": 1.5284, + "step": 2280 + }, + { + "epoch": 3.059452237808951, + "grad_norm": 0.5561677813529968, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 2290 + }, + { + "epoch": 3.072812291249165, + "grad_norm": 0.5750220417976379, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 2300 + }, + { + "epoch": 3.0861723446893787, + "grad_norm": 0.5704626441001892, + "learning_rate": 0.0002, + "loss": 1.5903, + "step": 2310 + }, + { + "epoch": 3.0995323981295924, + "grad_norm": 0.6242083311080933, + "learning_rate": 0.0002, + "loss": 1.4292, + "step": 2320 + }, + { + "epoch": 3.1128924515698064, + "grad_norm": 0.5174121260643005, + "learning_rate": 0.0002, + "loss": 1.5092, + "step": 2330 + }, + { + "epoch": 3.12625250501002, + "grad_norm": 0.5697633028030396, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 2340 + }, + { + "epoch": 3.1396125584502337, + "grad_norm": 0.5969541072845459, + "learning_rate": 0.0002, + "loss": 1.5156, + "step": 2350 + }, + { + "epoch": 3.1529726118904478, + "grad_norm": 0.6244304180145264, + "learning_rate": 0.0002, + "loss": 1.52, + "step": 2360 + }, + { + "epoch": 3.1663326653306614, + "grad_norm": 0.5561705827713013, + "learning_rate": 0.0002, + "loss": 1.5244, + "step": 2370 + }, + { + "epoch": 3.179692718770875, + "grad_norm": 0.5401188135147095, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2380 + }, + { + "epoch": 3.1930527722110886, + "grad_norm": 0.6450421810150146, + "learning_rate": 0.0002, + "loss": 1.5387, + "step": 2390 + }, + { + "epoch": 3.2064128256513027, + "grad_norm": 0.5741903185844421, + "learning_rate": 0.0002, + "loss": 1.4839, + "step": 2400 + }, + { + "epoch": 3.2197728790915163, + "grad_norm": 0.6337407231330872, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2410 + }, + { + "epoch": 3.23313293253173, + "grad_norm": 0.6493517160415649, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 2420 + }, + { + "epoch": 3.246492985971944, + "grad_norm": 0.6230176091194153, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2430 + }, + { + "epoch": 3.2598530394121576, + "grad_norm": 0.680704653263092, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2440 + }, + { + "epoch": 3.2732130928523713, + "grad_norm": 0.5279417037963867, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2450 + }, + { + "epoch": 3.2865731462925853, + "grad_norm": 0.5601515173912048, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2460 + }, + { + "epoch": 3.299933199732799, + "grad_norm": 0.5591090321540833, + "learning_rate": 0.0002, + "loss": 1.4949, + "step": 2470 + }, + { + "epoch": 3.3132932531730126, + "grad_norm": 0.6596529483795166, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 2480 + }, + { + "epoch": 3.3266533066132267, + "grad_norm": 0.6115918755531311, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 2490 + }, + { + "epoch": 3.3400133600534403, + "grad_norm": 0.6443548202514648, + "learning_rate": 0.0002, + "loss": 1.5344, + "step": 2500 + }, + { + "epoch": 3.353373413493654, + "grad_norm": 0.5504242181777954, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 2510 + }, + { + "epoch": 3.3667334669338675, + "grad_norm": 0.6104483604431152, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 2520 + }, + { + "epoch": 3.3800935203740816, + "grad_norm": 0.8387531638145447, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 2530 + }, + { + "epoch": 3.3934535738142952, + "grad_norm": 0.6346094012260437, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 2540 + }, + { + "epoch": 3.406813627254509, + "grad_norm": 0.6261265873908997, + "learning_rate": 0.0002, + "loss": 1.4855, + "step": 2550 + }, + { + "epoch": 3.420173680694723, + "grad_norm": 0.5960372090339661, + "learning_rate": 0.0002, + "loss": 1.5233, + "step": 2560 + }, + { + "epoch": 3.4335337341349366, + "grad_norm": 0.5291280746459961, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 2570 + }, + { + "epoch": 3.44689378757515, + "grad_norm": 0.6133161783218384, + "learning_rate": 0.0002, + "loss": 1.5152, + "step": 2580 + }, + { + "epoch": 3.460253841015364, + "grad_norm": 0.623573362827301, + "learning_rate": 0.0002, + "loss": 1.5533, + "step": 2590 + }, + { + "epoch": 3.473613894455578, + "grad_norm": 0.5959834456443787, + "learning_rate": 0.0002, + "loss": 1.4935, + "step": 2600 + }, + { + "epoch": 3.4869739478957915, + "grad_norm": 0.583332359790802, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2610 + }, + { + "epoch": 3.5003340013360056, + "grad_norm": 0.6003559231758118, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2620 + }, + { + "epoch": 3.513694054776219, + "grad_norm": 0.5832992196083069, + "learning_rate": 0.0002, + "loss": 1.4901, + "step": 2630 + }, + { + "epoch": 3.527054108216433, + "grad_norm": 0.5942609906196594, + "learning_rate": 0.0002, + "loss": 1.5005, + "step": 2640 + }, + { + "epoch": 3.5404141616566465, + "grad_norm": 0.6087163686752319, + "learning_rate": 0.0002, + "loss": 1.5213, + "step": 2650 + }, + { + "epoch": 3.5537742150968605, + "grad_norm": 0.631948709487915, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2660 + }, + { + "epoch": 3.567134268537074, + "grad_norm": 0.6450803279876709, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2670 + }, + { + "epoch": 3.580494321977288, + "grad_norm": 0.6507797837257385, + "learning_rate": 0.0002, + "loss": 1.4981, + "step": 2680 + }, + { + "epoch": 3.593854375417502, + "grad_norm": 0.5778017044067383, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 2690 + }, + { + "epoch": 3.6072144288577155, + "grad_norm": 0.6214032173156738, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 2700 + }, + { + "epoch": 3.620574482297929, + "grad_norm": 0.5681133270263672, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 2710 + }, + { + "epoch": 3.6339345357381427, + "grad_norm": 0.6074244976043701, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 2720 + }, + { + "epoch": 3.647294589178357, + "grad_norm": 0.5900560617446899, + "learning_rate": 0.0002, + "loss": 1.5243, + "step": 2730 + }, + { + "epoch": 3.6606546426185704, + "grad_norm": 0.5817505717277527, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2740 + }, + { + "epoch": 3.6740146960587845, + "grad_norm": 0.6095547676086426, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2750 + }, + { + "epoch": 3.687374749498998, + "grad_norm": 0.612790584564209, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2760 + }, + { + "epoch": 3.7007348029392118, + "grad_norm": 0.6574140787124634, + "learning_rate": 0.0002, + "loss": 1.4976, + "step": 2770 + }, + { + "epoch": 3.7140948563794254, + "grad_norm": 0.5643761157989502, + "learning_rate": 0.0002, + "loss": 1.5306, + "step": 2780 + }, + { + "epoch": 3.727454909819639, + "grad_norm": 0.5652621388435364, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 2790 + }, + { + "epoch": 3.740814963259853, + "grad_norm": 0.5604206323623657, + "learning_rate": 0.0002, + "loss": 1.5262, + "step": 2800 + }, + { + "epoch": 3.7541750167000667, + "grad_norm": 3.911022663116455, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2810 + }, + { + "epoch": 3.7675350701402808, + "grad_norm": 0.6148333549499512, + "learning_rate": 0.0002, + "loss": 1.5793, + "step": 2820 + }, + { + "epoch": 3.7808951235804944, + "grad_norm": 0.5605677962303162, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 2830 + }, + { + "epoch": 3.794255177020708, + "grad_norm": 0.6101965308189392, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 2840 + }, + { + "epoch": 3.8076152304609217, + "grad_norm": 0.5387342572212219, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2850 + }, + { + "epoch": 3.8209752839011357, + "grad_norm": 0.5733087062835693, + "learning_rate": 0.0002, + "loss": 1.5193, + "step": 2860 + }, + { + "epoch": 3.8343353373413493, + "grad_norm": 0.6538485884666443, + "learning_rate": 0.0002, + "loss": 1.5545, + "step": 2870 + }, + { + "epoch": 3.847695390781563, + "grad_norm": 0.6247632503509521, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 2880 + }, + { + "epoch": 3.861055444221777, + "grad_norm": 0.5745735764503479, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 2890 + }, + { + "epoch": 3.8744154976619907, + "grad_norm": 0.5942763686180115, + "learning_rate": 0.0002, + "loss": 1.5706, + "step": 2900 + }, + { + "epoch": 3.8877755511022043, + "grad_norm": 0.7086281776428223, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2910 + }, + { + "epoch": 3.901135604542418, + "grad_norm": 0.8825129866600037, + "learning_rate": 0.0002, + "loss": 1.5526, + "step": 2920 + }, + { + "epoch": 3.914495657982632, + "grad_norm": 0.6260842680931091, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 2930 + }, + { + "epoch": 3.9278557114228456, + "grad_norm": 0.6015968322753906, + "learning_rate": 0.0002, + "loss": 1.5433, + "step": 2940 + }, + { + "epoch": 3.9412157648630597, + "grad_norm": 0.7042809128761292, + "learning_rate": 0.0002, + "loss": 1.4931, + "step": 2950 + }, + { + "epoch": 3.9545758183032733, + "grad_norm": 0.5860083699226379, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2960 + }, + { + "epoch": 3.967935871743487, + "grad_norm": 0.5939757823944092, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2970 + }, + { + "epoch": 3.9812959251837006, + "grad_norm": 0.5523964166641235, + "learning_rate": 0.0002, + "loss": 1.408, + "step": 2980 + }, + { + "epoch": 3.9946559786239146, + "grad_norm": 0.6380264759063721, + "learning_rate": 0.0002, + "loss": 1.5629, + "step": 2990 + }, + { + "epoch": 4.0, + "eval_loss": 1.8875294923782349, + "eval_runtime": 38.5837, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.685, + "step": 2994 + }, + { + "epoch": 4.008016032064128, + "grad_norm": 0.5478564500808716, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3000 + }, + { + "epoch": 4.021376085504342, + "grad_norm": 0.9384379982948303, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 3010 + }, + { + "epoch": 4.034736138944556, + "grad_norm": 0.7819344401359558, + "learning_rate": 0.0002, + "loss": 1.4127, + "step": 3020 + }, + { + "epoch": 4.04809619238477, + "grad_norm": 0.7737417817115784, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 3030 + }, + { + "epoch": 4.061456245824983, + "grad_norm": 0.8893805742263794, + "learning_rate": 0.0002, + "loss": 1.3203, + "step": 3040 + }, + { + "epoch": 4.074816299265197, + "grad_norm": 0.7759843468666077, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 3050 + }, + { + "epoch": 4.0881763527054105, + "grad_norm": 0.642654538154602, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3060 + }, + { + "epoch": 4.101536406145625, + "grad_norm": 0.8515549302101135, + "learning_rate": 0.0002, + "loss": 1.3204, + "step": 3070 + }, + { + "epoch": 4.114896459585839, + "grad_norm": 0.7033658623695374, + "learning_rate": 0.0002, + "loss": 1.3683, + "step": 3080 + }, + { + "epoch": 4.128256513026052, + "grad_norm": 0.7063882946968079, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 3090 + }, + { + "epoch": 4.141616566466266, + "grad_norm": 0.6946853995323181, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 3100 + }, + { + "epoch": 4.1549766199064795, + "grad_norm": 0.7286741137504578, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 3110 + }, + { + "epoch": 4.168336673346693, + "grad_norm": 0.7894193530082703, + "learning_rate": 0.0002, + "loss": 1.3061, + "step": 3120 + }, + { + "epoch": 4.181696726786907, + "grad_norm": 0.7005895376205444, + "learning_rate": 0.0002, + "loss": 1.3346, + "step": 3130 + }, + { + "epoch": 4.195056780227121, + "grad_norm": 0.799567461013794, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 3140 + }, + { + "epoch": 4.208416833667335, + "grad_norm": 0.7010157108306885, + "learning_rate": 0.0002, + "loss": 1.3813, + "step": 3150 + }, + { + "epoch": 4.2217768871075485, + "grad_norm": 0.7489650249481201, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 3160 + }, + { + "epoch": 4.235136940547762, + "grad_norm": 0.7908048629760742, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 3170 + }, + { + "epoch": 4.248496993987976, + "grad_norm": 0.7002180814743042, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 3180 + }, + { + "epoch": 4.261857047428189, + "grad_norm": 0.8339495062828064, + "learning_rate": 0.0002, + "loss": 1.4525, + "step": 3190 + }, + { + "epoch": 4.275217100868403, + "grad_norm": 0.7884618043899536, + "learning_rate": 0.0002, + "loss": 1.3471, + "step": 3200 + }, + { + "epoch": 4.2885771543086175, + "grad_norm": 0.7964122295379639, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 3210 + }, + { + "epoch": 4.301937207748831, + "grad_norm": 0.838646650314331, + "learning_rate": 0.0002, + "loss": 1.3506, + "step": 3220 + }, + { + "epoch": 4.315297261189045, + "grad_norm": 0.8063107132911682, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 3230 + }, + { + "epoch": 4.328657314629258, + "grad_norm": 0.8147385120391846, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 3240 + }, + { + "epoch": 4.342017368069472, + "grad_norm": 0.7636798620223999, + "learning_rate": 0.0002, + "loss": 1.4118, + "step": 3250 + }, + { + "epoch": 4.355377421509686, + "grad_norm": 0.7530609965324402, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 3260 + }, + { + "epoch": 4.3687374749499, + "grad_norm": 0.8853573799133301, + "learning_rate": 0.0002, + "loss": 1.3507, + "step": 3270 + }, + { + "epoch": 4.382097528390114, + "grad_norm": 0.7180975675582886, + "learning_rate": 0.0002, + "loss": 1.3614, + "step": 3280 + }, + { + "epoch": 4.395457581830327, + "grad_norm": 0.837150514125824, + "learning_rate": 0.0002, + "loss": 1.4119, + "step": 3290 + }, + { + "epoch": 4.408817635270541, + "grad_norm": 0.8370638489723206, + "learning_rate": 0.0002, + "loss": 1.461, + "step": 3300 + }, + { + "epoch": 4.422177688710755, + "grad_norm": 0.7738229036331177, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 3310 + }, + { + "epoch": 4.435537742150968, + "grad_norm": 0.7665290832519531, + "learning_rate": 0.0002, + "loss": 1.4195, + "step": 3320 + }, + { + "epoch": 4.448897795591183, + "grad_norm": 0.7547745704650879, + "learning_rate": 0.0002, + "loss": 1.3308, + "step": 3330 + }, + { + "epoch": 4.462257849031396, + "grad_norm": 0.7421861290931702, + "learning_rate": 0.0002, + "loss": 1.4165, + "step": 3340 + }, + { + "epoch": 4.47561790247161, + "grad_norm": 0.8042104244232178, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3350 + }, + { + "epoch": 4.488977955911824, + "grad_norm": 0.8111839890480042, + "learning_rate": 0.0002, + "loss": 1.365, + "step": 3360 + }, + { + "epoch": 4.502338009352037, + "grad_norm": 0.7998340129852295, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 3370 + }, + { + "epoch": 4.515698062792251, + "grad_norm": 0.7668877243995667, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 3380 + }, + { + "epoch": 4.529058116232465, + "grad_norm": 0.7986718416213989, + "learning_rate": 0.0002, + "loss": 1.3972, + "step": 3390 + }, + { + "epoch": 4.542418169672679, + "grad_norm": 0.6806602478027344, + "learning_rate": 0.0002, + "loss": 1.3582, + "step": 3400 + }, + { + "epoch": 4.555778223112893, + "grad_norm": 0.8788819909095764, + "learning_rate": 0.0002, + "loss": 1.3942, + "step": 3410 + }, + { + "epoch": 4.569138276553106, + "grad_norm": 0.7499664425849915, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 3420 + }, + { + "epoch": 4.58249832999332, + "grad_norm": 0.7967109084129333, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 3430 + }, + { + "epoch": 4.595858383433534, + "grad_norm": 0.759639322757721, + "learning_rate": 0.0002, + "loss": 1.3531, + "step": 3440 + }, + { + "epoch": 4.609218436873747, + "grad_norm": 0.8327916264533997, + "learning_rate": 0.0002, + "loss": 1.3517, + "step": 3450 + }, + { + "epoch": 4.622578490313961, + "grad_norm": 0.7400892376899719, + "learning_rate": 0.0002, + "loss": 1.4619, + "step": 3460 + }, + { + "epoch": 4.635938543754175, + "grad_norm": 0.8116602301597595, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 3470 + }, + { + "epoch": 4.649298597194389, + "grad_norm": 0.7604362368583679, + "learning_rate": 0.0002, + "loss": 1.4445, + "step": 3480 + }, + { + "epoch": 4.662658650634603, + "grad_norm": 0.7397996783256531, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 3490 + }, + { + "epoch": 4.676018704074816, + "grad_norm": 0.869293749332428, + "learning_rate": 0.0002, + "loss": 1.4048, + "step": 3500 + }, + { + "epoch": 4.68937875751503, + "grad_norm": 0.6854358315467834, + "learning_rate": 0.0002, + "loss": 1.3873, + "step": 3510 + }, + { + "epoch": 4.7027388109552435, + "grad_norm": 0.8326661586761475, + "learning_rate": 0.0002, + "loss": 1.3413, + "step": 3520 + }, + { + "epoch": 4.716098864395457, + "grad_norm": 0.6887506246566772, + "learning_rate": 0.0002, + "loss": 1.3666, + "step": 3530 + }, + { + "epoch": 4.729458917835672, + "grad_norm": 3.837689161300659, + "learning_rate": 0.0002, + "loss": 1.4508, + "step": 3540 + }, + { + "epoch": 4.742818971275885, + "grad_norm": 0.6874563694000244, + "learning_rate": 0.0002, + "loss": 1.3775, + "step": 3550 + }, + { + "epoch": 4.756179024716099, + "grad_norm": 0.8340407609939575, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 3560 + }, + { + "epoch": 4.7695390781563125, + "grad_norm": 0.7286418676376343, + "learning_rate": 0.0002, + "loss": 1.3556, + "step": 3570 + }, + { + "epoch": 4.782899131596526, + "grad_norm": 0.7239373326301575, + "learning_rate": 0.0002, + "loss": 1.4338, + "step": 3580 + }, + { + "epoch": 4.796259185036741, + "grad_norm": 0.831310510635376, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3590 + }, + { + "epoch": 4.809619238476954, + "grad_norm": 0.767715573310852, + "learning_rate": 0.0002, + "loss": 1.4146, + "step": 3600 + }, + { + "epoch": 4.822979291917168, + "grad_norm": 0.9013199210166931, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 3610 + }, + { + "epoch": 4.8363393453573815, + "grad_norm": 0.7543512582778931, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3620 + }, + { + "epoch": 4.849699398797595, + "grad_norm": 0.7626057267189026, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3630 + }, + { + "epoch": 4.863059452237809, + "grad_norm": 0.847079336643219, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 3640 + }, + { + "epoch": 4.876419505678022, + "grad_norm": 0.8273295760154724, + "learning_rate": 0.0002, + "loss": 1.5014, + "step": 3650 + }, + { + "epoch": 4.889779559118237, + "grad_norm": 0.7675244808197021, + "learning_rate": 0.0002, + "loss": 1.3806, + "step": 3660 + }, + { + "epoch": 4.9031396125584505, + "grad_norm": 0.9560356736183167, + "learning_rate": 0.0002, + "loss": 1.4894, + "step": 3670 + }, + { + "epoch": 4.916499665998664, + "grad_norm": 0.7682451605796814, + "learning_rate": 0.0002, + "loss": 1.4044, + "step": 3680 + }, + { + "epoch": 4.929859719438878, + "grad_norm": 0.8113830089569092, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 3690 + }, + { + "epoch": 4.943219772879091, + "grad_norm": 0.7642542719841003, + "learning_rate": 0.0002, + "loss": 1.3559, + "step": 3700 + }, + { + "epoch": 4.956579826319305, + "grad_norm": 0.823863685131073, + "learning_rate": 0.0002, + "loss": 1.403, + "step": 3710 + }, + { + "epoch": 4.969939879759519, + "grad_norm": 0.8287797570228577, + "learning_rate": 0.0002, + "loss": 1.464, + "step": 3720 + }, + { + "epoch": 4.983299933199733, + "grad_norm": 0.778170108795166, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3730 + }, + { + "epoch": 4.996659986639947, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 1.4218, + "step": 3740 + }, + { + "epoch": 4.999331997327989, + "eval_loss": 1.9638569355010986, + "eval_runtime": 38.5725, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 3742 + }, + { + "epoch": 5.01002004008016, + "grad_norm": 0.8864085078239441, + "learning_rate": 0.0002, + "loss": 1.225, + "step": 3750 + }, + { + "epoch": 5.023380093520374, + "grad_norm": 0.9191637635231018, + "learning_rate": 0.0002, + "loss": 1.2056, + "step": 3760 + }, + { + "epoch": 5.036740146960588, + "grad_norm": 0.749519407749176, + "learning_rate": 0.0002, + "loss": 1.2453, + "step": 3770 + }, + { + "epoch": 5.050100200400801, + "grad_norm": 0.7916892170906067, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 3780 + }, + { + "epoch": 5.063460253841015, + "grad_norm": 1.0318909883499146, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 3790 + }, + { + "epoch": 5.0768203072812295, + "grad_norm": 1.028586745262146, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 3800 + }, + { + "epoch": 5.090180360721443, + "grad_norm": 1.0568538904190063, + "learning_rate": 0.0002, + "loss": 1.1769, + "step": 3810 + }, + { + "epoch": 5.103540414161657, + "grad_norm": 0.9780595302581787, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 3820 + }, + { + "epoch": 5.11690046760187, + "grad_norm": 1.10311758518219, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 3830 + }, + { + "epoch": 5.130260521042084, + "grad_norm": 0.9497154355049133, + "learning_rate": 0.0002, + "loss": 1.2133, + "step": 3840 + }, + { + "epoch": 5.143620574482298, + "grad_norm": 0.948279857635498, + "learning_rate": 0.0002, + "loss": 1.1718, + "step": 3850 + }, + { + "epoch": 5.156980627922512, + "grad_norm": 0.9497880339622498, + "learning_rate": 0.0002, + "loss": 1.2108, + "step": 3860 + }, + { + "epoch": 5.170340681362726, + "grad_norm": 1.3213258981704712, + "learning_rate": 0.0002, + "loss": 1.1876, + "step": 3870 + }, + { + "epoch": 5.183700734802939, + "grad_norm": 0.9835752248764038, + "learning_rate": 0.0002, + "loss": 1.2327, + "step": 3880 + }, + { + "epoch": 5.197060788243153, + "grad_norm": 0.8426132202148438, + "learning_rate": 0.0002, + "loss": 1.2256, + "step": 3890 + }, + { + "epoch": 5.210420841683367, + "grad_norm": 1.0343470573425293, + "learning_rate": 0.0002, + "loss": 1.2066, + "step": 3900 + }, + { + "epoch": 5.22378089512358, + "grad_norm": 1.0771924257278442, + "learning_rate": 0.0002, + "loss": 1.2596, + "step": 3910 + }, + { + "epoch": 5.237140948563794, + "grad_norm": 0.8542634844779968, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 3920 + }, + { + "epoch": 5.250501002004008, + "grad_norm": 1.1021966934204102, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3930 + }, + { + "epoch": 5.263861055444222, + "grad_norm": 1.170011281967163, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 3940 + }, + { + "epoch": 5.277221108884436, + "grad_norm": 0.9787653684616089, + "learning_rate": 0.0002, + "loss": 1.2101, + "step": 3950 + }, + { + "epoch": 5.290581162324649, + "grad_norm": 0.914513885974884, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3960 + }, + { + "epoch": 5.303941215764863, + "grad_norm": 1.0831562280654907, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 3970 + }, + { + "epoch": 5.3173012692050765, + "grad_norm": 0.9810112714767456, + "learning_rate": 0.0002, + "loss": 1.2609, + "step": 3980 + }, + { + "epoch": 5.330661322645291, + "grad_norm": 0.9624066948890686, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 3990 + }, + { + "epoch": 5.344021376085505, + "grad_norm": 1.2296923398971558, + "learning_rate": 0.0002, + "loss": 1.273, + "step": 4000 + }, + { + "epoch": 5.357381429525718, + "grad_norm": 1.011299967765808, + "learning_rate": 0.0002, + "loss": 1.2452, + "step": 4010 + }, + { + "epoch": 5.370741482965932, + "grad_norm": 0.9144132733345032, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 4020 + }, + { + "epoch": 5.3841015364061455, + "grad_norm": 1.0573601722717285, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 4030 + }, + { + "epoch": 5.397461589846359, + "grad_norm": 1.1667137145996094, + "learning_rate": 0.0002, + "loss": 1.2295, + "step": 4040 + }, + { + "epoch": 5.410821643286573, + "grad_norm": 1.072070598602295, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 4050 + }, + { + "epoch": 5.424181696726787, + "grad_norm": 1.1005792617797852, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4060 + }, + { + "epoch": 5.437541750167001, + "grad_norm": 1.033581018447876, + "learning_rate": 0.0002, + "loss": 1.2604, + "step": 4070 + }, + { + "epoch": 5.4509018036072145, + "grad_norm": 0.9537439942359924, + "learning_rate": 0.0002, + "loss": 1.2552, + "step": 4080 + }, + { + "epoch": 5.464261857047428, + "grad_norm": 1.0502177476882935, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 4090 + }, + { + "epoch": 5.477621910487642, + "grad_norm": 0.9098296761512756, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4100 + }, + { + "epoch": 5.490981963927855, + "grad_norm": 0.9551953077316284, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 4110 + }, + { + "epoch": 5.504342017368069, + "grad_norm": 0.9169427156448364, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 4120 + }, + { + "epoch": 5.517702070808284, + "grad_norm": 0.9430235624313354, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 4130 + }, + { + "epoch": 5.531062124248497, + "grad_norm": 0.817259669303894, + "learning_rate": 0.0002, + "loss": 1.2618, + "step": 4140 + }, + { + "epoch": 5.544422177688711, + "grad_norm": 1.124152660369873, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 4150 + }, + { + "epoch": 5.557782231128924, + "grad_norm": 0.9250756502151489, + "learning_rate": 0.0002, + "loss": 1.2508, + "step": 4160 + }, + { + "epoch": 5.571142284569138, + "grad_norm": 0.9582970142364502, + "learning_rate": 0.0002, + "loss": 1.2492, + "step": 4170 + }, + { + "epoch": 5.584502338009352, + "grad_norm": 1.0078704357147217, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 4180 + }, + { + "epoch": 5.597862391449565, + "grad_norm": 0.9585610032081604, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 4190 + }, + { + "epoch": 5.61122244488978, + "grad_norm": 1.0150971412658691, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 4200 + }, + { + "epoch": 5.6245824983299935, + "grad_norm": 0.9943351149559021, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 4210 + }, + { + "epoch": 5.637942551770207, + "grad_norm": 0.8880936503410339, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 4220 + }, + { + "epoch": 5.651302605210421, + "grad_norm": 0.9873887896537781, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 4230 + }, + { + "epoch": 5.664662658650634, + "grad_norm": 0.9185152649879456, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 4240 + }, + { + "epoch": 5.678022712090849, + "grad_norm": 1.0706779956817627, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 4250 + }, + { + "epoch": 5.6913827655310625, + "grad_norm": 0.9660224914550781, + "learning_rate": 0.0002, + "loss": 1.2737, + "step": 4260 + }, + { + "epoch": 5.704742818971276, + "grad_norm": 0.8685019612312317, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 4270 + }, + { + "epoch": 5.71810287241149, + "grad_norm": 1.0390565395355225, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 4280 + }, + { + "epoch": 5.731462925851703, + "grad_norm": 0.9290478825569153, + "learning_rate": 0.0002, + "loss": 1.3134, + "step": 4290 + }, + { + "epoch": 5.744822979291917, + "grad_norm": 1.0361281633377075, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 4300 + }, + { + "epoch": 5.758183032732131, + "grad_norm": 0.8804615139961243, + "learning_rate": 0.0002, + "loss": 1.2688, + "step": 4310 + }, + { + "epoch": 5.771543086172345, + "grad_norm": 1.0051425695419312, + "learning_rate": 0.0002, + "loss": 1.2479, + "step": 4320 + }, + { + "epoch": 5.784903139612559, + "grad_norm": 1.0051119327545166, + "learning_rate": 0.0002, + "loss": 1.1946, + "step": 4330 + }, + { + "epoch": 5.798263193052772, + "grad_norm": 0.9961661100387573, + "learning_rate": 0.0002, + "loss": 1.2571, + "step": 4340 + }, + { + "epoch": 5.811623246492986, + "grad_norm": 1.0229419469833374, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 4350 + }, + { + "epoch": 5.8249832999332, + "grad_norm": 1.1129552125930786, + "learning_rate": 0.0002, + "loss": 1.2984, + "step": 4360 + }, + { + "epoch": 5.838343353373413, + "grad_norm": 1.18964421749115, + "learning_rate": 0.0002, + "loss": 1.2692, + "step": 4370 + }, + { + "epoch": 5.851703406813627, + "grad_norm": 0.9490230083465576, + "learning_rate": 0.0002, + "loss": 1.1996, + "step": 4380 + }, + { + "epoch": 5.865063460253841, + "grad_norm": 0.8734540343284607, + "learning_rate": 0.0002, + "loss": 1.3177, + "step": 4390 + }, + { + "epoch": 5.878423513694055, + "grad_norm": 1.0017802715301514, + "learning_rate": 0.0002, + "loss": 1.3131, + "step": 4400 + }, + { + "epoch": 5.891783567134269, + "grad_norm": 0.953556478023529, + "learning_rate": 0.0002, + "loss": 1.2649, + "step": 4410 + }, + { + "epoch": 5.905143620574482, + "grad_norm": 0.8915258646011353, + "learning_rate": 0.0002, + "loss": 1.2684, + "step": 4420 + }, + { + "epoch": 5.918503674014696, + "grad_norm": 0.9715141654014587, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 4430 + }, + { + "epoch": 5.9318637274549095, + "grad_norm": 0.9432152509689331, + "learning_rate": 0.0002, + "loss": 1.2769, + "step": 4440 + }, + { + "epoch": 5.945223780895123, + "grad_norm": 0.9473979473114014, + "learning_rate": 0.0002, + "loss": 1.233, + "step": 4450 + }, + { + "epoch": 5.958583834335338, + "grad_norm": 1.104871392250061, + "learning_rate": 0.0002, + "loss": 1.3209, + "step": 4460 + }, + { + "epoch": 5.971943887775551, + "grad_norm": 1.0308905839920044, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 4470 + }, + { + "epoch": 5.985303941215765, + "grad_norm": 0.8895487189292908, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 4480 + }, + { + "epoch": 5.9986639946559785, + "grad_norm": 1.0148485898971558, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 4490 + }, + { + "epoch": 6.0, + "eval_loss": 2.0830726623535156, + "eval_runtime": 38.5442, + "eval_samples_per_second": 13.361, + "eval_steps_per_second": 1.686, + "step": 4491 + }, + { + "epoch": 6.012024048096192, + "grad_norm": 1.1640599966049194, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 4500 + }, + { + "epoch": 6.025384101536406, + "grad_norm": 1.213204264640808, + "learning_rate": 0.0002, + "loss": 1.0436, + "step": 4510 + }, + { + "epoch": 6.03874415497662, + "grad_norm": 1.1694388389587402, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 4520 + }, + { + "epoch": 6.052104208416834, + "grad_norm": 1.1044062376022339, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 4530 + }, + { + "epoch": 6.065464261857048, + "grad_norm": 1.0701100826263428, + "learning_rate": 0.0002, + "loss": 1.0552, + "step": 4540 + }, + { + "epoch": 6.078824315297261, + "grad_norm": 1.360065221786499, + "learning_rate": 0.0002, + "loss": 1.0018, + "step": 4550 + }, + { + "epoch": 6.092184368737475, + "grad_norm": 1.0648503303527832, + "learning_rate": 0.0002, + "loss": 1.0189, + "step": 4560 + }, + { + "epoch": 6.1055444221776884, + "grad_norm": 1.066245198249817, + "learning_rate": 0.0002, + "loss": 1.008, + "step": 4570 + }, + { + "epoch": 6.118904475617902, + "grad_norm": 1.1483700275421143, + "learning_rate": 0.0002, + "loss": 1.099, + "step": 4580 + }, + { + "epoch": 6.132264529058117, + "grad_norm": 1.334275722503662, + "learning_rate": 0.0002, + "loss": 1.1043, + "step": 4590 + }, + { + "epoch": 6.14562458249833, + "grad_norm": 1.2141029834747314, + "learning_rate": 0.0002, + "loss": 1.0783, + "step": 4600 + }, + { + "epoch": 6.158984635938544, + "grad_norm": 1.2284387350082397, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 4610 + }, + { + "epoch": 6.1723446893787575, + "grad_norm": 1.2326734066009521, + "learning_rate": 0.0002, + "loss": 1.122, + "step": 4620 + }, + { + "epoch": 6.185704742818971, + "grad_norm": 1.245004653930664, + "learning_rate": 0.0002, + "loss": 1.1069, + "step": 4630 + }, + { + "epoch": 6.199064796259185, + "grad_norm": 0.9685266017913818, + "learning_rate": 0.0002, + "loss": 1.0821, + "step": 4640 + }, + { + "epoch": 6.212424849699399, + "grad_norm": 1.141634464263916, + "learning_rate": 0.0002, + "loss": 1.0659, + "step": 4650 + }, + { + "epoch": 6.225784903139613, + "grad_norm": 1.4279003143310547, + "learning_rate": 0.0002, + "loss": 1.0971, + "step": 4660 + }, + { + "epoch": 6.2391449565798265, + "grad_norm": 1.186668872833252, + "learning_rate": 0.0002, + "loss": 1.093, + "step": 4670 + }, + { + "epoch": 6.25250501002004, + "grad_norm": 1.2656606435775757, + "learning_rate": 0.0002, + "loss": 1.0522, + "step": 4680 + }, + { + "epoch": 6.265865063460254, + "grad_norm": 1.1122987270355225, + "learning_rate": 0.0002, + "loss": 1.1138, + "step": 4690 + }, + { + "epoch": 6.279225116900467, + "grad_norm": 1.190050482749939, + "learning_rate": 0.0002, + "loss": 1.0906, + "step": 4700 + }, + { + "epoch": 6.292585170340681, + "grad_norm": 1.3683340549468994, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 4710 + }, + { + "epoch": 6.3059452237808955, + "grad_norm": 1.1787203550338745, + "learning_rate": 0.0002, + "loss": 1.0663, + "step": 4720 + }, + { + "epoch": 6.319305277221109, + "grad_norm": 1.3502576351165771, + "learning_rate": 0.0002, + "loss": 1.0856, + "step": 4730 + }, + { + "epoch": 6.332665330661323, + "grad_norm": 1.1958597898483276, + "learning_rate": 0.0002, + "loss": 1.0999, + "step": 4740 + }, + { + "epoch": 6.346025384101536, + "grad_norm": 1.0918327569961548, + "learning_rate": 0.0002, + "loss": 1.021, + "step": 4750 + }, + { + "epoch": 6.35938543754175, + "grad_norm": 1.2624558210372925, + "learning_rate": 0.0002, + "loss": 1.0484, + "step": 4760 + }, + { + "epoch": 6.372745490981964, + "grad_norm": 1.1390577554702759, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 4770 + }, + { + "epoch": 6.386105544422177, + "grad_norm": 1.041666865348816, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 4780 + }, + { + "epoch": 6.399465597862392, + "grad_norm": 1.4209141731262207, + "learning_rate": 0.0002, + "loss": 1.1026, + "step": 4790 + }, + { + "epoch": 6.412825651302605, + "grad_norm": 1.1001079082489014, + "learning_rate": 0.0002, + "loss": 1.119, + "step": 4800 + }, + { + "epoch": 6.426185704742819, + "grad_norm": 1.3324936628341675, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 4810 + }, + { + "epoch": 6.439545758183033, + "grad_norm": 1.1270194053649902, + "learning_rate": 0.0002, + "loss": 1.0785, + "step": 4820 + }, + { + "epoch": 6.452905811623246, + "grad_norm": 1.1961387395858765, + "learning_rate": 0.0002, + "loss": 1.1338, + "step": 4830 + }, + { + "epoch": 6.46626586506346, + "grad_norm": 1.255366563796997, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 4840 + }, + { + "epoch": 6.479625918503674, + "grad_norm": 1.343855381011963, + "learning_rate": 0.0002, + "loss": 1.1226, + "step": 4850 + }, + { + "epoch": 6.492985971943888, + "grad_norm": 1.3216257095336914, + "learning_rate": 0.0002, + "loss": 1.1118, + "step": 4860 + }, + { + "epoch": 6.506346025384102, + "grad_norm": 1.5244755744934082, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 4870 + }, + { + "epoch": 6.519706078824315, + "grad_norm": 1.1585701704025269, + "learning_rate": 0.0002, + "loss": 1.0403, + "step": 4880 + }, + { + "epoch": 6.533066132264529, + "grad_norm": 1.0301100015640259, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 4890 + }, + { + "epoch": 6.5464261857047426, + "grad_norm": 1.5772714614868164, + "learning_rate": 0.0002, + "loss": 1.1304, + "step": 4900 + }, + { + "epoch": 6.559786239144957, + "grad_norm": 1.2015259265899658, + "learning_rate": 0.0002, + "loss": 1.0953, + "step": 4910 + }, + { + "epoch": 6.573146292585171, + "grad_norm": 1.4365423917770386, + "learning_rate": 0.0002, + "loss": 1.1283, + "step": 4920 + }, + { + "epoch": 6.586506346025384, + "grad_norm": 1.2534470558166504, + "learning_rate": 0.0002, + "loss": 1.0717, + "step": 4930 + }, + { + "epoch": 6.599866399465598, + "grad_norm": 1.216138482093811, + "learning_rate": 0.0002, + "loss": 1.1099, + "step": 4940 + }, + { + "epoch": 6.613226452905812, + "grad_norm": 1.144316554069519, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 4950 + }, + { + "epoch": 6.626586506346025, + "grad_norm": 1.1127740144729614, + "learning_rate": 0.0002, + "loss": 1.0548, + "step": 4960 + }, + { + "epoch": 6.639946559786239, + "grad_norm": 1.1925606727600098, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 4970 + }, + { + "epoch": 6.653306613226453, + "grad_norm": 1.2500451803207397, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 4980 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.16154944896698, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 4990 + }, + { + "epoch": 6.680026720106881, + "grad_norm": 1.1921433210372925, + "learning_rate": 0.0002, + "loss": 1.1003, + "step": 5000 + }, + { + "epoch": 6.693386773547094, + "grad_norm": 1.1561170816421509, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 5010 + }, + { + "epoch": 6.706746826987308, + "grad_norm": 1.2988990545272827, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 5020 + }, + { + "epoch": 6.7201068804275215, + "grad_norm": 0.9620341062545776, + "learning_rate": 0.0002, + "loss": 1.1131, + "step": 5030 + }, + { + "epoch": 6.733466933867735, + "grad_norm": 1.084228515625, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5040 + }, + { + "epoch": 6.74682698730795, + "grad_norm": 1.1119431257247925, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 5050 + }, + { + "epoch": 6.760187040748163, + "grad_norm": 1.1365628242492676, + "learning_rate": 0.0002, + "loss": 1.179, + "step": 5060 + }, + { + "epoch": 6.773547094188377, + "grad_norm": 1.0989075899124146, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 5070 + }, + { + "epoch": 6.7869071476285905, + "grad_norm": 1.040647268295288, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 5080 + }, + { + "epoch": 6.800267201068804, + "grad_norm": 1.1083087921142578, + "learning_rate": 0.0002, + "loss": 1.0793, + "step": 5090 + }, + { + "epoch": 6.813627254509018, + "grad_norm": 1.3434782028198242, + "learning_rate": 0.0002, + "loss": 1.1081, + "step": 5100 + }, + { + "epoch": 6.826987307949231, + "grad_norm": 1.2493442296981812, + "learning_rate": 0.0002, + "loss": 1.1243, + "step": 5110 + }, + { + "epoch": 6.840347361389446, + "grad_norm": 1.0672307014465332, + "learning_rate": 0.0002, + "loss": 1.0633, + "step": 5120 + }, + { + "epoch": 6.8537074148296595, + "grad_norm": 1.068350911140442, + "learning_rate": 0.0002, + "loss": 1.1344, + "step": 5130 + }, + { + "epoch": 6.867067468269873, + "grad_norm": 1.2880923748016357, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 5140 + }, + { + "epoch": 6.880427521710087, + "grad_norm": 1.0895041227340698, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 5150 + }, + { + "epoch": 6.8937875751503, + "grad_norm": 1.2383300065994263, + "learning_rate": 0.0002, + "loss": 1.1535, + "step": 5160 + }, + { + "epoch": 6.907147628590514, + "grad_norm": 1.5274227857589722, + "learning_rate": 0.0002, + "loss": 1.1653, + "step": 5170 + }, + { + "epoch": 6.920507682030728, + "grad_norm": 1.1453371047973633, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 5180 + }, + { + "epoch": 6.933867735470942, + "grad_norm": 1.171336054801941, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 5190 + }, + { + "epoch": 6.947227788911156, + "grad_norm": 1.1946955919265747, + "learning_rate": 0.0002, + "loss": 1.1142, + "step": 5200 + }, + { + "epoch": 6.960587842351369, + "grad_norm": 1.2290117740631104, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 5210 + }, + { + "epoch": 6.973947895791583, + "grad_norm": 1.3134533166885376, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 5220 + }, + { + "epoch": 6.987307949231797, + "grad_norm": 1.1500377655029297, + "learning_rate": 0.0002, + "loss": 1.1687, + "step": 5230 + }, + { + "epoch": 6.999331997327989, + "eval_loss": 2.2211341857910156, + "eval_runtime": 38.5729, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.685, + "step": 5239 + }, + { + "epoch": 7.00066800267201, + "grad_norm": 1.1143344640731812, + "learning_rate": 0.0002, + "loss": 1.1235, + "step": 5240 + }, + { + "epoch": 7.014028056112225, + "grad_norm": 1.5164896249771118, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 5250 + }, + { + "epoch": 7.027388109552438, + "grad_norm": 1.3737165927886963, + "learning_rate": 0.0002, + "loss": 0.9605, + "step": 5260 + }, + { + "epoch": 7.040748162992652, + "grad_norm": 1.2159202098846436, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 5270 + }, + { + "epoch": 7.054108216432866, + "grad_norm": 1.4183212518692017, + "learning_rate": 0.0002, + "loss": 0.9469, + "step": 5280 + }, + { + "epoch": 7.067468269873079, + "grad_norm": 1.4752920866012573, + "learning_rate": 0.0002, + "loss": 0.9188, + "step": 5290 + }, + { + "epoch": 7.080828323313293, + "grad_norm": 1.398065447807312, + "learning_rate": 0.0002, + "loss": 0.873, + "step": 5300 + }, + { + "epoch": 7.094188376753507, + "grad_norm": 1.4385913610458374, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 5310 + }, + { + "epoch": 7.107548430193721, + "grad_norm": 1.3779526948928833, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5320 + }, + { + "epoch": 7.120908483633935, + "grad_norm": 1.5290347337722778, + "learning_rate": 0.0002, + "loss": 0.9139, + "step": 5330 + }, + { + "epoch": 7.134268537074148, + "grad_norm": 1.2389367818832397, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 5340 + }, + { + "epoch": 7.147628590514362, + "grad_norm": 1.4514659643173218, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 5350 + }, + { + "epoch": 7.160988643954576, + "grad_norm": 1.3247307538986206, + "learning_rate": 0.0002, + "loss": 0.9303, + "step": 5360 + }, + { + "epoch": 7.174348697394789, + "grad_norm": 1.1711286306381226, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 5370 + }, + { + "epoch": 7.187708750835004, + "grad_norm": 1.4408347606658936, + "learning_rate": 0.0002, + "loss": 0.944, + "step": 5380 + }, + { + "epoch": 7.201068804275217, + "grad_norm": 1.4405876398086548, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 5390 + }, + { + "epoch": 7.214428857715431, + "grad_norm": 1.233242154121399, + "learning_rate": 0.0002, + "loss": 0.9428, + "step": 5400 + }, + { + "epoch": 7.227788911155645, + "grad_norm": 1.734960675239563, + "learning_rate": 0.0002, + "loss": 0.9401, + "step": 5410 + }, + { + "epoch": 7.241148964595858, + "grad_norm": 1.5165163278579712, + "learning_rate": 0.0002, + "loss": 0.9232, + "step": 5420 + }, + { + "epoch": 7.254509018036072, + "grad_norm": 1.4353035688400269, + "learning_rate": 0.0002, + "loss": 0.949, + "step": 5430 + }, + { + "epoch": 7.2678690714762855, + "grad_norm": 1.4540636539459229, + "learning_rate": 0.0002, + "loss": 0.9029, + "step": 5440 + }, + { + "epoch": 7.2812291249165, + "grad_norm": 1.2676037549972534, + "learning_rate": 0.0002, + "loss": 0.9117, + "step": 5450 + }, + { + "epoch": 7.294589178356714, + "grad_norm": 1.2626118659973145, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 5460 + }, + { + "epoch": 7.307949231796927, + "grad_norm": 1.4866795539855957, + "learning_rate": 0.0002, + "loss": 0.9373, + "step": 5470 + }, + { + "epoch": 7.321309285237141, + "grad_norm": 1.2464289665222168, + "learning_rate": 0.0002, + "loss": 0.9474, + "step": 5480 + }, + { + "epoch": 7.3346693386773545, + "grad_norm": 1.2815988063812256, + "learning_rate": 0.0002, + "loss": 0.896, + "step": 5490 + }, + { + "epoch": 7.348029392117568, + "grad_norm": 1.282402753829956, + "learning_rate": 0.0002, + "loss": 0.9733, + "step": 5500 + }, + { + "epoch": 7.361389445557783, + "grad_norm": 1.5422425270080566, + "learning_rate": 0.0002, + "loss": 0.9427, + "step": 5510 + }, + { + "epoch": 7.374749498997996, + "grad_norm": 1.4137073755264282, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 5520 + }, + { + "epoch": 7.38810955243821, + "grad_norm": 1.4875508546829224, + "learning_rate": 0.0002, + "loss": 0.8669, + "step": 5530 + }, + { + "epoch": 7.4014696058784235, + "grad_norm": 1.292340874671936, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 5540 + }, + { + "epoch": 7.414829659318637, + "grad_norm": 1.5553388595581055, + "learning_rate": 0.0002, + "loss": 0.9031, + "step": 5550 + }, + { + "epoch": 7.428189712758851, + "grad_norm": 1.394142746925354, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 5560 + }, + { + "epoch": 7.441549766199064, + "grad_norm": 1.3249385356903076, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 5570 + }, + { + "epoch": 7.454909819639279, + "grad_norm": 1.3204814195632935, + "learning_rate": 0.0002, + "loss": 0.9806, + "step": 5580 + }, + { + "epoch": 7.4682698730794925, + "grad_norm": 1.4062745571136475, + "learning_rate": 0.0002, + "loss": 0.9182, + "step": 5590 + }, + { + "epoch": 7.481629926519706, + "grad_norm": 1.2828562259674072, + "learning_rate": 0.0002, + "loss": 0.9429, + "step": 5600 + }, + { + "epoch": 7.49498997995992, + "grad_norm": 1.440412998199463, + "learning_rate": 0.0002, + "loss": 0.9498, + "step": 5610 + }, + { + "epoch": 7.508350033400133, + "grad_norm": 1.4771733283996582, + "learning_rate": 0.0002, + "loss": 1.0005, + "step": 5620 + }, + { + "epoch": 7.521710086840347, + "grad_norm": 1.329460620880127, + "learning_rate": 0.0002, + "loss": 0.9472, + "step": 5630 + }, + { + "epoch": 7.5350701402805615, + "grad_norm": 1.2443828582763672, + "learning_rate": 0.0002, + "loss": 0.9635, + "step": 5640 + }, + { + "epoch": 7.548430193720775, + "grad_norm": 1.3739941120147705, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 5650 + }, + { + "epoch": 7.561790247160989, + "grad_norm": 1.5168178081512451, + "learning_rate": 0.0002, + "loss": 0.9538, + "step": 5660 + }, + { + "epoch": 7.575150300601202, + "grad_norm": 1.3648325204849243, + "learning_rate": 0.0002, + "loss": 0.9381, + "step": 5670 + }, + { + "epoch": 7.588510354041416, + "grad_norm": 1.308164119720459, + "learning_rate": 0.0002, + "loss": 0.9696, + "step": 5680 + }, + { + "epoch": 7.60187040748163, + "grad_norm": 1.3583498001098633, + "learning_rate": 0.0002, + "loss": 0.9889, + "step": 5690 + }, + { + "epoch": 7.615230460921843, + "grad_norm": 1.4746732711791992, + "learning_rate": 0.0002, + "loss": 0.9566, + "step": 5700 + }, + { + "epoch": 7.628590514362058, + "grad_norm": 1.3042285442352295, + "learning_rate": 0.0002, + "loss": 0.9462, + "step": 5710 + }, + { + "epoch": 7.641950567802271, + "grad_norm": 1.5272791385650635, + "learning_rate": 0.0002, + "loss": 1.0146, + "step": 5720 + }, + { + "epoch": 7.655310621242485, + "grad_norm": 1.1505831480026245, + "learning_rate": 0.0002, + "loss": 0.9795, + "step": 5730 + }, + { + "epoch": 7.668670674682699, + "grad_norm": 1.3690030574798584, + "learning_rate": 0.0002, + "loss": 0.9758, + "step": 5740 + }, + { + "epoch": 7.682030728122912, + "grad_norm": 1.3092460632324219, + "learning_rate": 0.0002, + "loss": 0.9565, + "step": 5750 + }, + { + "epoch": 7.695390781563126, + "grad_norm": 1.7011737823486328, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 5760 + }, + { + "epoch": 7.70875083500334, + "grad_norm": 1.4010082483291626, + "learning_rate": 0.0002, + "loss": 1.0261, + "step": 5770 + }, + { + "epoch": 7.722110888443554, + "grad_norm": 1.63649582862854, + "learning_rate": 0.0002, + "loss": 1.0288, + "step": 5780 + }, + { + "epoch": 7.735470941883768, + "grad_norm": 1.5091519355773926, + "learning_rate": 0.0002, + "loss": 0.9496, + "step": 5790 + }, + { + "epoch": 7.748830995323981, + "grad_norm": 1.345441460609436, + "learning_rate": 0.0002, + "loss": 0.9627, + "step": 5800 + }, + { + "epoch": 7.762191048764195, + "grad_norm": 1.461037278175354, + "learning_rate": 0.0002, + "loss": 0.9518, + "step": 5810 + }, + { + "epoch": 7.775551102204409, + "grad_norm": 1.3914507627487183, + "learning_rate": 0.0002, + "loss": 0.9544, + "step": 5820 + }, + { + "epoch": 7.788911155644622, + "grad_norm": 1.293625831604004, + "learning_rate": 0.0002, + "loss": 0.9946, + "step": 5830 + }, + { + "epoch": 7.802271209084836, + "grad_norm": 1.5641531944274902, + "learning_rate": 0.0002, + "loss": 0.9732, + "step": 5840 + }, + { + "epoch": 7.81563126252505, + "grad_norm": 1.2400811910629272, + "learning_rate": 0.0002, + "loss": 0.9952, + "step": 5850 + }, + { + "epoch": 7.828991315965264, + "grad_norm": 1.335532546043396, + "learning_rate": 0.0002, + "loss": 0.9841, + "step": 5860 + }, + { + "epoch": 7.842351369405478, + "grad_norm": 1.4629961252212524, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 5870 + }, + { + "epoch": 7.855711422845691, + "grad_norm": 1.3005847930908203, + "learning_rate": 0.0002, + "loss": 1.0813, + "step": 5880 + }, + { + "epoch": 7.869071476285905, + "grad_norm": 1.6970791816711426, + "learning_rate": 0.0002, + "loss": 1.0466, + "step": 5890 + }, + { + "epoch": 7.882431529726119, + "grad_norm": 1.6887991428375244, + "learning_rate": 0.0002, + "loss": 0.9399, + "step": 5900 + }, + { + "epoch": 7.895791583166333, + "grad_norm": 1.4156445264816284, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 5910 + }, + { + "epoch": 7.909151636606547, + "grad_norm": 1.2695387601852417, + "learning_rate": 0.0002, + "loss": 0.9856, + "step": 5920 + }, + { + "epoch": 7.92251169004676, + "grad_norm": 1.4491169452667236, + "learning_rate": 0.0002, + "loss": 0.9902, + "step": 5930 + }, + { + "epoch": 7.935871743486974, + "grad_norm": 1.4262619018554688, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 5940 + }, + { + "epoch": 7.9492317969271875, + "grad_norm": 1.5128049850463867, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 5950 + }, + { + "epoch": 7.962591850367401, + "grad_norm": 1.3630818128585815, + "learning_rate": 0.0002, + "loss": 0.9721, + "step": 5960 + }, + { + "epoch": 7.975951903807616, + "grad_norm": 1.410461187362671, + "learning_rate": 0.0002, + "loss": 1.0154, + "step": 5970 + }, + { + "epoch": 7.989311957247829, + "grad_norm": 1.4158549308776855, + "learning_rate": 0.0002, + "loss": 1.0192, + "step": 5980 + }, + { + "epoch": 7.994655978623914, + "eval_loss": 2.348383903503418, + "eval_runtime": 38.6667, + "eval_samples_per_second": 13.319, + "eval_steps_per_second": 1.681, + "step": 5984 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.76926121281323e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-5984/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4990b8b186514ae90c063249835a6a29e09a1ee8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97ce501cc4f45630cbcfbd87207b09cb69e3d8e89b6cc1e4f1c36d4de0ef02f3 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d2de8c0683b4ef92085aebc369583bd5a714d41 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3719cffecade4ebad3224eb133f26d219594c8081a7c8652a1ab9c7eb9fc20 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..79bb4713fe2b3401b996333e5c279df16227df43 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de9c5fc0e036cdd81b791ad579709aba7b58fe21596b8ec7a21bc2fc20bc6820 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f96b594f266e3eac04bbb3ede8e998deec2987d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d522cd2bba0b823893f0bc5610468bcc90c44a25295cdeeb1ed477e2c2fd955 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f8d92c3fb5cedd95fdc005242a2be638b8785461 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/trainer_state.json @@ -0,0 +1,559 @@ +{ + "best_metric": 1.8201380968093872, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748", + "epoch": 0.9993319973279893, + "eval_steps": 10, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013360053440213761, + "grad_norm": 0.5097216367721558, + "learning_rate": 0.0002, + "loss": 2.6569, + "step": 10 + }, + { + "epoch": 0.026720106880427523, + "grad_norm": 0.5924790501594543, + "learning_rate": 0.0002, + "loss": 2.2557, + "step": 20 + }, + { + "epoch": 0.04008016032064128, + "grad_norm": 0.5158102512359619, + "learning_rate": 0.0002, + "loss": 2.0626, + "step": 30 + }, + { + "epoch": 0.053440213760855046, + "grad_norm": 0.5033753514289856, + "learning_rate": 0.0002, + "loss": 1.9452, + "step": 40 + }, + { + "epoch": 0.06680026720106881, + "grad_norm": 0.5390949845314026, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 50 + }, + { + "epoch": 0.08016032064128256, + "grad_norm": 0.6376217007637024, + "learning_rate": 0.0002, + "loss": 1.937, + "step": 60 + }, + { + "epoch": 0.09352037408149633, + "grad_norm": 0.4202035069465637, + "learning_rate": 0.0002, + "loss": 1.929, + "step": 70 + }, + { + "epoch": 0.10688042752171009, + "grad_norm": 0.4269474744796753, + "learning_rate": 0.0002, + "loss": 1.811, + "step": 80 + }, + { + "epoch": 0.12024048096192384, + "grad_norm": 0.4306574761867523, + "learning_rate": 0.0002, + "loss": 1.8303, + "step": 90 + }, + { + "epoch": 0.13360053440213762, + "grad_norm": 0.5297011137008667, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 100 + }, + { + "epoch": 0.14696058784235136, + "grad_norm": 1.2313778400421143, + "learning_rate": 0.0002, + "loss": 1.864, + "step": 110 + }, + { + "epoch": 0.16032064128256512, + "grad_norm": 0.5351294279098511, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 120 + }, + { + "epoch": 0.1736806947227789, + "grad_norm": 0.4848092496395111, + "learning_rate": 0.0002, + "loss": 1.9232, + "step": 130 + }, + { + "epoch": 0.18704074816299265, + "grad_norm": 0.4339500665664673, + "learning_rate": 0.0002, + "loss": 1.8633, + "step": 140 + }, + { + "epoch": 0.20040080160320642, + "grad_norm": 0.46877285838127136, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 150 + }, + { + "epoch": 0.21376085504342018, + "grad_norm": 0.5600412487983704, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 160 + }, + { + "epoch": 0.22712090848363392, + "grad_norm": 0.3733620345592499, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 170 + }, + { + "epoch": 0.24048096192384769, + "grad_norm": 0.5116042494773865, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 180 + }, + { + "epoch": 0.25384101536406145, + "grad_norm": 0.4071602523326874, + "learning_rate": 0.0002, + "loss": 1.915, + "step": 190 + }, + { + "epoch": 0.26720106880427524, + "grad_norm": 0.44189608097076416, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 200 + }, + { + "epoch": 0.280561122244489, + "grad_norm": 0.398699015378952, + "learning_rate": 0.0002, + "loss": 1.8728, + "step": 210 + }, + { + "epoch": 0.2939211756847027, + "grad_norm": 0.3585626482963562, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 220 + }, + { + "epoch": 0.3072812291249165, + "grad_norm": 0.3811776041984558, + "learning_rate": 0.0002, + "loss": 1.8861, + "step": 230 + }, + { + "epoch": 0.32064128256513025, + "grad_norm": 0.37261509895324707, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 240 + }, + { + "epoch": 0.33400133600534404, + "grad_norm": 0.39762404561042786, + "learning_rate": 0.0002, + "loss": 1.9186, + "step": 250 + }, + { + "epoch": 0.3473613894455578, + "grad_norm": 0.3509528934955597, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 260 + }, + { + "epoch": 0.36072144288577157, + "grad_norm": 0.3169104754924774, + "learning_rate": 0.0002, + "loss": 1.7802, + "step": 270 + }, + { + "epoch": 0.3740814963259853, + "grad_norm": 0.33714795112609863, + "learning_rate": 0.0002, + "loss": 1.8038, + "step": 280 + }, + { + "epoch": 0.38744154976619904, + "grad_norm": 1.2936875820159912, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 290 + }, + { + "epoch": 0.40080160320641284, + "grad_norm": 0.3459427058696747, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 300 + }, + { + "epoch": 0.4141616566466266, + "grad_norm": 0.3380655348300934, + "learning_rate": 0.0002, + "loss": 1.8879, + "step": 310 + }, + { + "epoch": 0.42752171008684037, + "grad_norm": 0.3890381455421448, + "learning_rate": 0.0002, + "loss": 1.9196, + "step": 320 + }, + { + "epoch": 0.4408817635270541, + "grad_norm": 0.432327002286911, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 330 + }, + { + "epoch": 0.45424181696726784, + "grad_norm": 0.3736560046672821, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 340 + }, + { + "epoch": 0.46760187040748163, + "grad_norm": 0.3700982630252838, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 350 + }, + { + "epoch": 0.48096192384769537, + "grad_norm": 0.4533902406692505, + "learning_rate": 0.0002, + "loss": 1.7978, + "step": 360 + }, + { + "epoch": 0.49432197728790916, + "grad_norm": 0.35999053716659546, + "learning_rate": 0.0002, + "loss": 1.7427, + "step": 370 + }, + { + "epoch": 0.5076820307281229, + "grad_norm": 0.3490903675556183, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 380 + }, + { + "epoch": 0.5210420841683366, + "grad_norm": 0.34704291820526123, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 390 + }, + { + "epoch": 0.5344021376085505, + "grad_norm": 0.343565434217453, + "learning_rate": 0.0002, + "loss": 1.7948, + "step": 400 + }, + { + "epoch": 0.5477621910487642, + "grad_norm": 0.3573552966117859, + "learning_rate": 0.0002, + "loss": 1.8564, + "step": 410 + }, + { + "epoch": 0.561122244488978, + "grad_norm": 0.32980719208717346, + "learning_rate": 0.0002, + "loss": 1.8477, + "step": 420 + }, + { + "epoch": 0.5744822979291917, + "grad_norm": 0.356952428817749, + "learning_rate": 0.0002, + "loss": 1.9233, + "step": 430 + }, + { + "epoch": 0.5878423513694054, + "grad_norm": 0.3170869052410126, + "learning_rate": 0.0002, + "loss": 1.7433, + "step": 440 + }, + { + "epoch": 0.6012024048096193, + "grad_norm": 0.35233718156814575, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 450 + }, + { + "epoch": 0.614562458249833, + "grad_norm": 0.3480125367641449, + "learning_rate": 0.0002, + "loss": 1.8111, + "step": 460 + }, + { + "epoch": 0.6279225116900468, + "grad_norm": 0.4762810468673706, + "learning_rate": 0.0002, + "loss": 1.8386, + "step": 470 + }, + { + "epoch": 0.6412825651302605, + "grad_norm": 0.3907663822174072, + "learning_rate": 0.0002, + "loss": 1.805, + "step": 480 + }, + { + "epoch": 0.6546426185704742, + "grad_norm": 0.36315613985061646, + "learning_rate": 0.0002, + "loss": 1.8113, + "step": 490 + }, + { + "epoch": 0.6680026720106881, + "grad_norm": 0.377796471118927, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 500 + }, + { + "epoch": 0.6813627254509018, + "grad_norm": 0.34284207224845886, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 510 + }, + { + "epoch": 0.6947227788911156, + "grad_norm": 0.35563018918037415, + "learning_rate": 0.0002, + "loss": 1.8013, + "step": 520 + }, + { + "epoch": 0.7080828323313293, + "grad_norm": 0.37575867772102356, + "learning_rate": 0.0002, + "loss": 1.8414, + "step": 530 + }, + { + "epoch": 0.7214428857715431, + "grad_norm": 0.35719701647758484, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 540 + }, + { + "epoch": 0.7348029392117569, + "grad_norm": 0.385813444852829, + "learning_rate": 0.0002, + "loss": 1.7574, + "step": 550 + }, + { + "epoch": 0.7481629926519706, + "grad_norm": 0.44509607553482056, + "learning_rate": 0.0002, + "loss": 1.7985, + "step": 560 + }, + { + "epoch": 0.7615230460921844, + "grad_norm": 0.36108464002609253, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 570 + }, + { + "epoch": 0.7748830995323981, + "grad_norm": 0.3530745804309845, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 580 + }, + { + "epoch": 0.7882431529726119, + "grad_norm": 0.34888574481010437, + "learning_rate": 0.0002, + "loss": 1.7479, + "step": 590 + }, + { + "epoch": 0.8016032064128257, + "grad_norm": 0.387346476316452, + "learning_rate": 0.0002, + "loss": 1.8656, + "step": 600 + }, + { + "epoch": 0.8149632598530394, + "grad_norm": 0.3641138970851898, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 610 + }, + { + "epoch": 0.8283233132932531, + "grad_norm": 0.33729103207588196, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.8416833667334669, + "grad_norm": 0.3652004599571228, + "learning_rate": 0.0002, + "loss": 1.8613, + "step": 630 + }, + { + "epoch": 0.8550434201736807, + "grad_norm": 0.3986643850803375, + "learning_rate": 0.0002, + "loss": 1.9184, + "step": 640 + }, + { + "epoch": 0.8684034736138945, + "grad_norm": 0.3458964228630066, + "learning_rate": 0.0002, + "loss": 1.8198, + "step": 650 + }, + { + "epoch": 0.8817635270541082, + "grad_norm": 0.3559381365776062, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 660 + }, + { + "epoch": 0.895123580494322, + "grad_norm": 0.3612841069698334, + "learning_rate": 0.0002, + "loss": 1.7641, + "step": 670 + }, + { + "epoch": 0.9084836339345357, + "grad_norm": 0.34771719574928284, + "learning_rate": 0.0002, + "loss": 1.7888, + "step": 680 + }, + { + "epoch": 0.9218436873747495, + "grad_norm": 0.3371497094631195, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 690 + }, + { + "epoch": 0.9352037408149633, + "grad_norm": 0.5596055388450623, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 700 + }, + { + "epoch": 0.948563794255177, + "grad_norm": 0.311880499124527, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 710 + }, + { + "epoch": 0.9619238476953907, + "grad_norm": 0.3462068736553192, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 720 + }, + { + "epoch": 0.9752839011356046, + "grad_norm": 0.29982393980026245, + "learning_rate": 0.0002, + "loss": 1.8197, + "step": 730 + }, + { + "epoch": 0.9886439545758183, + "grad_norm": 0.34606459736824036, + "learning_rate": 0.0002, + "loss": 1.8503, + "step": 740 + }, + { + "epoch": 0.9993319973279893, + "eval_loss": 1.8201380968093872, + "eval_runtime": 38.6124, + "eval_samples_per_second": 13.338, + "eval_steps_per_second": 1.683, + "step": 748 + } + ], + "logging_steps": 10, + "max_steps": 5984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.463890404062003e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7025f98e983b93b4fec0d5370c1456c7adecaf8e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e92f0598f349930e52e126f358a3dca045c3c88312ec4b3f814f018a04982a9 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b41df7b8c98cf1ed2ef666b0cb5947b2f922af93 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/training_log.jsonl @@ -0,0 +1,10 @@ +{"epoch": 0.9993319973279893, "step": 748, "epoch_duration": 1492.3190751075745, "total_accumulated_duration": 1492.3190751075745, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6557, "grad_norm": 0.5145056247711182, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2534, "grad_norm": 0.5996711850166321, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0618, "grad_norm": 0.6517987251281738, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9439, "grad_norm": 0.5026489496231079, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9116, "grad_norm": 0.5164623856544495, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.9361, "grad_norm": 0.6341000199317932, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.9283, "grad_norm": 0.41312524676322937, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.8119, "grad_norm": 0.4311992824077606, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.831, "grad_norm": 0.4293884038925171, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8465, "grad_norm": 0.544420063495636, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.8629, "grad_norm": 1.107458233833313, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8529, "grad_norm": 0.5141991972923279, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9222, "grad_norm": 0.46840447187423706, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8624, "grad_norm": 0.4327720105648041, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.8151, "grad_norm": 0.4583268463611603, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8032, "grad_norm": 0.5514822006225586, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8161, "grad_norm": 0.36940041184425354, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8553, "grad_norm": 0.4596315622329712, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.9108, "grad_norm": 0.4041196405887604, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7961, "grad_norm": 0.6554355621337891, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8736, "grad_norm": 0.42680826783180237, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8198, "grad_norm": 0.36290431022644043, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8859, "grad_norm": 0.38558459281921387, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8367, "grad_norm": 0.3701355755329132, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9172, "grad_norm": 0.39881789684295654, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7959, "grad_norm": 0.33923494815826416, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7813, "grad_norm": 0.31573089957237244, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8025, "grad_norm": 0.3342670202255249, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.786, "grad_norm": 0.4824513792991638, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.797, "grad_norm": 0.5301131010055542, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8865, "grad_norm": 0.34163719415664673, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9192, "grad_norm": 0.4014457166194916, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8017, "grad_norm": 0.4126872420310974, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8411, "grad_norm": 0.40941300988197327, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8493, "grad_norm": 0.3820064663887024, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7975, "grad_norm": 0.4614120125770569, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35886332392692566, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7974, "grad_norm": 0.32689186930656433, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8698, "grad_norm": 0.36681291460990906, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7943, "grad_norm": 0.3464324176311493, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8571, "grad_norm": 0.37786048650741577, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8476, "grad_norm": 0.3381720781326294, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9237, "grad_norm": 0.3579493463039398, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7435, "grad_norm": 0.31530654430389404, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.355329304933548, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8077, "grad_norm": 0.35699936747550964, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8346, "grad_norm": 0.3523382842540741, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.8057, "grad_norm": 0.3932587504386902, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.81, "grad_norm": 0.3593502640724182, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.779, "grad_norm": 0.3738945722579956, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7453, "grad_norm": 0.34351104497909546, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.3606947660446167, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8421, "grad_norm": 0.3633013367652893, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7987, "grad_norm": 0.35114890336990356, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7566, "grad_norm": 0.39301058650016785, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.8001, "grad_norm": 0.4035342037677765, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7469, "grad_norm": 0.39972934126853943, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8235, "grad_norm": 0.3460000157356262, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7485, "grad_norm": 0.35099920630455017, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8635, "grad_norm": 0.3802923560142517, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.806, "grad_norm": 0.36409080028533936, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7529, "grad_norm": 0.3338017165660858, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8597, "grad_norm": 0.3630930781364441, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9177, "grad_norm": 0.36975711584091187, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.818, "grad_norm": 0.34457138180732727, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.8031, "grad_norm": 0.37342599034309387, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7642, "grad_norm": 0.3463771343231201, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7878, "grad_norm": 0.35239866375923157, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8201, "grad_norm": 0.3397367596626282, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7846, "grad_norm": 0.4075145721435547, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7798, "grad_norm": 0.3126145005226135, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8469, "grad_norm": 0.3454380929470062, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8187, "grad_norm": 0.3040635883808136, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8506, "grad_norm": 0.343362420797348, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}]} +{"epoch": 2.0, "step": 1497, "epoch_duration": 1945.510260105133, "total_accumulated_duration": 3437.8293352127075, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6557, "grad_norm": 0.5145056247711182, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2534, "grad_norm": 0.5996711850166321, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0618, "grad_norm": 0.6517987251281738, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9439, "grad_norm": 0.5026489496231079, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9116, "grad_norm": 0.5164623856544495, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.9361, "grad_norm": 0.6341000199317932, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.9283, "grad_norm": 0.41312524676322937, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.8119, "grad_norm": 0.4311992824077606, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.831, "grad_norm": 0.4293884038925171, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8465, "grad_norm": 0.544420063495636, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.8629, "grad_norm": 1.107458233833313, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8529, "grad_norm": 0.5141991972923279, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9222, "grad_norm": 0.46840447187423706, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8624, "grad_norm": 0.4327720105648041, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.8151, "grad_norm": 0.4583268463611603, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8032, "grad_norm": 0.5514822006225586, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8161, "grad_norm": 0.36940041184425354, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8553, "grad_norm": 0.4596315622329712, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.9108, "grad_norm": 0.4041196405887604, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7961, "grad_norm": 0.6554355621337891, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8736, "grad_norm": 0.42680826783180237, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8198, "grad_norm": 0.36290431022644043, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8859, "grad_norm": 0.38558459281921387, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8367, "grad_norm": 0.3701355755329132, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9172, "grad_norm": 0.39881789684295654, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7959, "grad_norm": 0.33923494815826416, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7813, "grad_norm": 0.31573089957237244, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8025, "grad_norm": 0.3342670202255249, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.786, "grad_norm": 0.4824513792991638, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.797, "grad_norm": 0.5301131010055542, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8865, "grad_norm": 0.34163719415664673, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9192, "grad_norm": 0.4014457166194916, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8017, "grad_norm": 0.4126872420310974, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8411, "grad_norm": 0.40941300988197327, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8493, "grad_norm": 0.3820064663887024, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7975, "grad_norm": 0.4614120125770569, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35886332392692566, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7974, "grad_norm": 0.32689186930656433, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8698, "grad_norm": 0.36681291460990906, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7943, "grad_norm": 0.3464324176311493, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8571, "grad_norm": 0.37786048650741577, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8476, "grad_norm": 0.3381720781326294, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9237, "grad_norm": 0.3579493463039398, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7435, "grad_norm": 0.31530654430389404, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.355329304933548, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8077, "grad_norm": 0.35699936747550964, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8346, "grad_norm": 0.3523382842540741, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.8057, "grad_norm": 0.3932587504386902, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.81, "grad_norm": 0.3593502640724182, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.779, "grad_norm": 0.3738945722579956, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7453, "grad_norm": 0.34351104497909546, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.3606947660446167, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8421, "grad_norm": 0.3633013367652893, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7987, "grad_norm": 0.35114890336990356, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7566, "grad_norm": 0.39301058650016785, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.8001, "grad_norm": 0.4035342037677765, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7469, "grad_norm": 0.39972934126853943, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8235, "grad_norm": 0.3460000157356262, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7485, "grad_norm": 0.35099920630455017, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8635, "grad_norm": 0.3802923560142517, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.806, "grad_norm": 0.36409080028533936, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7529, "grad_norm": 0.3338017165660858, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8597, "grad_norm": 0.3630930781364441, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9177, "grad_norm": 0.36975711584091187, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.818, "grad_norm": 0.34457138180732727, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.8031, "grad_norm": 0.37342599034309387, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7642, "grad_norm": 0.3463771343231201, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7878, "grad_norm": 0.35239866375923157, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8201, "grad_norm": 0.3397367596626282, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7846, "grad_norm": 0.4075145721435547, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7798, "grad_norm": 0.3126145005226135, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8469, "grad_norm": 0.3454380929470062, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8187, "grad_norm": 0.3040635883808136, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8506, "grad_norm": 0.343362420797348, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.818781852722168, "eval_runtime": 59.4981, "eval_samples_per_second": 8.656, "eval_steps_per_second": 1.092, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7778, "grad_norm": 0.3213261663913727, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7283, "grad_norm": 0.3322339653968811, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.6998, "grad_norm": 0.3345193564891815, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.8098, "grad_norm": 0.40045303106307983, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7087, "grad_norm": 0.33195099234580994, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7752, "grad_norm": 0.403153657913208, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7418, "grad_norm": 0.3418046832084656, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7246, "grad_norm": 0.3740319013595581, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7219, "grad_norm": 0.33727121353149414, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7535, "grad_norm": 0.37017568945884705, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7158, "grad_norm": 0.37981611490249634, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7144, "grad_norm": 0.38836708664894104, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35578230023384094, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7763, "grad_norm": 0.3892936408519745, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7497, "grad_norm": 0.3329775631427765, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7117, "grad_norm": 0.36562827229499817, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7547, "grad_norm": 0.3779258728027344, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7502, "grad_norm": 0.3782391846179962, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7587, "grad_norm": 0.3202662467956543, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.677, "grad_norm": 0.38925567269325256, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7717, "grad_norm": 0.3602938950061798, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7146, "grad_norm": 0.35788992047309875, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3859518766403198, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7358, "grad_norm": 0.3629772365093231, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7604, "grad_norm": 0.4093330502510071, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6922, "grad_norm": 0.398992121219635, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.744, "grad_norm": 0.3655536472797394, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7131, "grad_norm": 0.42729514837265015, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.8029, "grad_norm": 0.36032676696777344, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.7247, "grad_norm": 0.3375038802623749, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8208, "grad_norm": 0.3981837332248688, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7419, "grad_norm": 0.38041386008262634, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7461, "grad_norm": 0.3807733952999115, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6568, "grad_norm": 0.4061289131641388, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7307, "grad_norm": 0.3604201078414917, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7333, "grad_norm": 0.36671286821365356, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7449, "grad_norm": 0.4039835035800934, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7124, "grad_norm": 0.3332546651363373, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6899, "grad_norm": 0.37804311513900757, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7225, "grad_norm": 0.33254241943359375, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7158, "grad_norm": 0.3753426969051361, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6973, "grad_norm": 0.37859225273132324, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7431, "grad_norm": 0.34741416573524475, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7526, "grad_norm": 0.4349542558193207, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6441, "grad_norm": 0.33280444145202637, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7583, "grad_norm": 0.3488023579120636, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7822, "grad_norm": 0.3879616856575012, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7458, "grad_norm": 0.3678235411643982, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7598, "grad_norm": 0.35086360573768616, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7182, "grad_norm": 0.418790727853775, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7579, "grad_norm": 0.33470529317855835, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6941, "grad_norm": 0.36641785502433777, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.7267, "grad_norm": 0.35484573245048523, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7464, "grad_norm": 0.3373892605304718, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7104, "grad_norm": 0.35930702090263367, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7409, "grad_norm": 0.37268316745758057, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7859, "grad_norm": 0.36591675877571106, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.735, "grad_norm": 0.38185861706733704, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7543, "grad_norm": 0.3542157709598541, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7247, "grad_norm": 0.3794424533843994, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7868, "grad_norm": 0.43353381752967834, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7332, "grad_norm": 0.42096200585365295, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7775, "grad_norm": 0.367298424243927, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7419, "grad_norm": 0.404933899641037, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7422, "grad_norm": 0.36570486426353455, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7942, "grad_norm": 0.37176936864852905, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7882, "grad_norm": 0.36557459831237793, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7531, "grad_norm": 0.3741154968738556, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7695, "grad_norm": 0.3762620985507965, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7243, "grad_norm": 0.3563912808895111, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7986, "grad_norm": 0.37095218896865845, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6697, "grad_norm": 0.3762151002883911, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6828, "grad_norm": 0.37596359848976135, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7444, "grad_norm": 0.36733531951904297, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7042, "grad_norm": 0.38418638706207275, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}]} +{"epoch": 0.9993319973279893, "step": 748, "epoch_duration": 823.8140847682953, "total_accumulated_duration": 823.8140847682953, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}]} +{"epoch": 2.0, "step": 1497, "epoch_duration": 845.7833571434021, "total_accumulated_duration": 1669.5974419116974, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-748", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}]} +{"epoch": 2.9993319973279893, "step": 2245, "epoch_duration": 812.8899757862091, "total_accumulated_duration": 2482.4874176979065, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}]} +{"epoch": 4.0, "step": 2994, "epoch_duration": 823.2370874881744, "total_accumulated_duration": 3305.724505186081, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}, {"eval_loss": 1.834012746810913, "eval_runtime": 38.5659, "eval_samples_per_second": 13.354, "eval_steps_per_second": 1.685, "epoch": 2.9993319973279893, "step": 2245}, {"loss": 1.5582, "grad_norm": 0.428824245929718, "learning_rate": 0.0002, "epoch": 3.006012024048096, "step": 2250}, {"loss": 1.499, "grad_norm": 0.4870174825191498, "learning_rate": 0.0002, "epoch": 3.01937207748831, "step": 2260}, {"loss": 1.4872, "grad_norm": 0.4684266149997711, "learning_rate": 0.0002, "epoch": 3.032732130928524, "step": 2270}, {"loss": 1.5284, "grad_norm": 0.581604540348053, "learning_rate": 0.0002, "epoch": 3.0460921843687374, "step": 2280}, {"loss": 1.4549, "grad_norm": 0.5561677813529968, "learning_rate": 0.0002, "epoch": 3.059452237808951, "step": 2290}, {"loss": 1.4903, "grad_norm": 0.5750220417976379, "learning_rate": 0.0002, "epoch": 3.072812291249165, "step": 2300}, {"loss": 1.5903, "grad_norm": 0.5704626441001892, "learning_rate": 0.0002, "epoch": 3.0861723446893787, "step": 2310}, {"loss": 1.4292, "grad_norm": 0.6242083311080933, "learning_rate": 0.0002, "epoch": 3.0995323981295924, "step": 2320}, {"loss": 1.5092, "grad_norm": 0.5174121260643005, "learning_rate": 0.0002, "epoch": 3.1128924515698064, "step": 2330}, {"loss": 1.5106, "grad_norm": 0.5697633028030396, "learning_rate": 0.0002, "epoch": 3.12625250501002, "step": 2340}, {"loss": 1.5156, "grad_norm": 0.5969541072845459, "learning_rate": 0.0002, "epoch": 3.1396125584502337, "step": 2350}, {"loss": 1.52, "grad_norm": 0.6244304180145264, "learning_rate": 0.0002, "epoch": 3.1529726118904478, "step": 2360}, {"loss": 1.5244, "grad_norm": 0.5561705827713013, "learning_rate": 0.0002, "epoch": 3.1663326653306614, "step": 2370}, {"loss": 1.6169, "grad_norm": 0.5401188135147095, "learning_rate": 0.0002, "epoch": 3.179692718770875, "step": 2380}, {"loss": 1.5387, "grad_norm": 0.6450421810150146, "learning_rate": 0.0002, "epoch": 3.1930527722110886, "step": 2390}, {"loss": 1.4839, "grad_norm": 0.5741903185844421, "learning_rate": 0.0002, "epoch": 3.2064128256513027, "step": 2400}, {"loss": 1.5584, "grad_norm": 0.6337407231330872, "learning_rate": 0.0002, "epoch": 3.2197728790915163, "step": 2410}, {"loss": 1.5025, "grad_norm": 0.6493517160415649, "learning_rate": 0.0002, "epoch": 3.23313293253173, "step": 2420}, {"loss": 1.5168, "grad_norm": 0.6230176091194153, "learning_rate": 0.0002, "epoch": 3.246492985971944, "step": 2430}, {"loss": 1.5408, "grad_norm": 0.680704653263092, "learning_rate": 0.0002, "epoch": 3.2598530394121576, "step": 2440}, {"loss": 1.6005, "grad_norm": 0.5279417037963867, "learning_rate": 0.0002, "epoch": 3.2732130928523713, "step": 2450}, {"loss": 1.5231, "grad_norm": 0.5601515173912048, "learning_rate": 0.0002, "epoch": 3.2865731462925853, "step": 2460}, {"loss": 1.4949, "grad_norm": 0.5591090321540833, "learning_rate": 0.0002, "epoch": 3.299933199732799, "step": 2470}, {"loss": 1.5181, "grad_norm": 0.6596529483795166, "learning_rate": 0.0002, "epoch": 3.3132932531730126, "step": 2480}, {"loss": 1.5259, "grad_norm": 0.6115918755531311, "learning_rate": 0.0002, "epoch": 3.3266533066132267, "step": 2490}, {"loss": 1.5344, "grad_norm": 0.6443548202514648, "learning_rate": 0.0002, "epoch": 3.3400133600534403, "step": 2500}, {"loss": 1.5037, "grad_norm": 0.5504242181777954, "learning_rate": 0.0002, "epoch": 3.353373413493654, "step": 2510}, {"loss": 1.5049, "grad_norm": 0.6104483604431152, "learning_rate": 0.0002, "epoch": 3.3667334669338675, "step": 2520}, {"loss": 1.587, "grad_norm": 0.8387531638145447, "learning_rate": 0.0002, "epoch": 3.3800935203740816, "step": 2530}, {"loss": 1.5227, "grad_norm": 0.6346094012260437, "learning_rate": 0.0002, "epoch": 3.3934535738142952, "step": 2540}, {"loss": 1.4855, "grad_norm": 0.6261265873908997, "learning_rate": 0.0002, "epoch": 3.406813627254509, "step": 2550}, {"loss": 1.5233, "grad_norm": 0.5960372090339661, "learning_rate": 0.0002, "epoch": 3.420173680694723, "step": 2560}, {"loss": 1.5153, "grad_norm": 0.5291280746459961, "learning_rate": 0.0002, "epoch": 3.4335337341349366, "step": 2570}, {"loss": 1.5152, "grad_norm": 0.6133161783218384, "learning_rate": 0.0002, "epoch": 3.44689378757515, "step": 2580}, {"loss": 1.5533, "grad_norm": 0.623573362827301, "learning_rate": 0.0002, "epoch": 3.460253841015364, "step": 2590}, {"loss": 1.4935, "grad_norm": 0.5959834456443787, "learning_rate": 0.0002, "epoch": 3.473613894455578, "step": 2600}, {"loss": 1.5792, "grad_norm": 0.583332359790802, "learning_rate": 0.0002, "epoch": 3.4869739478957915, "step": 2610}, {"loss": 1.5229, "grad_norm": 0.6003559231758118, "learning_rate": 0.0002, "epoch": 3.5003340013360056, "step": 2620}, {"loss": 1.4901, "grad_norm": 0.5832992196083069, "learning_rate": 0.0002, "epoch": 3.513694054776219, "step": 2630}, {"loss": 1.5005, "grad_norm": 0.5942609906196594, "learning_rate": 0.0002, "epoch": 3.527054108216433, "step": 2640}, {"loss": 1.5213, "grad_norm": 0.6087163686752319, "learning_rate": 0.0002, "epoch": 3.5404141616566465, "step": 2650}, {"loss": 1.5826, "grad_norm": 0.631948709487915, "learning_rate": 0.0002, "epoch": 3.5537742150968605, "step": 2660}, {"loss": 1.5844, "grad_norm": 0.6450803279876709, "learning_rate": 0.0002, "epoch": 3.567134268537074, "step": 2670}, {"loss": 1.4981, "grad_norm": 0.6507797837257385, "learning_rate": 0.0002, "epoch": 3.580494321977288, "step": 2680}, {"loss": 1.5826, "grad_norm": 0.5778017044067383, "learning_rate": 0.0002, "epoch": 3.593854375417502, "step": 2690}, {"loss": 1.4688, "grad_norm": 0.6214032173156738, "learning_rate": 0.0002, "epoch": 3.6072144288577155, "step": 2700}, {"loss": 1.5084, "grad_norm": 0.5681133270263672, "learning_rate": 0.0002, "epoch": 3.620574482297929, "step": 2710}, {"loss": 1.471, "grad_norm": 0.6074244976043701, "learning_rate": 0.0002, "epoch": 3.6339345357381427, "step": 2720}, {"loss": 1.5243, "grad_norm": 0.5900560617446899, "learning_rate": 0.0002, "epoch": 3.647294589178357, "step": 2730}, {"loss": 1.5074, "grad_norm": 0.5817505717277527, "learning_rate": 0.0002, "epoch": 3.6606546426185704, "step": 2740}, {"loss": 1.5117, "grad_norm": 0.6095547676086426, "learning_rate": 0.0002, "epoch": 3.6740146960587845, "step": 2750}, {"loss": 1.5117, "grad_norm": 0.612790584564209, "learning_rate": 0.0002, "epoch": 3.687374749498998, "step": 2760}, {"loss": 1.4976, "grad_norm": 0.6574140787124634, "learning_rate": 0.0002, "epoch": 3.7007348029392118, "step": 2770}, {"loss": 1.5306, "grad_norm": 0.5643761157989502, "learning_rate": 0.0002, "epoch": 3.7140948563794254, "step": 2780}, {"loss": 1.5751, "grad_norm": 0.5652621388435364, "learning_rate": 0.0002, "epoch": 3.727454909819639, "step": 2790}, {"loss": 1.5262, "grad_norm": 0.5604206323623657, "learning_rate": 0.0002, "epoch": 3.740814963259853, "step": 2800}, {"loss": 1.5013, "grad_norm": 3.911022663116455, "learning_rate": 0.0002, "epoch": 3.7541750167000667, "step": 2810}, {"loss": 1.5793, "grad_norm": 0.6148333549499512, "learning_rate": 0.0002, "epoch": 3.7675350701402808, "step": 2820}, {"loss": 1.5122, "grad_norm": 0.5605677962303162, "learning_rate": 0.0002, "epoch": 3.7808951235804944, "step": 2830}, {"loss": 1.5659, "grad_norm": 0.6101965308189392, "learning_rate": 0.0002, "epoch": 3.794255177020708, "step": 2840}, {"loss": 1.5618, "grad_norm": 0.5387342572212219, "learning_rate": 0.0002, "epoch": 3.8076152304609217, "step": 2850}, {"loss": 1.5193, "grad_norm": 0.5733087062835693, "learning_rate": 0.0002, "epoch": 3.8209752839011357, "step": 2860}, {"loss": 1.5545, "grad_norm": 0.6538485884666443, "learning_rate": 0.0002, "epoch": 3.8343353373413493, "step": 2870}, {"loss": 1.523, "grad_norm": 0.6247632503509521, "learning_rate": 0.0002, "epoch": 3.847695390781563, "step": 2880}, {"loss": 1.5591, "grad_norm": 0.5745735764503479, "learning_rate": 0.0002, "epoch": 3.861055444221777, "step": 2890}, {"loss": 1.5706, "grad_norm": 0.5942763686180115, "learning_rate": 0.0002, "epoch": 3.8744154976619907, "step": 2900}, {"loss": 1.564, "grad_norm": 0.7086281776428223, "learning_rate": 0.0002, "epoch": 3.8877755511022043, "step": 2910}, {"loss": 1.5526, "grad_norm": 0.8825129866600037, "learning_rate": 0.0002, "epoch": 3.901135604542418, "step": 2920}, {"loss": 1.4519, "grad_norm": 0.6260842680931091, "learning_rate": 0.0002, "epoch": 3.914495657982632, "step": 2930}, {"loss": 1.5433, "grad_norm": 0.6015968322753906, "learning_rate": 0.0002, "epoch": 3.9278557114228456, "step": 2940}, {"loss": 1.4931, "grad_norm": 0.7042809128761292, "learning_rate": 0.0002, "epoch": 3.9412157648630597, "step": 2950}, {"loss": 1.5596, "grad_norm": 0.5860083699226379, "learning_rate": 0.0002, "epoch": 3.9545758183032733, "step": 2960}, {"loss": 1.565, "grad_norm": 0.5939757823944092, "learning_rate": 0.0002, "epoch": 3.967935871743487, "step": 2970}, {"loss": 1.408, "grad_norm": 0.5523964166641235, "learning_rate": 0.0002, "epoch": 3.9812959251837006, "step": 2980}, {"loss": 1.5629, "grad_norm": 0.6380264759063721, "learning_rate": 0.0002, "epoch": 3.9946559786239146, "step": 2990}]} +{"epoch": 4.999331997327989, "step": 3742, "epoch_duration": 847.4992234706879, "total_accumulated_duration": 4153.223728656769, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}, {"eval_loss": 1.834012746810913, "eval_runtime": 38.5659, "eval_samples_per_second": 13.354, "eval_steps_per_second": 1.685, "epoch": 2.9993319973279893, "step": 2245}, {"loss": 1.5582, "grad_norm": 0.428824245929718, "learning_rate": 0.0002, "epoch": 3.006012024048096, "step": 2250}, {"loss": 1.499, "grad_norm": 0.4870174825191498, "learning_rate": 0.0002, "epoch": 3.01937207748831, "step": 2260}, {"loss": 1.4872, "grad_norm": 0.4684266149997711, "learning_rate": 0.0002, "epoch": 3.032732130928524, "step": 2270}, {"loss": 1.5284, "grad_norm": 0.581604540348053, "learning_rate": 0.0002, "epoch": 3.0460921843687374, "step": 2280}, {"loss": 1.4549, "grad_norm": 0.5561677813529968, "learning_rate": 0.0002, "epoch": 3.059452237808951, "step": 2290}, {"loss": 1.4903, "grad_norm": 0.5750220417976379, "learning_rate": 0.0002, "epoch": 3.072812291249165, "step": 2300}, {"loss": 1.5903, "grad_norm": 0.5704626441001892, "learning_rate": 0.0002, "epoch": 3.0861723446893787, "step": 2310}, {"loss": 1.4292, "grad_norm": 0.6242083311080933, "learning_rate": 0.0002, "epoch": 3.0995323981295924, "step": 2320}, {"loss": 1.5092, "grad_norm": 0.5174121260643005, "learning_rate": 0.0002, "epoch": 3.1128924515698064, "step": 2330}, {"loss": 1.5106, "grad_norm": 0.5697633028030396, "learning_rate": 0.0002, "epoch": 3.12625250501002, "step": 2340}, {"loss": 1.5156, "grad_norm": 0.5969541072845459, "learning_rate": 0.0002, "epoch": 3.1396125584502337, "step": 2350}, {"loss": 1.52, "grad_norm": 0.6244304180145264, "learning_rate": 0.0002, "epoch": 3.1529726118904478, "step": 2360}, {"loss": 1.5244, "grad_norm": 0.5561705827713013, "learning_rate": 0.0002, "epoch": 3.1663326653306614, "step": 2370}, {"loss": 1.6169, "grad_norm": 0.5401188135147095, "learning_rate": 0.0002, "epoch": 3.179692718770875, "step": 2380}, {"loss": 1.5387, "grad_norm": 0.6450421810150146, "learning_rate": 0.0002, "epoch": 3.1930527722110886, "step": 2390}, {"loss": 1.4839, "grad_norm": 0.5741903185844421, "learning_rate": 0.0002, "epoch": 3.2064128256513027, "step": 2400}, {"loss": 1.5584, "grad_norm": 0.6337407231330872, "learning_rate": 0.0002, "epoch": 3.2197728790915163, "step": 2410}, {"loss": 1.5025, "grad_norm": 0.6493517160415649, "learning_rate": 0.0002, "epoch": 3.23313293253173, "step": 2420}, {"loss": 1.5168, "grad_norm": 0.6230176091194153, "learning_rate": 0.0002, "epoch": 3.246492985971944, "step": 2430}, {"loss": 1.5408, "grad_norm": 0.680704653263092, "learning_rate": 0.0002, "epoch": 3.2598530394121576, "step": 2440}, {"loss": 1.6005, "grad_norm": 0.5279417037963867, "learning_rate": 0.0002, "epoch": 3.2732130928523713, "step": 2450}, {"loss": 1.5231, "grad_norm": 0.5601515173912048, "learning_rate": 0.0002, "epoch": 3.2865731462925853, "step": 2460}, {"loss": 1.4949, "grad_norm": 0.5591090321540833, "learning_rate": 0.0002, "epoch": 3.299933199732799, "step": 2470}, {"loss": 1.5181, "grad_norm": 0.6596529483795166, "learning_rate": 0.0002, "epoch": 3.3132932531730126, "step": 2480}, {"loss": 1.5259, "grad_norm": 0.6115918755531311, "learning_rate": 0.0002, "epoch": 3.3266533066132267, "step": 2490}, {"loss": 1.5344, "grad_norm": 0.6443548202514648, "learning_rate": 0.0002, "epoch": 3.3400133600534403, "step": 2500}, {"loss": 1.5037, "grad_norm": 0.5504242181777954, "learning_rate": 0.0002, "epoch": 3.353373413493654, "step": 2510}, {"loss": 1.5049, "grad_norm": 0.6104483604431152, "learning_rate": 0.0002, "epoch": 3.3667334669338675, "step": 2520}, {"loss": 1.587, "grad_norm": 0.8387531638145447, "learning_rate": 0.0002, "epoch": 3.3800935203740816, "step": 2530}, {"loss": 1.5227, "grad_norm": 0.6346094012260437, "learning_rate": 0.0002, "epoch": 3.3934535738142952, "step": 2540}, {"loss": 1.4855, "grad_norm": 0.6261265873908997, "learning_rate": 0.0002, "epoch": 3.406813627254509, "step": 2550}, {"loss": 1.5233, "grad_norm": 0.5960372090339661, "learning_rate": 0.0002, "epoch": 3.420173680694723, "step": 2560}, {"loss": 1.5153, "grad_norm": 0.5291280746459961, "learning_rate": 0.0002, "epoch": 3.4335337341349366, "step": 2570}, {"loss": 1.5152, "grad_norm": 0.6133161783218384, "learning_rate": 0.0002, "epoch": 3.44689378757515, "step": 2580}, {"loss": 1.5533, "grad_norm": 0.623573362827301, "learning_rate": 0.0002, "epoch": 3.460253841015364, "step": 2590}, {"loss": 1.4935, "grad_norm": 0.5959834456443787, "learning_rate": 0.0002, "epoch": 3.473613894455578, "step": 2600}, {"loss": 1.5792, "grad_norm": 0.583332359790802, "learning_rate": 0.0002, "epoch": 3.4869739478957915, "step": 2610}, {"loss": 1.5229, "grad_norm": 0.6003559231758118, "learning_rate": 0.0002, "epoch": 3.5003340013360056, "step": 2620}, {"loss": 1.4901, "grad_norm": 0.5832992196083069, "learning_rate": 0.0002, "epoch": 3.513694054776219, "step": 2630}, {"loss": 1.5005, "grad_norm": 0.5942609906196594, "learning_rate": 0.0002, "epoch": 3.527054108216433, "step": 2640}, {"loss": 1.5213, "grad_norm": 0.6087163686752319, "learning_rate": 0.0002, "epoch": 3.5404141616566465, "step": 2650}, {"loss": 1.5826, "grad_norm": 0.631948709487915, "learning_rate": 0.0002, "epoch": 3.5537742150968605, "step": 2660}, {"loss": 1.5844, "grad_norm": 0.6450803279876709, "learning_rate": 0.0002, "epoch": 3.567134268537074, "step": 2670}, {"loss": 1.4981, "grad_norm": 0.6507797837257385, "learning_rate": 0.0002, "epoch": 3.580494321977288, "step": 2680}, {"loss": 1.5826, "grad_norm": 0.5778017044067383, "learning_rate": 0.0002, "epoch": 3.593854375417502, "step": 2690}, {"loss": 1.4688, "grad_norm": 0.6214032173156738, "learning_rate": 0.0002, "epoch": 3.6072144288577155, "step": 2700}, {"loss": 1.5084, "grad_norm": 0.5681133270263672, "learning_rate": 0.0002, "epoch": 3.620574482297929, "step": 2710}, {"loss": 1.471, "grad_norm": 0.6074244976043701, "learning_rate": 0.0002, "epoch": 3.6339345357381427, "step": 2720}, {"loss": 1.5243, "grad_norm": 0.5900560617446899, "learning_rate": 0.0002, "epoch": 3.647294589178357, "step": 2730}, {"loss": 1.5074, "grad_norm": 0.5817505717277527, "learning_rate": 0.0002, "epoch": 3.6606546426185704, "step": 2740}, {"loss": 1.5117, "grad_norm": 0.6095547676086426, "learning_rate": 0.0002, "epoch": 3.6740146960587845, "step": 2750}, {"loss": 1.5117, "grad_norm": 0.612790584564209, "learning_rate": 0.0002, "epoch": 3.687374749498998, "step": 2760}, {"loss": 1.4976, "grad_norm": 0.6574140787124634, "learning_rate": 0.0002, "epoch": 3.7007348029392118, "step": 2770}, {"loss": 1.5306, "grad_norm": 0.5643761157989502, "learning_rate": 0.0002, "epoch": 3.7140948563794254, "step": 2780}, {"loss": 1.5751, "grad_norm": 0.5652621388435364, "learning_rate": 0.0002, "epoch": 3.727454909819639, "step": 2790}, {"loss": 1.5262, "grad_norm": 0.5604206323623657, "learning_rate": 0.0002, "epoch": 3.740814963259853, "step": 2800}, {"loss": 1.5013, "grad_norm": 3.911022663116455, "learning_rate": 0.0002, "epoch": 3.7541750167000667, "step": 2810}, {"loss": 1.5793, "grad_norm": 0.6148333549499512, "learning_rate": 0.0002, "epoch": 3.7675350701402808, "step": 2820}, {"loss": 1.5122, "grad_norm": 0.5605677962303162, "learning_rate": 0.0002, "epoch": 3.7808951235804944, "step": 2830}, {"loss": 1.5659, "grad_norm": 0.6101965308189392, "learning_rate": 0.0002, "epoch": 3.794255177020708, "step": 2840}, {"loss": 1.5618, "grad_norm": 0.5387342572212219, "learning_rate": 0.0002, "epoch": 3.8076152304609217, "step": 2850}, {"loss": 1.5193, "grad_norm": 0.5733087062835693, "learning_rate": 0.0002, "epoch": 3.8209752839011357, "step": 2860}, {"loss": 1.5545, "grad_norm": 0.6538485884666443, "learning_rate": 0.0002, "epoch": 3.8343353373413493, "step": 2870}, {"loss": 1.523, "grad_norm": 0.6247632503509521, "learning_rate": 0.0002, "epoch": 3.847695390781563, "step": 2880}, {"loss": 1.5591, "grad_norm": 0.5745735764503479, "learning_rate": 0.0002, "epoch": 3.861055444221777, "step": 2890}, {"loss": 1.5706, "grad_norm": 0.5942763686180115, "learning_rate": 0.0002, "epoch": 3.8744154976619907, "step": 2900}, {"loss": 1.564, "grad_norm": 0.7086281776428223, "learning_rate": 0.0002, "epoch": 3.8877755511022043, "step": 2910}, {"loss": 1.5526, "grad_norm": 0.8825129866600037, "learning_rate": 0.0002, "epoch": 3.901135604542418, "step": 2920}, {"loss": 1.4519, "grad_norm": 0.6260842680931091, "learning_rate": 0.0002, "epoch": 3.914495657982632, "step": 2930}, {"loss": 1.5433, "grad_norm": 0.6015968322753906, "learning_rate": 0.0002, "epoch": 3.9278557114228456, "step": 2940}, {"loss": 1.4931, "grad_norm": 0.7042809128761292, "learning_rate": 0.0002, "epoch": 3.9412157648630597, "step": 2950}, {"loss": 1.5596, "grad_norm": 0.5860083699226379, "learning_rate": 0.0002, "epoch": 3.9545758183032733, "step": 2960}, {"loss": 1.565, "grad_norm": 0.5939757823944092, "learning_rate": 0.0002, "epoch": 3.967935871743487, "step": 2970}, {"loss": 1.408, "grad_norm": 0.5523964166641235, "learning_rate": 0.0002, "epoch": 3.9812959251837006, "step": 2980}, {"loss": 1.5629, "grad_norm": 0.6380264759063721, "learning_rate": 0.0002, "epoch": 3.9946559786239146, "step": 2990}, {"eval_loss": 1.8875294923782349, "eval_runtime": 38.5837, "eval_samples_per_second": 13.348, "eval_steps_per_second": 1.685, "epoch": 4.0, "step": 2994}, {"loss": 1.4002, "grad_norm": 0.5478564500808716, "learning_rate": 0.0002, "epoch": 4.008016032064128, "step": 3000}, {"loss": 1.436, "grad_norm": 0.9384379982948303, "learning_rate": 0.0002, "epoch": 4.021376085504342, "step": 3010}, {"loss": 1.4127, "grad_norm": 0.7819344401359558, "learning_rate": 0.0002, "epoch": 4.034736138944556, "step": 3020}, {"loss": 1.326, "grad_norm": 0.7737417817115784, "learning_rate": 0.0002, "epoch": 4.04809619238477, "step": 3030}, {"loss": 1.3203, "grad_norm": 0.8893805742263794, "learning_rate": 0.0002, "epoch": 4.061456245824983, "step": 3040}, {"loss": 1.3913, "grad_norm": 0.7759843468666077, "learning_rate": 0.0002, "epoch": 4.074816299265197, "step": 3050}, {"loss": 1.2941, "grad_norm": 0.642654538154602, "learning_rate": 0.0002, "epoch": 4.0881763527054105, "step": 3060}, {"loss": 1.3204, "grad_norm": 0.8515549302101135, "learning_rate": 0.0002, "epoch": 4.101536406145625, "step": 3070}, {"loss": 1.3683, "grad_norm": 0.7033658623695374, "learning_rate": 0.0002, "epoch": 4.114896459585839, "step": 3080}, {"loss": 1.4159, "grad_norm": 0.7063882946968079, "learning_rate": 0.0002, "epoch": 4.128256513026052, "step": 3090}, {"loss": 1.384, "grad_norm": 0.6946853995323181, "learning_rate": 0.0002, "epoch": 4.141616566466266, "step": 3100}, {"loss": 1.3689, "grad_norm": 0.7286741137504578, "learning_rate": 0.0002, "epoch": 4.1549766199064795, "step": 3110}, {"loss": 1.3061, "grad_norm": 0.7894193530082703, "learning_rate": 0.0002, "epoch": 4.168336673346693, "step": 3120}, {"loss": 1.3346, "grad_norm": 0.7005895376205444, "learning_rate": 0.0002, "epoch": 4.181696726786907, "step": 3130}, {"loss": 1.3834, "grad_norm": 0.799567461013794, "learning_rate": 0.0002, "epoch": 4.195056780227121, "step": 3140}, {"loss": 1.3813, "grad_norm": 0.7010157108306885, "learning_rate": 0.0002, "epoch": 4.208416833667335, "step": 3150}, {"loss": 1.3637, "grad_norm": 0.7489650249481201, "learning_rate": 0.0002, "epoch": 4.2217768871075485, "step": 3160}, {"loss": 1.3546, "grad_norm": 0.7908048629760742, "learning_rate": 0.0002, "epoch": 4.235136940547762, "step": 3170}, {"loss": 1.3073, "grad_norm": 0.7002180814743042, "learning_rate": 0.0002, "epoch": 4.248496993987976, "step": 3180}, {"loss": 1.4525, "grad_norm": 0.8339495062828064, "learning_rate": 0.0002, "epoch": 4.261857047428189, "step": 3190}, {"loss": 1.3471, "grad_norm": 0.7884618043899536, "learning_rate": 0.0002, "epoch": 4.275217100868403, "step": 3200}, {"loss": 1.4261, "grad_norm": 0.7964122295379639, "learning_rate": 0.0002, "epoch": 4.2885771543086175, "step": 3210}, {"loss": 1.3506, "grad_norm": 0.838646650314331, "learning_rate": 0.0002, "epoch": 4.301937207748831, "step": 3220}, {"loss": 1.3738, "grad_norm": 0.8063107132911682, "learning_rate": 0.0002, "epoch": 4.315297261189045, "step": 3230}, {"loss": 1.3769, "grad_norm": 0.8147385120391846, "learning_rate": 0.0002, "epoch": 4.328657314629258, "step": 3240}, {"loss": 1.4118, "grad_norm": 0.7636798620223999, "learning_rate": 0.0002, "epoch": 4.342017368069472, "step": 3250}, {"loss": 1.3698, "grad_norm": 0.7530609965324402, "learning_rate": 0.0002, "epoch": 4.355377421509686, "step": 3260}, {"loss": 1.3507, "grad_norm": 0.8853573799133301, "learning_rate": 0.0002, "epoch": 4.3687374749499, "step": 3270}, {"loss": 1.3614, "grad_norm": 0.7180975675582886, "learning_rate": 0.0002, "epoch": 4.382097528390114, "step": 3280}, {"loss": 1.4119, "grad_norm": 0.837150514125824, "learning_rate": 0.0002, "epoch": 4.395457581830327, "step": 3290}, {"loss": 1.461, "grad_norm": 0.8370638489723206, "learning_rate": 0.0002, "epoch": 4.408817635270541, "step": 3300}, {"loss": 1.4478, "grad_norm": 0.7738229036331177, "learning_rate": 0.0002, "epoch": 4.422177688710755, "step": 3310}, {"loss": 1.4195, "grad_norm": 0.7665290832519531, "learning_rate": 0.0002, "epoch": 4.435537742150968, "step": 3320}, {"loss": 1.3308, "grad_norm": 0.7547745704650879, "learning_rate": 0.0002, "epoch": 4.448897795591183, "step": 3330}, {"loss": 1.4165, "grad_norm": 0.7421861290931702, "learning_rate": 0.0002, "epoch": 4.462257849031396, "step": 3340}, {"loss": 1.4244, "grad_norm": 0.8042104244232178, "learning_rate": 0.0002, "epoch": 4.47561790247161, "step": 3350}, {"loss": 1.365, "grad_norm": 0.8111839890480042, "learning_rate": 0.0002, "epoch": 4.488977955911824, "step": 3360}, {"loss": 1.3537, "grad_norm": 0.7998340129852295, "learning_rate": 0.0002, "epoch": 4.502338009352037, "step": 3370}, {"loss": 1.3812, "grad_norm": 0.7668877243995667, "learning_rate": 0.0002, "epoch": 4.515698062792251, "step": 3380}, {"loss": 1.3972, "grad_norm": 0.7986718416213989, "learning_rate": 0.0002, "epoch": 4.529058116232465, "step": 3390}, {"loss": 1.3582, "grad_norm": 0.6806602478027344, "learning_rate": 0.0002, "epoch": 4.542418169672679, "step": 3400}, {"loss": 1.3942, "grad_norm": 0.8788819909095764, "learning_rate": 0.0002, "epoch": 4.555778223112893, "step": 3410}, {"loss": 1.3379, "grad_norm": 0.7499664425849915, "learning_rate": 0.0002, "epoch": 4.569138276553106, "step": 3420}, {"loss": 1.3823, "grad_norm": 0.7967109084129333, "learning_rate": 0.0002, "epoch": 4.58249832999332, "step": 3430}, {"loss": 1.3531, "grad_norm": 0.759639322757721, "learning_rate": 0.0002, "epoch": 4.595858383433534, "step": 3440}, {"loss": 1.3517, "grad_norm": 0.8327916264533997, "learning_rate": 0.0002, "epoch": 4.609218436873747, "step": 3450}, {"loss": 1.4619, "grad_norm": 0.7400892376899719, "learning_rate": 0.0002, "epoch": 4.622578490313961, "step": 3460}, {"loss": 1.3374, "grad_norm": 0.8116602301597595, "learning_rate": 0.0002, "epoch": 4.635938543754175, "step": 3470}, {"loss": 1.4445, "grad_norm": 0.7604362368583679, "learning_rate": 0.0002, "epoch": 4.649298597194389, "step": 3480}, {"loss": 1.3724, "grad_norm": 0.7397996783256531, "learning_rate": 0.0002, "epoch": 4.662658650634603, "step": 3490}, {"loss": 1.4048, "grad_norm": 0.869293749332428, "learning_rate": 0.0002, "epoch": 4.676018704074816, "step": 3500}, {"loss": 1.3873, "grad_norm": 0.6854358315467834, "learning_rate": 0.0002, "epoch": 4.68937875751503, "step": 3510}, {"loss": 1.3413, "grad_norm": 0.8326661586761475, "learning_rate": 0.0002, "epoch": 4.7027388109552435, "step": 3520}, {"loss": 1.3666, "grad_norm": 0.6887506246566772, "learning_rate": 0.0002, "epoch": 4.716098864395457, "step": 3530}, {"loss": 1.4508, "grad_norm": 3.837689161300659, "learning_rate": 0.0002, "epoch": 4.729458917835672, "step": 3540}, {"loss": 1.3775, "grad_norm": 0.6874563694000244, "learning_rate": 0.0002, "epoch": 4.742818971275885, "step": 3550}, {"loss": 1.3643, "grad_norm": 0.8340407609939575, "learning_rate": 0.0002, "epoch": 4.756179024716099, "step": 3560}, {"loss": 1.3556, "grad_norm": 0.7286418676376343, "learning_rate": 0.0002, "epoch": 4.7695390781563125, "step": 3570}, {"loss": 1.4338, "grad_norm": 0.7239373326301575, "learning_rate": 0.0002, "epoch": 4.782899131596526, "step": 3580}, {"loss": 1.4697, "grad_norm": 0.831310510635376, "learning_rate": 0.0002, "epoch": 4.796259185036741, "step": 3590}, {"loss": 1.4146, "grad_norm": 0.767715573310852, "learning_rate": 0.0002, "epoch": 4.809619238476954, "step": 3600}, {"loss": 1.4199, "grad_norm": 0.9013199210166931, "learning_rate": 0.0002, "epoch": 4.822979291917168, "step": 3610}, {"loss": 1.4513, "grad_norm": 0.7543512582778931, "learning_rate": 0.0002, "epoch": 4.8363393453573815, "step": 3620}, {"loss": 1.4218, "grad_norm": 0.7626057267189026, "learning_rate": 0.0002, "epoch": 4.849699398797595, "step": 3630}, {"loss": 1.4102, "grad_norm": 0.847079336643219, "learning_rate": 0.0002, "epoch": 4.863059452237809, "step": 3640}, {"loss": 1.5014, "grad_norm": 0.8273295760154724, "learning_rate": 0.0002, "epoch": 4.876419505678022, "step": 3650}, {"loss": 1.3806, "grad_norm": 0.7675244808197021, "learning_rate": 0.0002, "epoch": 4.889779559118237, "step": 3660}, {"loss": 1.4894, "grad_norm": 0.9560356736183167, "learning_rate": 0.0002, "epoch": 4.9031396125584505, "step": 3670}, {"loss": 1.4044, "grad_norm": 0.7682451605796814, "learning_rate": 0.0002, "epoch": 4.916499665998664, "step": 3680}, {"loss": 1.342, "grad_norm": 0.8113830089569092, "learning_rate": 0.0002, "epoch": 4.929859719438878, "step": 3690}, {"loss": 1.3559, "grad_norm": 0.7642542719841003, "learning_rate": 0.0002, "epoch": 4.943219772879091, "step": 3700}, {"loss": 1.403, "grad_norm": 0.823863685131073, "learning_rate": 0.0002, "epoch": 4.956579826319305, "step": 3710}, {"loss": 1.464, "grad_norm": 0.8287797570228577, "learning_rate": 0.0002, "epoch": 4.969939879759519, "step": 3720}, {"loss": 1.4139, "grad_norm": 0.778170108795166, "learning_rate": 0.0002, "epoch": 4.983299933199733, "step": 3730}, {"loss": 1.4218, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 4.996659986639947, "step": 3740}]} +{"epoch": 6.0, "step": 4491, "epoch_duration": 811.1629409790039, "total_accumulated_duration": 4964.386669635773, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}, {"eval_loss": 1.834012746810913, "eval_runtime": 38.5659, "eval_samples_per_second": 13.354, "eval_steps_per_second": 1.685, "epoch": 2.9993319973279893, "step": 2245}, {"loss": 1.5582, "grad_norm": 0.428824245929718, "learning_rate": 0.0002, "epoch": 3.006012024048096, "step": 2250}, {"loss": 1.499, "grad_norm": 0.4870174825191498, "learning_rate": 0.0002, "epoch": 3.01937207748831, "step": 2260}, {"loss": 1.4872, "grad_norm": 0.4684266149997711, "learning_rate": 0.0002, "epoch": 3.032732130928524, "step": 2270}, {"loss": 1.5284, "grad_norm": 0.581604540348053, "learning_rate": 0.0002, "epoch": 3.0460921843687374, "step": 2280}, {"loss": 1.4549, "grad_norm": 0.5561677813529968, "learning_rate": 0.0002, "epoch": 3.059452237808951, "step": 2290}, {"loss": 1.4903, "grad_norm": 0.5750220417976379, "learning_rate": 0.0002, "epoch": 3.072812291249165, "step": 2300}, {"loss": 1.5903, "grad_norm": 0.5704626441001892, "learning_rate": 0.0002, "epoch": 3.0861723446893787, "step": 2310}, {"loss": 1.4292, "grad_norm": 0.6242083311080933, "learning_rate": 0.0002, "epoch": 3.0995323981295924, "step": 2320}, {"loss": 1.5092, "grad_norm": 0.5174121260643005, "learning_rate": 0.0002, "epoch": 3.1128924515698064, "step": 2330}, {"loss": 1.5106, "grad_norm": 0.5697633028030396, "learning_rate": 0.0002, "epoch": 3.12625250501002, "step": 2340}, {"loss": 1.5156, "grad_norm": 0.5969541072845459, "learning_rate": 0.0002, "epoch": 3.1396125584502337, "step": 2350}, {"loss": 1.52, "grad_norm": 0.6244304180145264, "learning_rate": 0.0002, "epoch": 3.1529726118904478, "step": 2360}, {"loss": 1.5244, "grad_norm": 0.5561705827713013, "learning_rate": 0.0002, "epoch": 3.1663326653306614, "step": 2370}, {"loss": 1.6169, "grad_norm": 0.5401188135147095, "learning_rate": 0.0002, "epoch": 3.179692718770875, "step": 2380}, {"loss": 1.5387, "grad_norm": 0.6450421810150146, "learning_rate": 0.0002, "epoch": 3.1930527722110886, "step": 2390}, {"loss": 1.4839, "grad_norm": 0.5741903185844421, "learning_rate": 0.0002, "epoch": 3.2064128256513027, "step": 2400}, {"loss": 1.5584, "grad_norm": 0.6337407231330872, "learning_rate": 0.0002, "epoch": 3.2197728790915163, "step": 2410}, {"loss": 1.5025, "grad_norm": 0.6493517160415649, "learning_rate": 0.0002, "epoch": 3.23313293253173, "step": 2420}, {"loss": 1.5168, "grad_norm": 0.6230176091194153, "learning_rate": 0.0002, "epoch": 3.246492985971944, "step": 2430}, {"loss": 1.5408, "grad_norm": 0.680704653263092, "learning_rate": 0.0002, "epoch": 3.2598530394121576, "step": 2440}, {"loss": 1.6005, "grad_norm": 0.5279417037963867, "learning_rate": 0.0002, "epoch": 3.2732130928523713, "step": 2450}, {"loss": 1.5231, "grad_norm": 0.5601515173912048, "learning_rate": 0.0002, "epoch": 3.2865731462925853, "step": 2460}, {"loss": 1.4949, "grad_norm": 0.5591090321540833, "learning_rate": 0.0002, "epoch": 3.299933199732799, "step": 2470}, {"loss": 1.5181, "grad_norm": 0.6596529483795166, "learning_rate": 0.0002, "epoch": 3.3132932531730126, "step": 2480}, {"loss": 1.5259, "grad_norm": 0.6115918755531311, "learning_rate": 0.0002, "epoch": 3.3266533066132267, "step": 2490}, {"loss": 1.5344, "grad_norm": 0.6443548202514648, "learning_rate": 0.0002, "epoch": 3.3400133600534403, "step": 2500}, {"loss": 1.5037, "grad_norm": 0.5504242181777954, "learning_rate": 0.0002, "epoch": 3.353373413493654, "step": 2510}, {"loss": 1.5049, "grad_norm": 0.6104483604431152, "learning_rate": 0.0002, "epoch": 3.3667334669338675, "step": 2520}, {"loss": 1.587, "grad_norm": 0.8387531638145447, "learning_rate": 0.0002, "epoch": 3.3800935203740816, "step": 2530}, {"loss": 1.5227, "grad_norm": 0.6346094012260437, "learning_rate": 0.0002, "epoch": 3.3934535738142952, "step": 2540}, {"loss": 1.4855, "grad_norm": 0.6261265873908997, "learning_rate": 0.0002, "epoch": 3.406813627254509, "step": 2550}, {"loss": 1.5233, "grad_norm": 0.5960372090339661, "learning_rate": 0.0002, "epoch": 3.420173680694723, "step": 2560}, {"loss": 1.5153, "grad_norm": 0.5291280746459961, "learning_rate": 0.0002, "epoch": 3.4335337341349366, "step": 2570}, {"loss": 1.5152, "grad_norm": 0.6133161783218384, "learning_rate": 0.0002, "epoch": 3.44689378757515, "step": 2580}, {"loss": 1.5533, "grad_norm": 0.623573362827301, "learning_rate": 0.0002, "epoch": 3.460253841015364, "step": 2590}, {"loss": 1.4935, "grad_norm": 0.5959834456443787, "learning_rate": 0.0002, "epoch": 3.473613894455578, "step": 2600}, {"loss": 1.5792, "grad_norm": 0.583332359790802, "learning_rate": 0.0002, "epoch": 3.4869739478957915, "step": 2610}, {"loss": 1.5229, "grad_norm": 0.6003559231758118, "learning_rate": 0.0002, "epoch": 3.5003340013360056, "step": 2620}, {"loss": 1.4901, "grad_norm": 0.5832992196083069, "learning_rate": 0.0002, "epoch": 3.513694054776219, "step": 2630}, {"loss": 1.5005, "grad_norm": 0.5942609906196594, "learning_rate": 0.0002, "epoch": 3.527054108216433, "step": 2640}, {"loss": 1.5213, "grad_norm": 0.6087163686752319, "learning_rate": 0.0002, "epoch": 3.5404141616566465, "step": 2650}, {"loss": 1.5826, "grad_norm": 0.631948709487915, "learning_rate": 0.0002, "epoch": 3.5537742150968605, "step": 2660}, {"loss": 1.5844, "grad_norm": 0.6450803279876709, "learning_rate": 0.0002, "epoch": 3.567134268537074, "step": 2670}, {"loss": 1.4981, "grad_norm": 0.6507797837257385, "learning_rate": 0.0002, "epoch": 3.580494321977288, "step": 2680}, {"loss": 1.5826, "grad_norm": 0.5778017044067383, "learning_rate": 0.0002, "epoch": 3.593854375417502, "step": 2690}, {"loss": 1.4688, "grad_norm": 0.6214032173156738, "learning_rate": 0.0002, "epoch": 3.6072144288577155, "step": 2700}, {"loss": 1.5084, "grad_norm": 0.5681133270263672, "learning_rate": 0.0002, "epoch": 3.620574482297929, "step": 2710}, {"loss": 1.471, "grad_norm": 0.6074244976043701, "learning_rate": 0.0002, "epoch": 3.6339345357381427, "step": 2720}, {"loss": 1.5243, "grad_norm": 0.5900560617446899, "learning_rate": 0.0002, "epoch": 3.647294589178357, "step": 2730}, {"loss": 1.5074, "grad_norm": 0.5817505717277527, "learning_rate": 0.0002, "epoch": 3.6606546426185704, "step": 2740}, {"loss": 1.5117, "grad_norm": 0.6095547676086426, "learning_rate": 0.0002, "epoch": 3.6740146960587845, "step": 2750}, {"loss": 1.5117, "grad_norm": 0.612790584564209, "learning_rate": 0.0002, "epoch": 3.687374749498998, "step": 2760}, {"loss": 1.4976, "grad_norm": 0.6574140787124634, "learning_rate": 0.0002, "epoch": 3.7007348029392118, "step": 2770}, {"loss": 1.5306, "grad_norm": 0.5643761157989502, "learning_rate": 0.0002, "epoch": 3.7140948563794254, "step": 2780}, {"loss": 1.5751, "grad_norm": 0.5652621388435364, "learning_rate": 0.0002, "epoch": 3.727454909819639, "step": 2790}, {"loss": 1.5262, "grad_norm": 0.5604206323623657, "learning_rate": 0.0002, "epoch": 3.740814963259853, "step": 2800}, {"loss": 1.5013, "grad_norm": 3.911022663116455, "learning_rate": 0.0002, "epoch": 3.7541750167000667, "step": 2810}, {"loss": 1.5793, "grad_norm": 0.6148333549499512, "learning_rate": 0.0002, "epoch": 3.7675350701402808, "step": 2820}, {"loss": 1.5122, "grad_norm": 0.5605677962303162, "learning_rate": 0.0002, "epoch": 3.7808951235804944, "step": 2830}, {"loss": 1.5659, "grad_norm": 0.6101965308189392, "learning_rate": 0.0002, "epoch": 3.794255177020708, "step": 2840}, {"loss": 1.5618, "grad_norm": 0.5387342572212219, "learning_rate": 0.0002, "epoch": 3.8076152304609217, "step": 2850}, {"loss": 1.5193, "grad_norm": 0.5733087062835693, "learning_rate": 0.0002, "epoch": 3.8209752839011357, "step": 2860}, {"loss": 1.5545, "grad_norm": 0.6538485884666443, "learning_rate": 0.0002, "epoch": 3.8343353373413493, "step": 2870}, {"loss": 1.523, "grad_norm": 0.6247632503509521, "learning_rate": 0.0002, "epoch": 3.847695390781563, "step": 2880}, {"loss": 1.5591, "grad_norm": 0.5745735764503479, "learning_rate": 0.0002, "epoch": 3.861055444221777, "step": 2890}, {"loss": 1.5706, "grad_norm": 0.5942763686180115, "learning_rate": 0.0002, "epoch": 3.8744154976619907, "step": 2900}, {"loss": 1.564, "grad_norm": 0.7086281776428223, "learning_rate": 0.0002, "epoch": 3.8877755511022043, "step": 2910}, {"loss": 1.5526, "grad_norm": 0.8825129866600037, "learning_rate": 0.0002, "epoch": 3.901135604542418, "step": 2920}, {"loss": 1.4519, "grad_norm": 0.6260842680931091, "learning_rate": 0.0002, "epoch": 3.914495657982632, "step": 2930}, {"loss": 1.5433, "grad_norm": 0.6015968322753906, "learning_rate": 0.0002, "epoch": 3.9278557114228456, "step": 2940}, {"loss": 1.4931, "grad_norm": 0.7042809128761292, "learning_rate": 0.0002, "epoch": 3.9412157648630597, "step": 2950}, {"loss": 1.5596, "grad_norm": 0.5860083699226379, "learning_rate": 0.0002, "epoch": 3.9545758183032733, "step": 2960}, {"loss": 1.565, "grad_norm": 0.5939757823944092, "learning_rate": 0.0002, "epoch": 3.967935871743487, "step": 2970}, {"loss": 1.408, "grad_norm": 0.5523964166641235, "learning_rate": 0.0002, "epoch": 3.9812959251837006, "step": 2980}, {"loss": 1.5629, "grad_norm": 0.6380264759063721, "learning_rate": 0.0002, "epoch": 3.9946559786239146, "step": 2990}, {"eval_loss": 1.8875294923782349, "eval_runtime": 38.5837, "eval_samples_per_second": 13.348, "eval_steps_per_second": 1.685, "epoch": 4.0, "step": 2994}, {"loss": 1.4002, "grad_norm": 0.5478564500808716, "learning_rate": 0.0002, "epoch": 4.008016032064128, "step": 3000}, {"loss": 1.436, "grad_norm": 0.9384379982948303, "learning_rate": 0.0002, "epoch": 4.021376085504342, "step": 3010}, {"loss": 1.4127, "grad_norm": 0.7819344401359558, "learning_rate": 0.0002, "epoch": 4.034736138944556, "step": 3020}, {"loss": 1.326, "grad_norm": 0.7737417817115784, "learning_rate": 0.0002, "epoch": 4.04809619238477, "step": 3030}, {"loss": 1.3203, "grad_norm": 0.8893805742263794, "learning_rate": 0.0002, "epoch": 4.061456245824983, "step": 3040}, {"loss": 1.3913, "grad_norm": 0.7759843468666077, "learning_rate": 0.0002, "epoch": 4.074816299265197, "step": 3050}, {"loss": 1.2941, "grad_norm": 0.642654538154602, "learning_rate": 0.0002, "epoch": 4.0881763527054105, "step": 3060}, {"loss": 1.3204, "grad_norm": 0.8515549302101135, "learning_rate": 0.0002, "epoch": 4.101536406145625, "step": 3070}, {"loss": 1.3683, "grad_norm": 0.7033658623695374, "learning_rate": 0.0002, "epoch": 4.114896459585839, "step": 3080}, {"loss": 1.4159, "grad_norm": 0.7063882946968079, "learning_rate": 0.0002, "epoch": 4.128256513026052, "step": 3090}, {"loss": 1.384, "grad_norm": 0.6946853995323181, "learning_rate": 0.0002, "epoch": 4.141616566466266, "step": 3100}, {"loss": 1.3689, "grad_norm": 0.7286741137504578, "learning_rate": 0.0002, "epoch": 4.1549766199064795, "step": 3110}, {"loss": 1.3061, "grad_norm": 0.7894193530082703, "learning_rate": 0.0002, "epoch": 4.168336673346693, "step": 3120}, {"loss": 1.3346, "grad_norm": 0.7005895376205444, "learning_rate": 0.0002, "epoch": 4.181696726786907, "step": 3130}, {"loss": 1.3834, "grad_norm": 0.799567461013794, "learning_rate": 0.0002, "epoch": 4.195056780227121, "step": 3140}, {"loss": 1.3813, "grad_norm": 0.7010157108306885, "learning_rate": 0.0002, "epoch": 4.208416833667335, "step": 3150}, {"loss": 1.3637, "grad_norm": 0.7489650249481201, "learning_rate": 0.0002, "epoch": 4.2217768871075485, "step": 3160}, {"loss": 1.3546, "grad_norm": 0.7908048629760742, "learning_rate": 0.0002, "epoch": 4.235136940547762, "step": 3170}, {"loss": 1.3073, "grad_norm": 0.7002180814743042, "learning_rate": 0.0002, "epoch": 4.248496993987976, "step": 3180}, {"loss": 1.4525, "grad_norm": 0.8339495062828064, "learning_rate": 0.0002, "epoch": 4.261857047428189, "step": 3190}, {"loss": 1.3471, "grad_norm": 0.7884618043899536, "learning_rate": 0.0002, "epoch": 4.275217100868403, "step": 3200}, {"loss": 1.4261, "grad_norm": 0.7964122295379639, "learning_rate": 0.0002, "epoch": 4.2885771543086175, "step": 3210}, {"loss": 1.3506, "grad_norm": 0.838646650314331, "learning_rate": 0.0002, "epoch": 4.301937207748831, "step": 3220}, {"loss": 1.3738, "grad_norm": 0.8063107132911682, "learning_rate": 0.0002, "epoch": 4.315297261189045, "step": 3230}, {"loss": 1.3769, "grad_norm": 0.8147385120391846, "learning_rate": 0.0002, "epoch": 4.328657314629258, "step": 3240}, {"loss": 1.4118, "grad_norm": 0.7636798620223999, "learning_rate": 0.0002, "epoch": 4.342017368069472, "step": 3250}, {"loss": 1.3698, "grad_norm": 0.7530609965324402, "learning_rate": 0.0002, "epoch": 4.355377421509686, "step": 3260}, {"loss": 1.3507, "grad_norm": 0.8853573799133301, "learning_rate": 0.0002, "epoch": 4.3687374749499, "step": 3270}, {"loss": 1.3614, "grad_norm": 0.7180975675582886, "learning_rate": 0.0002, "epoch": 4.382097528390114, "step": 3280}, {"loss": 1.4119, "grad_norm": 0.837150514125824, "learning_rate": 0.0002, "epoch": 4.395457581830327, "step": 3290}, {"loss": 1.461, "grad_norm": 0.8370638489723206, "learning_rate": 0.0002, "epoch": 4.408817635270541, "step": 3300}, {"loss": 1.4478, "grad_norm": 0.7738229036331177, "learning_rate": 0.0002, "epoch": 4.422177688710755, "step": 3310}, {"loss": 1.4195, "grad_norm": 0.7665290832519531, "learning_rate": 0.0002, "epoch": 4.435537742150968, "step": 3320}, {"loss": 1.3308, "grad_norm": 0.7547745704650879, "learning_rate": 0.0002, "epoch": 4.448897795591183, "step": 3330}, {"loss": 1.4165, "grad_norm": 0.7421861290931702, "learning_rate": 0.0002, "epoch": 4.462257849031396, "step": 3340}, {"loss": 1.4244, "grad_norm": 0.8042104244232178, "learning_rate": 0.0002, "epoch": 4.47561790247161, "step": 3350}, {"loss": 1.365, "grad_norm": 0.8111839890480042, "learning_rate": 0.0002, "epoch": 4.488977955911824, "step": 3360}, {"loss": 1.3537, "grad_norm": 0.7998340129852295, "learning_rate": 0.0002, "epoch": 4.502338009352037, "step": 3370}, {"loss": 1.3812, "grad_norm": 0.7668877243995667, "learning_rate": 0.0002, "epoch": 4.515698062792251, "step": 3380}, {"loss": 1.3972, "grad_norm": 0.7986718416213989, "learning_rate": 0.0002, "epoch": 4.529058116232465, "step": 3390}, {"loss": 1.3582, "grad_norm": 0.6806602478027344, "learning_rate": 0.0002, "epoch": 4.542418169672679, "step": 3400}, {"loss": 1.3942, "grad_norm": 0.8788819909095764, "learning_rate": 0.0002, "epoch": 4.555778223112893, "step": 3410}, {"loss": 1.3379, "grad_norm": 0.7499664425849915, "learning_rate": 0.0002, "epoch": 4.569138276553106, "step": 3420}, {"loss": 1.3823, "grad_norm": 0.7967109084129333, "learning_rate": 0.0002, "epoch": 4.58249832999332, "step": 3430}, {"loss": 1.3531, "grad_norm": 0.759639322757721, "learning_rate": 0.0002, "epoch": 4.595858383433534, "step": 3440}, {"loss": 1.3517, "grad_norm": 0.8327916264533997, "learning_rate": 0.0002, "epoch": 4.609218436873747, "step": 3450}, {"loss": 1.4619, "grad_norm": 0.7400892376899719, "learning_rate": 0.0002, "epoch": 4.622578490313961, "step": 3460}, {"loss": 1.3374, "grad_norm": 0.8116602301597595, "learning_rate": 0.0002, "epoch": 4.635938543754175, "step": 3470}, {"loss": 1.4445, "grad_norm": 0.7604362368583679, "learning_rate": 0.0002, "epoch": 4.649298597194389, "step": 3480}, {"loss": 1.3724, "grad_norm": 0.7397996783256531, "learning_rate": 0.0002, "epoch": 4.662658650634603, "step": 3490}, {"loss": 1.4048, "grad_norm": 0.869293749332428, "learning_rate": 0.0002, "epoch": 4.676018704074816, "step": 3500}, {"loss": 1.3873, "grad_norm": 0.6854358315467834, "learning_rate": 0.0002, "epoch": 4.68937875751503, "step": 3510}, {"loss": 1.3413, "grad_norm": 0.8326661586761475, "learning_rate": 0.0002, "epoch": 4.7027388109552435, "step": 3520}, {"loss": 1.3666, "grad_norm": 0.6887506246566772, "learning_rate": 0.0002, "epoch": 4.716098864395457, "step": 3530}, {"loss": 1.4508, "grad_norm": 3.837689161300659, "learning_rate": 0.0002, "epoch": 4.729458917835672, "step": 3540}, {"loss": 1.3775, "grad_norm": 0.6874563694000244, "learning_rate": 0.0002, "epoch": 4.742818971275885, "step": 3550}, {"loss": 1.3643, "grad_norm": 0.8340407609939575, "learning_rate": 0.0002, "epoch": 4.756179024716099, "step": 3560}, {"loss": 1.3556, "grad_norm": 0.7286418676376343, "learning_rate": 0.0002, "epoch": 4.7695390781563125, "step": 3570}, {"loss": 1.4338, "grad_norm": 0.7239373326301575, "learning_rate": 0.0002, "epoch": 4.782899131596526, "step": 3580}, {"loss": 1.4697, "grad_norm": 0.831310510635376, "learning_rate": 0.0002, "epoch": 4.796259185036741, "step": 3590}, {"loss": 1.4146, "grad_norm": 0.767715573310852, "learning_rate": 0.0002, "epoch": 4.809619238476954, "step": 3600}, {"loss": 1.4199, "grad_norm": 0.9013199210166931, "learning_rate": 0.0002, "epoch": 4.822979291917168, "step": 3610}, {"loss": 1.4513, "grad_norm": 0.7543512582778931, "learning_rate": 0.0002, "epoch": 4.8363393453573815, "step": 3620}, {"loss": 1.4218, "grad_norm": 0.7626057267189026, "learning_rate": 0.0002, "epoch": 4.849699398797595, "step": 3630}, {"loss": 1.4102, "grad_norm": 0.847079336643219, "learning_rate": 0.0002, "epoch": 4.863059452237809, "step": 3640}, {"loss": 1.5014, "grad_norm": 0.8273295760154724, "learning_rate": 0.0002, "epoch": 4.876419505678022, "step": 3650}, {"loss": 1.3806, "grad_norm": 0.7675244808197021, "learning_rate": 0.0002, "epoch": 4.889779559118237, "step": 3660}, {"loss": 1.4894, "grad_norm": 0.9560356736183167, "learning_rate": 0.0002, "epoch": 4.9031396125584505, "step": 3670}, {"loss": 1.4044, "grad_norm": 0.7682451605796814, "learning_rate": 0.0002, "epoch": 4.916499665998664, "step": 3680}, {"loss": 1.342, "grad_norm": 0.8113830089569092, "learning_rate": 0.0002, "epoch": 4.929859719438878, "step": 3690}, {"loss": 1.3559, "grad_norm": 0.7642542719841003, "learning_rate": 0.0002, "epoch": 4.943219772879091, "step": 3700}, {"loss": 1.403, "grad_norm": 0.823863685131073, "learning_rate": 0.0002, "epoch": 4.956579826319305, "step": 3710}, {"loss": 1.464, "grad_norm": 0.8287797570228577, "learning_rate": 0.0002, "epoch": 4.969939879759519, "step": 3720}, {"loss": 1.4139, "grad_norm": 0.778170108795166, "learning_rate": 0.0002, "epoch": 4.983299933199733, "step": 3730}, {"loss": 1.4218, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 4.996659986639947, "step": 3740}, {"eval_loss": 1.9638569355010986, "eval_runtime": 38.5725, "eval_samples_per_second": 13.351, "eval_steps_per_second": 1.685, "epoch": 4.999331997327989, "step": 3742}, {"loss": 1.225, "grad_norm": 0.8864085078239441, "learning_rate": 0.0002, "epoch": 5.01002004008016, "step": 3750}, {"loss": 1.2056, "grad_norm": 0.9191637635231018, "learning_rate": 0.0002, "epoch": 5.023380093520374, "step": 3760}, {"loss": 1.2453, "grad_norm": 0.749519407749176, "learning_rate": 0.0002, "epoch": 5.036740146960588, "step": 3770}, {"loss": 1.1959, "grad_norm": 0.7916892170906067, "learning_rate": 0.0002, "epoch": 5.050100200400801, "step": 3780}, {"loss": 1.2279, "grad_norm": 1.0318909883499146, "learning_rate": 0.0002, "epoch": 5.063460253841015, "step": 3790}, {"loss": 1.2585, "grad_norm": 1.028586745262146, "learning_rate": 0.0002, "epoch": 5.0768203072812295, "step": 3800}, {"loss": 1.1769, "grad_norm": 1.0568538904190063, "learning_rate": 0.0002, "epoch": 5.090180360721443, "step": 3810}, {"loss": 1.263, "grad_norm": 0.9780595302581787, "learning_rate": 0.0002, "epoch": 5.103540414161657, "step": 3820}, {"loss": 1.2019, "grad_norm": 1.10311758518219, "learning_rate": 0.0002, "epoch": 5.11690046760187, "step": 3830}, {"loss": 1.2133, "grad_norm": 0.9497154355049133, "learning_rate": 0.0002, "epoch": 5.130260521042084, "step": 3840}, {"loss": 1.1718, "grad_norm": 0.948279857635498, "learning_rate": 0.0002, "epoch": 5.143620574482298, "step": 3850}, {"loss": 1.2108, "grad_norm": 0.9497880339622498, "learning_rate": 0.0002, "epoch": 5.156980627922512, "step": 3860}, {"loss": 1.1876, "grad_norm": 1.3213258981704712, "learning_rate": 0.0002, "epoch": 5.170340681362726, "step": 3870}, {"loss": 1.2327, "grad_norm": 0.9835752248764038, "learning_rate": 0.0002, "epoch": 5.183700734802939, "step": 3880}, {"loss": 1.2256, "grad_norm": 0.8426132202148438, "learning_rate": 0.0002, "epoch": 5.197060788243153, "step": 3890}, {"loss": 1.2066, "grad_norm": 1.0343470573425293, "learning_rate": 0.0002, "epoch": 5.210420841683367, "step": 3900}, {"loss": 1.2596, "grad_norm": 1.0771924257278442, "learning_rate": 0.0002, "epoch": 5.22378089512358, "step": 3910}, {"loss": 1.2371, "grad_norm": 0.8542634844779968, "learning_rate": 0.0002, "epoch": 5.237140948563794, "step": 3920}, {"loss": 1.2264, "grad_norm": 1.1021966934204102, "learning_rate": 0.0002, "epoch": 5.250501002004008, "step": 3930}, {"loss": 1.2097, "grad_norm": 1.170011281967163, "learning_rate": 0.0002, "epoch": 5.263861055444222, "step": 3940}, {"loss": 1.2101, "grad_norm": 0.9787653684616089, "learning_rate": 0.0002, "epoch": 5.277221108884436, "step": 3950}, {"loss": 1.24, "grad_norm": 0.914513885974884, "learning_rate": 0.0002, "epoch": 5.290581162324649, "step": 3960}, {"loss": 1.1641, "grad_norm": 1.0831562280654907, "learning_rate": 0.0002, "epoch": 5.303941215764863, "step": 3970}, {"loss": 1.2609, "grad_norm": 0.9810112714767456, "learning_rate": 0.0002, "epoch": 5.3173012692050765, "step": 3980}, {"loss": 1.1825, "grad_norm": 0.9624066948890686, "learning_rate": 0.0002, "epoch": 5.330661322645291, "step": 3990}, {"loss": 1.273, "grad_norm": 1.2296923398971558, "learning_rate": 0.0002, "epoch": 5.344021376085505, "step": 4000}, {"loss": 1.2452, "grad_norm": 1.011299967765808, "learning_rate": 0.0002, "epoch": 5.357381429525718, "step": 4010}, {"loss": 1.2539, "grad_norm": 0.9144132733345032, "learning_rate": 0.0002, "epoch": 5.370741482965932, "step": 4020}, {"loss": 1.2914, "grad_norm": 1.0573601722717285, "learning_rate": 0.0002, "epoch": 5.3841015364061455, "step": 4030}, {"loss": 1.2295, "grad_norm": 1.1667137145996094, "learning_rate": 0.0002, "epoch": 5.397461589846359, "step": 4040}, {"loss": 1.2541, "grad_norm": 1.072070598602295, "learning_rate": 0.0002, "epoch": 5.410821643286573, "step": 4050}, {"loss": 1.2448, "grad_norm": 1.1005792617797852, "learning_rate": 0.0002, "epoch": 5.424181696726787, "step": 4060}, {"loss": 1.2604, "grad_norm": 1.033581018447876, "learning_rate": 0.0002, "epoch": 5.437541750167001, "step": 4070}, {"loss": 1.2552, "grad_norm": 0.9537439942359924, "learning_rate": 0.0002, "epoch": 5.4509018036072145, "step": 4080}, {"loss": 1.2985, "grad_norm": 1.0502177476882935, "learning_rate": 0.0002, "epoch": 5.464261857047428, "step": 4090}, {"loss": 1.2424, "grad_norm": 0.9098296761512756, "learning_rate": 0.0002, "epoch": 5.477621910487642, "step": 4100}, {"loss": 1.2262, "grad_norm": 0.9551953077316284, "learning_rate": 0.0002, "epoch": 5.490981963927855, "step": 4110}, {"loss": 1.2848, "grad_norm": 0.9169427156448364, "learning_rate": 0.0002, "epoch": 5.504342017368069, "step": 4120}, {"loss": 1.2572, "grad_norm": 0.9430235624313354, "learning_rate": 0.0002, "epoch": 5.517702070808284, "step": 4130}, {"loss": 1.2618, "grad_norm": 0.817259669303894, "learning_rate": 0.0002, "epoch": 5.531062124248497, "step": 4140}, {"loss": 1.3012, "grad_norm": 1.124152660369873, "learning_rate": 0.0002, "epoch": 5.544422177688711, "step": 4150}, {"loss": 1.2508, "grad_norm": 0.9250756502151489, "learning_rate": 0.0002, "epoch": 5.557782231128924, "step": 4160}, {"loss": 1.2492, "grad_norm": 0.9582970142364502, "learning_rate": 0.0002, "epoch": 5.571142284569138, "step": 4170}, {"loss": 1.2804, "grad_norm": 1.0078704357147217, "learning_rate": 0.0002, "epoch": 5.584502338009352, "step": 4180}, {"loss": 1.1961, "grad_norm": 0.9585610032081604, "learning_rate": 0.0002, "epoch": 5.597862391449565, "step": 4190}, {"loss": 1.2522, "grad_norm": 1.0150971412658691, "learning_rate": 0.0002, "epoch": 5.61122244488978, "step": 4200}, {"loss": 1.2275, "grad_norm": 0.9943351149559021, "learning_rate": 0.0002, "epoch": 5.6245824983299935, "step": 4210}, {"loss": 1.2928, "grad_norm": 0.8880936503410339, "learning_rate": 0.0002, "epoch": 5.637942551770207, "step": 4220}, {"loss": 1.2323, "grad_norm": 0.9873887896537781, "learning_rate": 0.0002, "epoch": 5.651302605210421, "step": 4230}, {"loss": 1.3391, "grad_norm": 0.9185152649879456, "learning_rate": 0.0002, "epoch": 5.664662658650634, "step": 4240}, {"loss": 1.2511, "grad_norm": 1.0706779956817627, "learning_rate": 0.0002, "epoch": 5.678022712090849, "step": 4250}, {"loss": 1.2737, "grad_norm": 0.9660224914550781, "learning_rate": 0.0002, "epoch": 5.6913827655310625, "step": 4260}, {"loss": 1.2815, "grad_norm": 0.8685019612312317, "learning_rate": 0.0002, "epoch": 5.704742818971276, "step": 4270}, {"loss": 1.1559, "grad_norm": 1.0390565395355225, "learning_rate": 0.0002, "epoch": 5.71810287241149, "step": 4280}, {"loss": 1.3134, "grad_norm": 0.9290478825569153, "learning_rate": 0.0002, "epoch": 5.731462925851703, "step": 4290}, {"loss": 1.2426, "grad_norm": 1.0361281633377075, "learning_rate": 0.0002, "epoch": 5.744822979291917, "step": 4300}, {"loss": 1.2688, "grad_norm": 0.8804615139961243, "learning_rate": 0.0002, "epoch": 5.758183032732131, "step": 4310}, {"loss": 1.2479, "grad_norm": 1.0051425695419312, "learning_rate": 0.0002, "epoch": 5.771543086172345, "step": 4320}, {"loss": 1.1946, "grad_norm": 1.0051119327545166, "learning_rate": 0.0002, "epoch": 5.784903139612559, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.9961661100387573, "learning_rate": 0.0002, "epoch": 5.798263193052772, "step": 4340}, {"loss": 1.2179, "grad_norm": 1.0229419469833374, "learning_rate": 0.0002, "epoch": 5.811623246492986, "step": 4350}, {"loss": 1.2984, "grad_norm": 1.1129552125930786, "learning_rate": 0.0002, "epoch": 5.8249832999332, "step": 4360}, {"loss": 1.2692, "grad_norm": 1.18964421749115, "learning_rate": 0.0002, "epoch": 5.838343353373413, "step": 4370}, {"loss": 1.1996, "grad_norm": 0.9490230083465576, "learning_rate": 0.0002, "epoch": 5.851703406813627, "step": 4380}, {"loss": 1.3177, "grad_norm": 0.8734540343284607, "learning_rate": 0.0002, "epoch": 5.865063460253841, "step": 4390}, {"loss": 1.3131, "grad_norm": 1.0017802715301514, "learning_rate": 0.0002, "epoch": 5.878423513694055, "step": 4400}, {"loss": 1.2649, "grad_norm": 0.953556478023529, "learning_rate": 0.0002, "epoch": 5.891783567134269, "step": 4410}, {"loss": 1.2684, "grad_norm": 0.8915258646011353, "learning_rate": 0.0002, "epoch": 5.905143620574482, "step": 4420}, {"loss": 1.2843, "grad_norm": 0.9715141654014587, "learning_rate": 0.0002, "epoch": 5.918503674014696, "step": 4430}, {"loss": 1.2769, "grad_norm": 0.9432152509689331, "learning_rate": 0.0002, "epoch": 5.9318637274549095, "step": 4440}, {"loss": 1.233, "grad_norm": 0.9473979473114014, "learning_rate": 0.0002, "epoch": 5.945223780895123, "step": 4450}, {"loss": 1.3209, "grad_norm": 1.104871392250061, "learning_rate": 0.0002, "epoch": 5.958583834335338, "step": 4460}, {"loss": 1.3427, "grad_norm": 1.0308905839920044, "learning_rate": 0.0002, "epoch": 5.971943887775551, "step": 4470}, {"loss": 1.1808, "grad_norm": 0.8895487189292908, "learning_rate": 0.0002, "epoch": 5.985303941215765, "step": 4480}, {"loss": 1.2634, "grad_norm": 1.0148485898971558, "learning_rate": 0.0002, "epoch": 5.9986639946559785, "step": 4490}]} +{"epoch": 6.999331997327989, "step": 5239, "epoch_duration": 812.9917500019073, "total_accumulated_duration": 5777.37841963768, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}, {"eval_loss": 1.834012746810913, "eval_runtime": 38.5659, "eval_samples_per_second": 13.354, "eval_steps_per_second": 1.685, "epoch": 2.9993319973279893, "step": 2245}, {"loss": 1.5582, "grad_norm": 0.428824245929718, "learning_rate": 0.0002, "epoch": 3.006012024048096, "step": 2250}, {"loss": 1.499, "grad_norm": 0.4870174825191498, "learning_rate": 0.0002, "epoch": 3.01937207748831, "step": 2260}, {"loss": 1.4872, "grad_norm": 0.4684266149997711, "learning_rate": 0.0002, "epoch": 3.032732130928524, "step": 2270}, {"loss": 1.5284, "grad_norm": 0.581604540348053, "learning_rate": 0.0002, "epoch": 3.0460921843687374, "step": 2280}, {"loss": 1.4549, "grad_norm": 0.5561677813529968, "learning_rate": 0.0002, "epoch": 3.059452237808951, "step": 2290}, {"loss": 1.4903, "grad_norm": 0.5750220417976379, "learning_rate": 0.0002, "epoch": 3.072812291249165, "step": 2300}, {"loss": 1.5903, "grad_norm": 0.5704626441001892, "learning_rate": 0.0002, "epoch": 3.0861723446893787, "step": 2310}, {"loss": 1.4292, "grad_norm": 0.6242083311080933, "learning_rate": 0.0002, "epoch": 3.0995323981295924, "step": 2320}, {"loss": 1.5092, "grad_norm": 0.5174121260643005, "learning_rate": 0.0002, "epoch": 3.1128924515698064, "step": 2330}, {"loss": 1.5106, "grad_norm": 0.5697633028030396, "learning_rate": 0.0002, "epoch": 3.12625250501002, "step": 2340}, {"loss": 1.5156, "grad_norm": 0.5969541072845459, "learning_rate": 0.0002, "epoch": 3.1396125584502337, "step": 2350}, {"loss": 1.52, "grad_norm": 0.6244304180145264, "learning_rate": 0.0002, "epoch": 3.1529726118904478, "step": 2360}, {"loss": 1.5244, "grad_norm": 0.5561705827713013, "learning_rate": 0.0002, "epoch": 3.1663326653306614, "step": 2370}, {"loss": 1.6169, "grad_norm": 0.5401188135147095, "learning_rate": 0.0002, "epoch": 3.179692718770875, "step": 2380}, {"loss": 1.5387, "grad_norm": 0.6450421810150146, "learning_rate": 0.0002, "epoch": 3.1930527722110886, "step": 2390}, {"loss": 1.4839, "grad_norm": 0.5741903185844421, "learning_rate": 0.0002, "epoch": 3.2064128256513027, "step": 2400}, {"loss": 1.5584, "grad_norm": 0.6337407231330872, "learning_rate": 0.0002, "epoch": 3.2197728790915163, "step": 2410}, {"loss": 1.5025, "grad_norm": 0.6493517160415649, "learning_rate": 0.0002, "epoch": 3.23313293253173, "step": 2420}, {"loss": 1.5168, "grad_norm": 0.6230176091194153, "learning_rate": 0.0002, "epoch": 3.246492985971944, "step": 2430}, {"loss": 1.5408, "grad_norm": 0.680704653263092, "learning_rate": 0.0002, "epoch": 3.2598530394121576, "step": 2440}, {"loss": 1.6005, "grad_norm": 0.5279417037963867, "learning_rate": 0.0002, "epoch": 3.2732130928523713, "step": 2450}, {"loss": 1.5231, "grad_norm": 0.5601515173912048, "learning_rate": 0.0002, "epoch": 3.2865731462925853, "step": 2460}, {"loss": 1.4949, "grad_norm": 0.5591090321540833, "learning_rate": 0.0002, "epoch": 3.299933199732799, "step": 2470}, {"loss": 1.5181, "grad_norm": 0.6596529483795166, "learning_rate": 0.0002, "epoch": 3.3132932531730126, "step": 2480}, {"loss": 1.5259, "grad_norm": 0.6115918755531311, "learning_rate": 0.0002, "epoch": 3.3266533066132267, "step": 2490}, {"loss": 1.5344, "grad_norm": 0.6443548202514648, "learning_rate": 0.0002, "epoch": 3.3400133600534403, "step": 2500}, {"loss": 1.5037, "grad_norm": 0.5504242181777954, "learning_rate": 0.0002, "epoch": 3.353373413493654, "step": 2510}, {"loss": 1.5049, "grad_norm": 0.6104483604431152, "learning_rate": 0.0002, "epoch": 3.3667334669338675, "step": 2520}, {"loss": 1.587, "grad_norm": 0.8387531638145447, "learning_rate": 0.0002, "epoch": 3.3800935203740816, "step": 2530}, {"loss": 1.5227, "grad_norm": 0.6346094012260437, "learning_rate": 0.0002, "epoch": 3.3934535738142952, "step": 2540}, {"loss": 1.4855, "grad_norm": 0.6261265873908997, "learning_rate": 0.0002, "epoch": 3.406813627254509, "step": 2550}, {"loss": 1.5233, "grad_norm": 0.5960372090339661, "learning_rate": 0.0002, "epoch": 3.420173680694723, "step": 2560}, {"loss": 1.5153, "grad_norm": 0.5291280746459961, "learning_rate": 0.0002, "epoch": 3.4335337341349366, "step": 2570}, {"loss": 1.5152, "grad_norm": 0.6133161783218384, "learning_rate": 0.0002, "epoch": 3.44689378757515, "step": 2580}, {"loss": 1.5533, "grad_norm": 0.623573362827301, "learning_rate": 0.0002, "epoch": 3.460253841015364, "step": 2590}, {"loss": 1.4935, "grad_norm": 0.5959834456443787, "learning_rate": 0.0002, "epoch": 3.473613894455578, "step": 2600}, {"loss": 1.5792, "grad_norm": 0.583332359790802, "learning_rate": 0.0002, "epoch": 3.4869739478957915, "step": 2610}, {"loss": 1.5229, "grad_norm": 0.6003559231758118, "learning_rate": 0.0002, "epoch": 3.5003340013360056, "step": 2620}, {"loss": 1.4901, "grad_norm": 0.5832992196083069, "learning_rate": 0.0002, "epoch": 3.513694054776219, "step": 2630}, {"loss": 1.5005, "grad_norm": 0.5942609906196594, "learning_rate": 0.0002, "epoch": 3.527054108216433, "step": 2640}, {"loss": 1.5213, "grad_norm": 0.6087163686752319, "learning_rate": 0.0002, "epoch": 3.5404141616566465, "step": 2650}, {"loss": 1.5826, "grad_norm": 0.631948709487915, "learning_rate": 0.0002, "epoch": 3.5537742150968605, "step": 2660}, {"loss": 1.5844, "grad_norm": 0.6450803279876709, "learning_rate": 0.0002, "epoch": 3.567134268537074, "step": 2670}, {"loss": 1.4981, "grad_norm": 0.6507797837257385, "learning_rate": 0.0002, "epoch": 3.580494321977288, "step": 2680}, {"loss": 1.5826, "grad_norm": 0.5778017044067383, "learning_rate": 0.0002, "epoch": 3.593854375417502, "step": 2690}, {"loss": 1.4688, "grad_norm": 0.6214032173156738, "learning_rate": 0.0002, "epoch": 3.6072144288577155, "step": 2700}, {"loss": 1.5084, "grad_norm": 0.5681133270263672, "learning_rate": 0.0002, "epoch": 3.620574482297929, "step": 2710}, {"loss": 1.471, "grad_norm": 0.6074244976043701, "learning_rate": 0.0002, "epoch": 3.6339345357381427, "step": 2720}, {"loss": 1.5243, "grad_norm": 0.5900560617446899, "learning_rate": 0.0002, "epoch": 3.647294589178357, "step": 2730}, {"loss": 1.5074, "grad_norm": 0.5817505717277527, "learning_rate": 0.0002, "epoch": 3.6606546426185704, "step": 2740}, {"loss": 1.5117, "grad_norm": 0.6095547676086426, "learning_rate": 0.0002, "epoch": 3.6740146960587845, "step": 2750}, {"loss": 1.5117, "grad_norm": 0.612790584564209, "learning_rate": 0.0002, "epoch": 3.687374749498998, "step": 2760}, {"loss": 1.4976, "grad_norm": 0.6574140787124634, "learning_rate": 0.0002, "epoch": 3.7007348029392118, "step": 2770}, {"loss": 1.5306, "grad_norm": 0.5643761157989502, "learning_rate": 0.0002, "epoch": 3.7140948563794254, "step": 2780}, {"loss": 1.5751, "grad_norm": 0.5652621388435364, "learning_rate": 0.0002, "epoch": 3.727454909819639, "step": 2790}, {"loss": 1.5262, "grad_norm": 0.5604206323623657, "learning_rate": 0.0002, "epoch": 3.740814963259853, "step": 2800}, {"loss": 1.5013, "grad_norm": 3.911022663116455, "learning_rate": 0.0002, "epoch": 3.7541750167000667, "step": 2810}, {"loss": 1.5793, "grad_norm": 0.6148333549499512, "learning_rate": 0.0002, "epoch": 3.7675350701402808, "step": 2820}, {"loss": 1.5122, "grad_norm": 0.5605677962303162, "learning_rate": 0.0002, "epoch": 3.7808951235804944, "step": 2830}, {"loss": 1.5659, "grad_norm": 0.6101965308189392, "learning_rate": 0.0002, "epoch": 3.794255177020708, "step": 2840}, {"loss": 1.5618, "grad_norm": 0.5387342572212219, "learning_rate": 0.0002, "epoch": 3.8076152304609217, "step": 2850}, {"loss": 1.5193, "grad_norm": 0.5733087062835693, "learning_rate": 0.0002, "epoch": 3.8209752839011357, "step": 2860}, {"loss": 1.5545, "grad_norm": 0.6538485884666443, "learning_rate": 0.0002, "epoch": 3.8343353373413493, "step": 2870}, {"loss": 1.523, "grad_norm": 0.6247632503509521, "learning_rate": 0.0002, "epoch": 3.847695390781563, "step": 2880}, {"loss": 1.5591, "grad_norm": 0.5745735764503479, "learning_rate": 0.0002, "epoch": 3.861055444221777, "step": 2890}, {"loss": 1.5706, "grad_norm": 0.5942763686180115, "learning_rate": 0.0002, "epoch": 3.8744154976619907, "step": 2900}, {"loss": 1.564, "grad_norm": 0.7086281776428223, "learning_rate": 0.0002, "epoch": 3.8877755511022043, "step": 2910}, {"loss": 1.5526, "grad_norm": 0.8825129866600037, "learning_rate": 0.0002, "epoch": 3.901135604542418, "step": 2920}, {"loss": 1.4519, "grad_norm": 0.6260842680931091, "learning_rate": 0.0002, "epoch": 3.914495657982632, "step": 2930}, {"loss": 1.5433, "grad_norm": 0.6015968322753906, "learning_rate": 0.0002, "epoch": 3.9278557114228456, "step": 2940}, {"loss": 1.4931, "grad_norm": 0.7042809128761292, "learning_rate": 0.0002, "epoch": 3.9412157648630597, "step": 2950}, {"loss": 1.5596, "grad_norm": 0.5860083699226379, "learning_rate": 0.0002, "epoch": 3.9545758183032733, "step": 2960}, {"loss": 1.565, "grad_norm": 0.5939757823944092, "learning_rate": 0.0002, "epoch": 3.967935871743487, "step": 2970}, {"loss": 1.408, "grad_norm": 0.5523964166641235, "learning_rate": 0.0002, "epoch": 3.9812959251837006, "step": 2980}, {"loss": 1.5629, "grad_norm": 0.6380264759063721, "learning_rate": 0.0002, "epoch": 3.9946559786239146, "step": 2990}, {"eval_loss": 1.8875294923782349, "eval_runtime": 38.5837, "eval_samples_per_second": 13.348, "eval_steps_per_second": 1.685, "epoch": 4.0, "step": 2994}, {"loss": 1.4002, "grad_norm": 0.5478564500808716, "learning_rate": 0.0002, "epoch": 4.008016032064128, "step": 3000}, {"loss": 1.436, "grad_norm": 0.9384379982948303, "learning_rate": 0.0002, "epoch": 4.021376085504342, "step": 3010}, {"loss": 1.4127, "grad_norm": 0.7819344401359558, "learning_rate": 0.0002, "epoch": 4.034736138944556, "step": 3020}, {"loss": 1.326, "grad_norm": 0.7737417817115784, "learning_rate": 0.0002, "epoch": 4.04809619238477, "step": 3030}, {"loss": 1.3203, "grad_norm": 0.8893805742263794, "learning_rate": 0.0002, "epoch": 4.061456245824983, "step": 3040}, {"loss": 1.3913, "grad_norm": 0.7759843468666077, "learning_rate": 0.0002, "epoch": 4.074816299265197, "step": 3050}, {"loss": 1.2941, "grad_norm": 0.642654538154602, "learning_rate": 0.0002, "epoch": 4.0881763527054105, "step": 3060}, {"loss": 1.3204, "grad_norm": 0.8515549302101135, "learning_rate": 0.0002, "epoch": 4.101536406145625, "step": 3070}, {"loss": 1.3683, "grad_norm": 0.7033658623695374, "learning_rate": 0.0002, "epoch": 4.114896459585839, "step": 3080}, {"loss": 1.4159, "grad_norm": 0.7063882946968079, "learning_rate": 0.0002, "epoch": 4.128256513026052, "step": 3090}, {"loss": 1.384, "grad_norm": 0.6946853995323181, "learning_rate": 0.0002, "epoch": 4.141616566466266, "step": 3100}, {"loss": 1.3689, "grad_norm": 0.7286741137504578, "learning_rate": 0.0002, "epoch": 4.1549766199064795, "step": 3110}, {"loss": 1.3061, "grad_norm": 0.7894193530082703, "learning_rate": 0.0002, "epoch": 4.168336673346693, "step": 3120}, {"loss": 1.3346, "grad_norm": 0.7005895376205444, "learning_rate": 0.0002, "epoch": 4.181696726786907, "step": 3130}, {"loss": 1.3834, "grad_norm": 0.799567461013794, "learning_rate": 0.0002, "epoch": 4.195056780227121, "step": 3140}, {"loss": 1.3813, "grad_norm": 0.7010157108306885, "learning_rate": 0.0002, "epoch": 4.208416833667335, "step": 3150}, {"loss": 1.3637, "grad_norm": 0.7489650249481201, "learning_rate": 0.0002, "epoch": 4.2217768871075485, "step": 3160}, {"loss": 1.3546, "grad_norm": 0.7908048629760742, "learning_rate": 0.0002, "epoch": 4.235136940547762, "step": 3170}, {"loss": 1.3073, "grad_norm": 0.7002180814743042, "learning_rate": 0.0002, "epoch": 4.248496993987976, "step": 3180}, {"loss": 1.4525, "grad_norm": 0.8339495062828064, "learning_rate": 0.0002, "epoch": 4.261857047428189, "step": 3190}, {"loss": 1.3471, "grad_norm": 0.7884618043899536, "learning_rate": 0.0002, "epoch": 4.275217100868403, "step": 3200}, {"loss": 1.4261, "grad_norm": 0.7964122295379639, "learning_rate": 0.0002, "epoch": 4.2885771543086175, "step": 3210}, {"loss": 1.3506, "grad_norm": 0.838646650314331, "learning_rate": 0.0002, "epoch": 4.301937207748831, "step": 3220}, {"loss": 1.3738, "grad_norm": 0.8063107132911682, "learning_rate": 0.0002, "epoch": 4.315297261189045, "step": 3230}, {"loss": 1.3769, "grad_norm": 0.8147385120391846, "learning_rate": 0.0002, "epoch": 4.328657314629258, "step": 3240}, {"loss": 1.4118, "grad_norm": 0.7636798620223999, "learning_rate": 0.0002, "epoch": 4.342017368069472, "step": 3250}, {"loss": 1.3698, "grad_norm": 0.7530609965324402, "learning_rate": 0.0002, "epoch": 4.355377421509686, "step": 3260}, {"loss": 1.3507, "grad_norm": 0.8853573799133301, "learning_rate": 0.0002, "epoch": 4.3687374749499, "step": 3270}, {"loss": 1.3614, "grad_norm": 0.7180975675582886, "learning_rate": 0.0002, "epoch": 4.382097528390114, "step": 3280}, {"loss": 1.4119, "grad_norm": 0.837150514125824, "learning_rate": 0.0002, "epoch": 4.395457581830327, "step": 3290}, {"loss": 1.461, "grad_norm": 0.8370638489723206, "learning_rate": 0.0002, "epoch": 4.408817635270541, "step": 3300}, {"loss": 1.4478, "grad_norm": 0.7738229036331177, "learning_rate": 0.0002, "epoch": 4.422177688710755, "step": 3310}, {"loss": 1.4195, "grad_norm": 0.7665290832519531, "learning_rate": 0.0002, "epoch": 4.435537742150968, "step": 3320}, {"loss": 1.3308, "grad_norm": 0.7547745704650879, "learning_rate": 0.0002, "epoch": 4.448897795591183, "step": 3330}, {"loss": 1.4165, "grad_norm": 0.7421861290931702, "learning_rate": 0.0002, "epoch": 4.462257849031396, "step": 3340}, {"loss": 1.4244, "grad_norm": 0.8042104244232178, "learning_rate": 0.0002, "epoch": 4.47561790247161, "step": 3350}, {"loss": 1.365, "grad_norm": 0.8111839890480042, "learning_rate": 0.0002, "epoch": 4.488977955911824, "step": 3360}, {"loss": 1.3537, "grad_norm": 0.7998340129852295, "learning_rate": 0.0002, "epoch": 4.502338009352037, "step": 3370}, {"loss": 1.3812, "grad_norm": 0.7668877243995667, "learning_rate": 0.0002, "epoch": 4.515698062792251, "step": 3380}, {"loss": 1.3972, "grad_norm": 0.7986718416213989, "learning_rate": 0.0002, "epoch": 4.529058116232465, "step": 3390}, {"loss": 1.3582, "grad_norm": 0.6806602478027344, "learning_rate": 0.0002, "epoch": 4.542418169672679, "step": 3400}, {"loss": 1.3942, "grad_norm": 0.8788819909095764, "learning_rate": 0.0002, "epoch": 4.555778223112893, "step": 3410}, {"loss": 1.3379, "grad_norm": 0.7499664425849915, "learning_rate": 0.0002, "epoch": 4.569138276553106, "step": 3420}, {"loss": 1.3823, "grad_norm": 0.7967109084129333, "learning_rate": 0.0002, "epoch": 4.58249832999332, "step": 3430}, {"loss": 1.3531, "grad_norm": 0.759639322757721, "learning_rate": 0.0002, "epoch": 4.595858383433534, "step": 3440}, {"loss": 1.3517, "grad_norm": 0.8327916264533997, "learning_rate": 0.0002, "epoch": 4.609218436873747, "step": 3450}, {"loss": 1.4619, "grad_norm": 0.7400892376899719, "learning_rate": 0.0002, "epoch": 4.622578490313961, "step": 3460}, {"loss": 1.3374, "grad_norm": 0.8116602301597595, "learning_rate": 0.0002, "epoch": 4.635938543754175, "step": 3470}, {"loss": 1.4445, "grad_norm": 0.7604362368583679, "learning_rate": 0.0002, "epoch": 4.649298597194389, "step": 3480}, {"loss": 1.3724, "grad_norm": 0.7397996783256531, "learning_rate": 0.0002, "epoch": 4.662658650634603, "step": 3490}, {"loss": 1.4048, "grad_norm": 0.869293749332428, "learning_rate": 0.0002, "epoch": 4.676018704074816, "step": 3500}, {"loss": 1.3873, "grad_norm": 0.6854358315467834, "learning_rate": 0.0002, "epoch": 4.68937875751503, "step": 3510}, {"loss": 1.3413, "grad_norm": 0.8326661586761475, "learning_rate": 0.0002, "epoch": 4.7027388109552435, "step": 3520}, {"loss": 1.3666, "grad_norm": 0.6887506246566772, "learning_rate": 0.0002, "epoch": 4.716098864395457, "step": 3530}, {"loss": 1.4508, "grad_norm": 3.837689161300659, "learning_rate": 0.0002, "epoch": 4.729458917835672, "step": 3540}, {"loss": 1.3775, "grad_norm": 0.6874563694000244, "learning_rate": 0.0002, "epoch": 4.742818971275885, "step": 3550}, {"loss": 1.3643, "grad_norm": 0.8340407609939575, "learning_rate": 0.0002, "epoch": 4.756179024716099, "step": 3560}, {"loss": 1.3556, "grad_norm": 0.7286418676376343, "learning_rate": 0.0002, "epoch": 4.7695390781563125, "step": 3570}, {"loss": 1.4338, "grad_norm": 0.7239373326301575, "learning_rate": 0.0002, "epoch": 4.782899131596526, "step": 3580}, {"loss": 1.4697, "grad_norm": 0.831310510635376, "learning_rate": 0.0002, "epoch": 4.796259185036741, "step": 3590}, {"loss": 1.4146, "grad_norm": 0.767715573310852, "learning_rate": 0.0002, "epoch": 4.809619238476954, "step": 3600}, {"loss": 1.4199, "grad_norm": 0.9013199210166931, "learning_rate": 0.0002, "epoch": 4.822979291917168, "step": 3610}, {"loss": 1.4513, "grad_norm": 0.7543512582778931, "learning_rate": 0.0002, "epoch": 4.8363393453573815, "step": 3620}, {"loss": 1.4218, "grad_norm": 0.7626057267189026, "learning_rate": 0.0002, "epoch": 4.849699398797595, "step": 3630}, {"loss": 1.4102, "grad_norm": 0.847079336643219, "learning_rate": 0.0002, "epoch": 4.863059452237809, "step": 3640}, {"loss": 1.5014, "grad_norm": 0.8273295760154724, "learning_rate": 0.0002, "epoch": 4.876419505678022, "step": 3650}, {"loss": 1.3806, "grad_norm": 0.7675244808197021, "learning_rate": 0.0002, "epoch": 4.889779559118237, "step": 3660}, {"loss": 1.4894, "grad_norm": 0.9560356736183167, "learning_rate": 0.0002, "epoch": 4.9031396125584505, "step": 3670}, {"loss": 1.4044, "grad_norm": 0.7682451605796814, "learning_rate": 0.0002, "epoch": 4.916499665998664, "step": 3680}, {"loss": 1.342, "grad_norm": 0.8113830089569092, "learning_rate": 0.0002, "epoch": 4.929859719438878, "step": 3690}, {"loss": 1.3559, "grad_norm": 0.7642542719841003, "learning_rate": 0.0002, "epoch": 4.943219772879091, "step": 3700}, {"loss": 1.403, "grad_norm": 0.823863685131073, "learning_rate": 0.0002, "epoch": 4.956579826319305, "step": 3710}, {"loss": 1.464, "grad_norm": 0.8287797570228577, "learning_rate": 0.0002, "epoch": 4.969939879759519, "step": 3720}, {"loss": 1.4139, "grad_norm": 0.778170108795166, "learning_rate": 0.0002, "epoch": 4.983299933199733, "step": 3730}, {"loss": 1.4218, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 4.996659986639947, "step": 3740}, {"eval_loss": 1.9638569355010986, "eval_runtime": 38.5725, "eval_samples_per_second": 13.351, "eval_steps_per_second": 1.685, "epoch": 4.999331997327989, "step": 3742}, {"loss": 1.225, "grad_norm": 0.8864085078239441, "learning_rate": 0.0002, "epoch": 5.01002004008016, "step": 3750}, {"loss": 1.2056, "grad_norm": 0.9191637635231018, "learning_rate": 0.0002, "epoch": 5.023380093520374, "step": 3760}, {"loss": 1.2453, "grad_norm": 0.749519407749176, "learning_rate": 0.0002, "epoch": 5.036740146960588, "step": 3770}, {"loss": 1.1959, "grad_norm": 0.7916892170906067, "learning_rate": 0.0002, "epoch": 5.050100200400801, "step": 3780}, {"loss": 1.2279, "grad_norm": 1.0318909883499146, "learning_rate": 0.0002, "epoch": 5.063460253841015, "step": 3790}, {"loss": 1.2585, "grad_norm": 1.028586745262146, "learning_rate": 0.0002, "epoch": 5.0768203072812295, "step": 3800}, {"loss": 1.1769, "grad_norm": 1.0568538904190063, "learning_rate": 0.0002, "epoch": 5.090180360721443, "step": 3810}, {"loss": 1.263, "grad_norm": 0.9780595302581787, "learning_rate": 0.0002, "epoch": 5.103540414161657, "step": 3820}, {"loss": 1.2019, "grad_norm": 1.10311758518219, "learning_rate": 0.0002, "epoch": 5.11690046760187, "step": 3830}, {"loss": 1.2133, "grad_norm": 0.9497154355049133, "learning_rate": 0.0002, "epoch": 5.130260521042084, "step": 3840}, {"loss": 1.1718, "grad_norm": 0.948279857635498, "learning_rate": 0.0002, "epoch": 5.143620574482298, "step": 3850}, {"loss": 1.2108, "grad_norm": 0.9497880339622498, "learning_rate": 0.0002, "epoch": 5.156980627922512, "step": 3860}, {"loss": 1.1876, "grad_norm": 1.3213258981704712, "learning_rate": 0.0002, "epoch": 5.170340681362726, "step": 3870}, {"loss": 1.2327, "grad_norm": 0.9835752248764038, "learning_rate": 0.0002, "epoch": 5.183700734802939, "step": 3880}, {"loss": 1.2256, "grad_norm": 0.8426132202148438, "learning_rate": 0.0002, "epoch": 5.197060788243153, "step": 3890}, {"loss": 1.2066, "grad_norm": 1.0343470573425293, "learning_rate": 0.0002, "epoch": 5.210420841683367, "step": 3900}, {"loss": 1.2596, "grad_norm": 1.0771924257278442, "learning_rate": 0.0002, "epoch": 5.22378089512358, "step": 3910}, {"loss": 1.2371, "grad_norm": 0.8542634844779968, "learning_rate": 0.0002, "epoch": 5.237140948563794, "step": 3920}, {"loss": 1.2264, "grad_norm": 1.1021966934204102, "learning_rate": 0.0002, "epoch": 5.250501002004008, "step": 3930}, {"loss": 1.2097, "grad_norm": 1.170011281967163, "learning_rate": 0.0002, "epoch": 5.263861055444222, "step": 3940}, {"loss": 1.2101, "grad_norm": 0.9787653684616089, "learning_rate": 0.0002, "epoch": 5.277221108884436, "step": 3950}, {"loss": 1.24, "grad_norm": 0.914513885974884, "learning_rate": 0.0002, "epoch": 5.290581162324649, "step": 3960}, {"loss": 1.1641, "grad_norm": 1.0831562280654907, "learning_rate": 0.0002, "epoch": 5.303941215764863, "step": 3970}, {"loss": 1.2609, "grad_norm": 0.9810112714767456, "learning_rate": 0.0002, "epoch": 5.3173012692050765, "step": 3980}, {"loss": 1.1825, "grad_norm": 0.9624066948890686, "learning_rate": 0.0002, "epoch": 5.330661322645291, "step": 3990}, {"loss": 1.273, "grad_norm": 1.2296923398971558, "learning_rate": 0.0002, "epoch": 5.344021376085505, "step": 4000}, {"loss": 1.2452, "grad_norm": 1.011299967765808, "learning_rate": 0.0002, "epoch": 5.357381429525718, "step": 4010}, {"loss": 1.2539, "grad_norm": 0.9144132733345032, "learning_rate": 0.0002, "epoch": 5.370741482965932, "step": 4020}, {"loss": 1.2914, "grad_norm": 1.0573601722717285, "learning_rate": 0.0002, "epoch": 5.3841015364061455, "step": 4030}, {"loss": 1.2295, "grad_norm": 1.1667137145996094, "learning_rate": 0.0002, "epoch": 5.397461589846359, "step": 4040}, {"loss": 1.2541, "grad_norm": 1.072070598602295, "learning_rate": 0.0002, "epoch": 5.410821643286573, "step": 4050}, {"loss": 1.2448, "grad_norm": 1.1005792617797852, "learning_rate": 0.0002, "epoch": 5.424181696726787, "step": 4060}, {"loss": 1.2604, "grad_norm": 1.033581018447876, "learning_rate": 0.0002, "epoch": 5.437541750167001, "step": 4070}, {"loss": 1.2552, "grad_norm": 0.9537439942359924, "learning_rate": 0.0002, "epoch": 5.4509018036072145, "step": 4080}, {"loss": 1.2985, "grad_norm": 1.0502177476882935, "learning_rate": 0.0002, "epoch": 5.464261857047428, "step": 4090}, {"loss": 1.2424, "grad_norm": 0.9098296761512756, "learning_rate": 0.0002, "epoch": 5.477621910487642, "step": 4100}, {"loss": 1.2262, "grad_norm": 0.9551953077316284, "learning_rate": 0.0002, "epoch": 5.490981963927855, "step": 4110}, {"loss": 1.2848, "grad_norm": 0.9169427156448364, "learning_rate": 0.0002, "epoch": 5.504342017368069, "step": 4120}, {"loss": 1.2572, "grad_norm": 0.9430235624313354, "learning_rate": 0.0002, "epoch": 5.517702070808284, "step": 4130}, {"loss": 1.2618, "grad_norm": 0.817259669303894, "learning_rate": 0.0002, "epoch": 5.531062124248497, "step": 4140}, {"loss": 1.3012, "grad_norm": 1.124152660369873, "learning_rate": 0.0002, "epoch": 5.544422177688711, "step": 4150}, {"loss": 1.2508, "grad_norm": 0.9250756502151489, "learning_rate": 0.0002, "epoch": 5.557782231128924, "step": 4160}, {"loss": 1.2492, "grad_norm": 0.9582970142364502, "learning_rate": 0.0002, "epoch": 5.571142284569138, "step": 4170}, {"loss": 1.2804, "grad_norm": 1.0078704357147217, "learning_rate": 0.0002, "epoch": 5.584502338009352, "step": 4180}, {"loss": 1.1961, "grad_norm": 0.9585610032081604, "learning_rate": 0.0002, "epoch": 5.597862391449565, "step": 4190}, {"loss": 1.2522, "grad_norm": 1.0150971412658691, "learning_rate": 0.0002, "epoch": 5.61122244488978, "step": 4200}, {"loss": 1.2275, "grad_norm": 0.9943351149559021, "learning_rate": 0.0002, "epoch": 5.6245824983299935, "step": 4210}, {"loss": 1.2928, "grad_norm": 0.8880936503410339, "learning_rate": 0.0002, "epoch": 5.637942551770207, "step": 4220}, {"loss": 1.2323, "grad_norm": 0.9873887896537781, "learning_rate": 0.0002, "epoch": 5.651302605210421, "step": 4230}, {"loss": 1.3391, "grad_norm": 0.9185152649879456, "learning_rate": 0.0002, "epoch": 5.664662658650634, "step": 4240}, {"loss": 1.2511, "grad_norm": 1.0706779956817627, "learning_rate": 0.0002, "epoch": 5.678022712090849, "step": 4250}, {"loss": 1.2737, "grad_norm": 0.9660224914550781, "learning_rate": 0.0002, "epoch": 5.6913827655310625, "step": 4260}, {"loss": 1.2815, "grad_norm": 0.8685019612312317, "learning_rate": 0.0002, "epoch": 5.704742818971276, "step": 4270}, {"loss": 1.1559, "grad_norm": 1.0390565395355225, "learning_rate": 0.0002, "epoch": 5.71810287241149, "step": 4280}, {"loss": 1.3134, "grad_norm": 0.9290478825569153, "learning_rate": 0.0002, "epoch": 5.731462925851703, "step": 4290}, {"loss": 1.2426, "grad_norm": 1.0361281633377075, "learning_rate": 0.0002, "epoch": 5.744822979291917, "step": 4300}, {"loss": 1.2688, "grad_norm": 0.8804615139961243, "learning_rate": 0.0002, "epoch": 5.758183032732131, "step": 4310}, {"loss": 1.2479, "grad_norm": 1.0051425695419312, "learning_rate": 0.0002, "epoch": 5.771543086172345, "step": 4320}, {"loss": 1.1946, "grad_norm": 1.0051119327545166, "learning_rate": 0.0002, "epoch": 5.784903139612559, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.9961661100387573, "learning_rate": 0.0002, "epoch": 5.798263193052772, "step": 4340}, {"loss": 1.2179, "grad_norm": 1.0229419469833374, "learning_rate": 0.0002, "epoch": 5.811623246492986, "step": 4350}, {"loss": 1.2984, "grad_norm": 1.1129552125930786, "learning_rate": 0.0002, "epoch": 5.8249832999332, "step": 4360}, {"loss": 1.2692, "grad_norm": 1.18964421749115, "learning_rate": 0.0002, "epoch": 5.838343353373413, "step": 4370}, {"loss": 1.1996, "grad_norm": 0.9490230083465576, "learning_rate": 0.0002, "epoch": 5.851703406813627, "step": 4380}, {"loss": 1.3177, "grad_norm": 0.8734540343284607, "learning_rate": 0.0002, "epoch": 5.865063460253841, "step": 4390}, {"loss": 1.3131, "grad_norm": 1.0017802715301514, "learning_rate": 0.0002, "epoch": 5.878423513694055, "step": 4400}, {"loss": 1.2649, "grad_norm": 0.953556478023529, "learning_rate": 0.0002, "epoch": 5.891783567134269, "step": 4410}, {"loss": 1.2684, "grad_norm": 0.8915258646011353, "learning_rate": 0.0002, "epoch": 5.905143620574482, "step": 4420}, {"loss": 1.2843, "grad_norm": 0.9715141654014587, "learning_rate": 0.0002, "epoch": 5.918503674014696, "step": 4430}, {"loss": 1.2769, "grad_norm": 0.9432152509689331, "learning_rate": 0.0002, "epoch": 5.9318637274549095, "step": 4440}, {"loss": 1.233, "grad_norm": 0.9473979473114014, "learning_rate": 0.0002, "epoch": 5.945223780895123, "step": 4450}, {"loss": 1.3209, "grad_norm": 1.104871392250061, "learning_rate": 0.0002, "epoch": 5.958583834335338, "step": 4460}, {"loss": 1.3427, "grad_norm": 1.0308905839920044, "learning_rate": 0.0002, "epoch": 5.971943887775551, "step": 4470}, {"loss": 1.1808, "grad_norm": 0.8895487189292908, "learning_rate": 0.0002, "epoch": 5.985303941215765, "step": 4480}, {"loss": 1.2634, "grad_norm": 1.0148485898971558, "learning_rate": 0.0002, "epoch": 5.9986639946559785, "step": 4490}, {"eval_loss": 2.0830726623535156, "eval_runtime": 38.5442, "eval_samples_per_second": 13.361, "eval_steps_per_second": 1.686, "epoch": 6.0, "step": 4491}, {"loss": 1.1106, "grad_norm": 1.1640599966049194, "learning_rate": 0.0002, "epoch": 6.012024048096192, "step": 4500}, {"loss": 1.0436, "grad_norm": 1.213204264640808, "learning_rate": 0.0002, "epoch": 6.025384101536406, "step": 4510}, {"loss": 1.0606, "grad_norm": 1.1694388389587402, "learning_rate": 0.0002, "epoch": 6.03874415497662, "step": 4520}, {"loss": 1.0274, "grad_norm": 1.1044062376022339, "learning_rate": 0.0002, "epoch": 6.052104208416834, "step": 4530}, {"loss": 1.0552, "grad_norm": 1.0701100826263428, "learning_rate": 0.0002, "epoch": 6.065464261857048, "step": 4540}, {"loss": 1.0018, "grad_norm": 1.360065221786499, "learning_rate": 0.0002, "epoch": 6.078824315297261, "step": 4550}, {"loss": 1.0189, "grad_norm": 1.0648503303527832, "learning_rate": 0.0002, "epoch": 6.092184368737475, "step": 4560}, {"loss": 1.008, "grad_norm": 1.066245198249817, "learning_rate": 0.0002, "epoch": 6.1055444221776884, "step": 4570}, {"loss": 1.099, "grad_norm": 1.1483700275421143, "learning_rate": 0.0002, "epoch": 6.118904475617902, "step": 4580}, {"loss": 1.1043, "grad_norm": 1.334275722503662, "learning_rate": 0.0002, "epoch": 6.132264529058117, "step": 4590}, {"loss": 1.0783, "grad_norm": 1.2141029834747314, "learning_rate": 0.0002, "epoch": 6.14562458249833, "step": 4600}, {"loss": 1.0891, "grad_norm": 1.2284387350082397, "learning_rate": 0.0002, "epoch": 6.158984635938544, "step": 4610}, {"loss": 1.122, "grad_norm": 1.2326734066009521, "learning_rate": 0.0002, "epoch": 6.1723446893787575, "step": 4620}, {"loss": 1.1069, "grad_norm": 1.245004653930664, "learning_rate": 0.0002, "epoch": 6.185704742818971, "step": 4630}, {"loss": 1.0821, "grad_norm": 0.9685266017913818, "learning_rate": 0.0002, "epoch": 6.199064796259185, "step": 4640}, {"loss": 1.0659, "grad_norm": 1.141634464263916, "learning_rate": 0.0002, "epoch": 6.212424849699399, "step": 4650}, {"loss": 1.0971, "grad_norm": 1.4279003143310547, "learning_rate": 0.0002, "epoch": 6.225784903139613, "step": 4660}, {"loss": 1.093, "grad_norm": 1.186668872833252, "learning_rate": 0.0002, "epoch": 6.2391449565798265, "step": 4670}, {"loss": 1.0522, "grad_norm": 1.2656606435775757, "learning_rate": 0.0002, "epoch": 6.25250501002004, "step": 4680}, {"loss": 1.1138, "grad_norm": 1.1122987270355225, "learning_rate": 0.0002, "epoch": 6.265865063460254, "step": 4690}, {"loss": 1.0906, "grad_norm": 1.190050482749939, "learning_rate": 0.0002, "epoch": 6.279225116900467, "step": 4700}, {"loss": 1.1095, "grad_norm": 1.3683340549468994, "learning_rate": 0.0002, "epoch": 6.292585170340681, "step": 4710}, {"loss": 1.0663, "grad_norm": 1.1787203550338745, "learning_rate": 0.0002, "epoch": 6.3059452237808955, "step": 4720}, {"loss": 1.0856, "grad_norm": 1.3502576351165771, "learning_rate": 0.0002, "epoch": 6.319305277221109, "step": 4730}, {"loss": 1.0999, "grad_norm": 1.1958597898483276, "learning_rate": 0.0002, "epoch": 6.332665330661323, "step": 4740}, {"loss": 1.021, "grad_norm": 1.0918327569961548, "learning_rate": 0.0002, "epoch": 6.346025384101536, "step": 4750}, {"loss": 1.0484, "grad_norm": 1.2624558210372925, "learning_rate": 0.0002, "epoch": 6.35938543754175, "step": 4760}, {"loss": 1.0785, "grad_norm": 1.1390577554702759, "learning_rate": 0.0002, "epoch": 6.372745490981964, "step": 4770}, {"loss": 1.0979, "grad_norm": 1.041666865348816, "learning_rate": 0.0002, "epoch": 6.386105544422177, "step": 4780}, {"loss": 1.1026, "grad_norm": 1.4209141731262207, "learning_rate": 0.0002, "epoch": 6.399465597862392, "step": 4790}, {"loss": 1.119, "grad_norm": 1.1001079082489014, "learning_rate": 0.0002, "epoch": 6.412825651302605, "step": 4800}, {"loss": 1.1082, "grad_norm": 1.3324936628341675, "learning_rate": 0.0002, "epoch": 6.426185704742819, "step": 4810}, {"loss": 1.0785, "grad_norm": 1.1270194053649902, "learning_rate": 0.0002, "epoch": 6.439545758183033, "step": 4820}, {"loss": 1.1338, "grad_norm": 1.1961387395858765, "learning_rate": 0.0002, "epoch": 6.452905811623246, "step": 4830}, {"loss": 1.0967, "grad_norm": 1.255366563796997, "learning_rate": 0.0002, "epoch": 6.46626586506346, "step": 4840}, {"loss": 1.1226, "grad_norm": 1.343855381011963, "learning_rate": 0.0002, "epoch": 6.479625918503674, "step": 4850}, {"loss": 1.1118, "grad_norm": 1.3216257095336914, "learning_rate": 0.0002, "epoch": 6.492985971943888, "step": 4860}, {"loss": 1.1664, "grad_norm": 1.5244755744934082, "learning_rate": 0.0002, "epoch": 6.506346025384102, "step": 4870}, {"loss": 1.0403, "grad_norm": 1.1585701704025269, "learning_rate": 0.0002, "epoch": 6.519706078824315, "step": 4880}, {"loss": 1.1344, "grad_norm": 1.0301100015640259, "learning_rate": 0.0002, "epoch": 6.533066132264529, "step": 4890}, {"loss": 1.1304, "grad_norm": 1.5772714614868164, "learning_rate": 0.0002, "epoch": 6.5464261857047426, "step": 4900}, {"loss": 1.0953, "grad_norm": 1.2015259265899658, "learning_rate": 0.0002, "epoch": 6.559786239144957, "step": 4910}, {"loss": 1.1283, "grad_norm": 1.4365423917770386, "learning_rate": 0.0002, "epoch": 6.573146292585171, "step": 4920}, {"loss": 1.0717, "grad_norm": 1.2534470558166504, "learning_rate": 0.0002, "epoch": 6.586506346025384, "step": 4930}, {"loss": 1.1099, "grad_norm": 1.216138482093811, "learning_rate": 0.0002, "epoch": 6.599866399465598, "step": 4940}, {"loss": 1.1744, "grad_norm": 1.144316554069519, "learning_rate": 0.0002, "epoch": 6.613226452905812, "step": 4950}, {"loss": 1.0548, "grad_norm": 1.1127740144729614, "learning_rate": 0.0002, "epoch": 6.626586506346025, "step": 4960}, {"loss": 1.0686, "grad_norm": 1.1925606727600098, "learning_rate": 0.0002, "epoch": 6.639946559786239, "step": 4970}, {"loss": 1.0684, "grad_norm": 1.2500451803207397, "learning_rate": 0.0002, "epoch": 6.653306613226453, "step": 4980}, {"loss": 1.1642, "grad_norm": 1.16154944896698, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 4990}, {"loss": 1.1003, "grad_norm": 1.1921433210372925, "learning_rate": 0.0002, "epoch": 6.680026720106881, "step": 5000}, {"loss": 1.1278, "grad_norm": 1.1561170816421509, "learning_rate": 0.0002, "epoch": 6.693386773547094, "step": 5010}, {"loss": 1.0766, "grad_norm": 1.2988990545272827, "learning_rate": 0.0002, "epoch": 6.706746826987308, "step": 5020}, {"loss": 1.1131, "grad_norm": 0.9620341062545776, "learning_rate": 0.0002, "epoch": 6.7201068804275215, "step": 5030}, {"loss": 1.109, "grad_norm": 1.084228515625, "learning_rate": 0.0002, "epoch": 6.733466933867735, "step": 5040}, {"loss": 1.1474, "grad_norm": 1.1119431257247925, "learning_rate": 0.0002, "epoch": 6.74682698730795, "step": 5050}, {"loss": 1.179, "grad_norm": 1.1365628242492676, "learning_rate": 0.0002, "epoch": 6.760187040748163, "step": 5060}, {"loss": 1.0988, "grad_norm": 1.0989075899124146, "learning_rate": 0.0002, "epoch": 6.773547094188377, "step": 5070}, {"loss": 1.127, "grad_norm": 1.040647268295288, "learning_rate": 0.0002, "epoch": 6.7869071476285905, "step": 5080}, {"loss": 1.0793, "grad_norm": 1.1083087921142578, "learning_rate": 0.0002, "epoch": 6.800267201068804, "step": 5090}, {"loss": 1.1081, "grad_norm": 1.3434782028198242, "learning_rate": 0.0002, "epoch": 6.813627254509018, "step": 5100}, {"loss": 1.1243, "grad_norm": 1.2493442296981812, "learning_rate": 0.0002, "epoch": 6.826987307949231, "step": 5110}, {"loss": 1.0633, "grad_norm": 1.0672307014465332, "learning_rate": 0.0002, "epoch": 6.840347361389446, "step": 5120}, {"loss": 1.1344, "grad_norm": 1.068350911140442, "learning_rate": 0.0002, "epoch": 6.8537074148296595, "step": 5130}, {"loss": 1.1942, "grad_norm": 1.2880923748016357, "learning_rate": 0.0002, "epoch": 6.867067468269873, "step": 5140}, {"loss": 1.1445, "grad_norm": 1.0895041227340698, "learning_rate": 0.0002, "epoch": 6.880427521710087, "step": 5150}, {"loss": 1.1535, "grad_norm": 1.2383300065994263, "learning_rate": 0.0002, "epoch": 6.8937875751503, "step": 5160}, {"loss": 1.1653, "grad_norm": 1.5274227857589722, "learning_rate": 0.0002, "epoch": 6.907147628590514, "step": 5170}, {"loss": 1.1112, "grad_norm": 1.1453371047973633, "learning_rate": 0.0002, "epoch": 6.920507682030728, "step": 5180}, {"loss": 1.1808, "grad_norm": 1.171336054801941, "learning_rate": 0.0002, "epoch": 6.933867735470942, "step": 5190}, {"loss": 1.1142, "grad_norm": 1.1946955919265747, "learning_rate": 0.0002, "epoch": 6.947227788911156, "step": 5200}, {"loss": 1.1386, "grad_norm": 1.2290117740631104, "learning_rate": 0.0002, "epoch": 6.960587842351369, "step": 5210}, {"loss": 1.1573, "grad_norm": 1.3134533166885376, "learning_rate": 0.0002, "epoch": 6.973947895791583, "step": 5220}, {"loss": 1.1687, "grad_norm": 1.1500377655029297, "learning_rate": 0.0002, "epoch": 6.987307949231797, "step": 5230}]} +{"epoch": 7.994655978623914, "step": 5984, "epoch_duration": 819.5618627071381, "total_accumulated_duration": 6596.940282344818, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-0/checkpoint-1497", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6569, "grad_norm": 0.5097216367721558, "learning_rate": 0.0002, "epoch": 0.013360053440213761, "step": 10}, {"loss": 2.2557, "grad_norm": 0.5924790501594543, "learning_rate": 0.0002, "epoch": 0.026720106880427523, "step": 20}, {"loss": 2.0626, "grad_norm": 0.5158102512359619, "learning_rate": 0.0002, "epoch": 0.04008016032064128, "step": 30}, {"loss": 1.9452, "grad_norm": 0.5033753514289856, "learning_rate": 0.0002, "epoch": 0.053440213760855046, "step": 40}, {"loss": 1.9128, "grad_norm": 0.5390949845314026, "learning_rate": 0.0002, "epoch": 0.06680026720106881, "step": 50}, {"loss": 1.937, "grad_norm": 0.6376217007637024, "learning_rate": 0.0002, "epoch": 0.08016032064128256, "step": 60}, {"loss": 1.929, "grad_norm": 0.4202035069465637, "learning_rate": 0.0002, "epoch": 0.09352037408149633, "step": 70}, {"loss": 1.811, "grad_norm": 0.4269474744796753, "learning_rate": 0.0002, "epoch": 0.10688042752171009, "step": 80}, {"loss": 1.8303, "grad_norm": 0.4306574761867523, "learning_rate": 0.0002, "epoch": 0.12024048096192384, "step": 90}, {"loss": 1.8469, "grad_norm": 0.5297011137008667, "learning_rate": 0.0002, "epoch": 0.13360053440213762, "step": 100}, {"loss": 1.864, "grad_norm": 1.2313778400421143, "learning_rate": 0.0002, "epoch": 0.14696058784235136, "step": 110}, {"loss": 1.8531, "grad_norm": 0.5351294279098511, "learning_rate": 0.0002, "epoch": 0.16032064128256512, "step": 120}, {"loss": 1.9232, "grad_norm": 0.4848092496395111, "learning_rate": 0.0002, "epoch": 0.1736806947227789, "step": 130}, {"loss": 1.8633, "grad_norm": 0.4339500665664673, "learning_rate": 0.0002, "epoch": 0.18704074816299265, "step": 140}, {"loss": 1.816, "grad_norm": 0.46877285838127136, "learning_rate": 0.0002, "epoch": 0.20040080160320642, "step": 150}, {"loss": 1.8033, "grad_norm": 0.5600412487983704, "learning_rate": 0.0002, "epoch": 0.21376085504342018, "step": 160}, {"loss": 1.8162, "grad_norm": 0.3733620345592499, "learning_rate": 0.0002, "epoch": 0.22712090848363392, "step": 170}, {"loss": 1.8564, "grad_norm": 0.5116042494773865, "learning_rate": 0.0002, "epoch": 0.24048096192384769, "step": 180}, {"loss": 1.915, "grad_norm": 0.4071602523326874, "learning_rate": 0.0002, "epoch": 0.25384101536406145, "step": 190}, {"loss": 1.7984, "grad_norm": 0.44189608097076416, "learning_rate": 0.0002, "epoch": 0.26720106880427524, "step": 200}, {"loss": 1.8728, "grad_norm": 0.398699015378952, "learning_rate": 0.0002, "epoch": 0.280561122244489, "step": 210}, {"loss": 1.8205, "grad_norm": 0.3585626482963562, "learning_rate": 0.0002, "epoch": 0.2939211756847027, "step": 220}, {"loss": 1.8861, "grad_norm": 0.3811776041984558, "learning_rate": 0.0002, "epoch": 0.3072812291249165, "step": 230}, {"loss": 1.8365, "grad_norm": 0.37261509895324707, "learning_rate": 0.0002, "epoch": 0.32064128256513025, "step": 240}, {"loss": 1.9186, "grad_norm": 0.39762404561042786, "learning_rate": 0.0002, "epoch": 0.33400133600534404, "step": 250}, {"loss": 1.7965, "grad_norm": 0.3509528934955597, "learning_rate": 0.0002, "epoch": 0.3473613894455578, "step": 260}, {"loss": 1.7802, "grad_norm": 0.3169104754924774, "learning_rate": 0.0002, "epoch": 0.36072144288577157, "step": 270}, {"loss": 1.8038, "grad_norm": 0.33714795112609863, "learning_rate": 0.0002, "epoch": 0.3740814963259853, "step": 280}, {"loss": 1.787, "grad_norm": 1.2936875820159912, "learning_rate": 0.0002, "epoch": 0.38744154976619904, "step": 290}, {"loss": 1.7974, "grad_norm": 0.3459427058696747, "learning_rate": 0.0002, "epoch": 0.40080160320641284, "step": 300}, {"loss": 1.8879, "grad_norm": 0.3380655348300934, "learning_rate": 0.0002, "epoch": 0.4141616566466266, "step": 310}, {"loss": 1.9196, "grad_norm": 0.3890381455421448, "learning_rate": 0.0002, "epoch": 0.42752171008684037, "step": 320}, {"loss": 1.8034, "grad_norm": 0.432327002286911, "learning_rate": 0.0002, "epoch": 0.4408817635270541, "step": 330}, {"loss": 1.8443, "grad_norm": 0.3736560046672821, "learning_rate": 0.0002, "epoch": 0.45424181696726784, "step": 340}, {"loss": 1.8506, "grad_norm": 0.3700982630252838, "learning_rate": 0.0002, "epoch": 0.46760187040748163, "step": 350}, {"loss": 1.7978, "grad_norm": 0.4533902406692505, "learning_rate": 0.0002, "epoch": 0.48096192384769537, "step": 360}, {"loss": 1.7427, "grad_norm": 0.35999053716659546, "learning_rate": 0.0002, "epoch": 0.49432197728790916, "step": 370}, {"loss": 1.7995, "grad_norm": 0.3490903675556183, "learning_rate": 0.0002, "epoch": 0.5076820307281229, "step": 380}, {"loss": 1.8709, "grad_norm": 0.34704291820526123, "learning_rate": 0.0002, "epoch": 0.5210420841683366, "step": 390}, {"loss": 1.7948, "grad_norm": 0.343565434217453, "learning_rate": 0.0002, "epoch": 0.5344021376085505, "step": 400}, {"loss": 1.8564, "grad_norm": 0.3573552966117859, "learning_rate": 0.0002, "epoch": 0.5477621910487642, "step": 410}, {"loss": 1.8477, "grad_norm": 0.32980719208717346, "learning_rate": 0.0002, "epoch": 0.561122244488978, "step": 420}, {"loss": 1.9233, "grad_norm": 0.356952428817749, "learning_rate": 0.0002, "epoch": 0.5744822979291917, "step": 430}, {"loss": 1.7433, "grad_norm": 0.3170869052410126, "learning_rate": 0.0002, "epoch": 0.5878423513694054, "step": 440}, {"loss": 1.7607, "grad_norm": 0.35233718156814575, "learning_rate": 0.0002, "epoch": 0.6012024048096193, "step": 450}, {"loss": 1.8111, "grad_norm": 0.3480125367641449, "learning_rate": 0.0002, "epoch": 0.614562458249833, "step": 460}, {"loss": 1.8386, "grad_norm": 0.4762810468673706, "learning_rate": 0.0002, "epoch": 0.6279225116900468, "step": 470}, {"loss": 1.805, "grad_norm": 0.3907663822174072, "learning_rate": 0.0002, "epoch": 0.6412825651302605, "step": 480}, {"loss": 1.8113, "grad_norm": 0.36315613985061646, "learning_rate": 0.0002, "epoch": 0.6546426185704742, "step": 490}, {"loss": 1.7805, "grad_norm": 0.377796471118927, "learning_rate": 0.0002, "epoch": 0.6680026720106881, "step": 500}, {"loss": 1.7457, "grad_norm": 0.34284207224845886, "learning_rate": 0.0002, "epoch": 0.6813627254509018, "step": 510}, {"loss": 1.8013, "grad_norm": 0.35563018918037415, "learning_rate": 0.0002, "epoch": 0.6947227788911156, "step": 520}, {"loss": 1.8414, "grad_norm": 0.37575867772102356, "learning_rate": 0.0002, "epoch": 0.7080828323313293, "step": 530}, {"loss": 1.7993, "grad_norm": 0.35719701647758484, "learning_rate": 0.0002, "epoch": 0.7214428857715431, "step": 540}, {"loss": 1.7574, "grad_norm": 0.385813444852829, "learning_rate": 0.0002, "epoch": 0.7348029392117569, "step": 550}, {"loss": 1.7985, "grad_norm": 0.44509607553482056, "learning_rate": 0.0002, "epoch": 0.7481629926519706, "step": 560}, {"loss": 1.7459, "grad_norm": 0.36108464002609253, "learning_rate": 0.0002, "epoch": 0.7615230460921844, "step": 570}, {"loss": 1.8207, "grad_norm": 0.3530745804309845, "learning_rate": 0.0002, "epoch": 0.7748830995323981, "step": 580}, {"loss": 1.7479, "grad_norm": 0.34888574481010437, "learning_rate": 0.0002, "epoch": 0.7882431529726119, "step": 590}, {"loss": 1.8656, "grad_norm": 0.387346476316452, "learning_rate": 0.0002, "epoch": 0.8016032064128257, "step": 600}, {"loss": 1.8071, "grad_norm": 0.3641138970851898, "learning_rate": 0.0002, "epoch": 0.8149632598530394, "step": 610}, {"loss": 1.7531, "grad_norm": 0.33729103207588196, "learning_rate": 0.0002, "epoch": 0.8283233132932531, "step": 620}, {"loss": 1.8613, "grad_norm": 0.3652004599571228, "learning_rate": 0.0002, "epoch": 0.8416833667334669, "step": 630}, {"loss": 1.9184, "grad_norm": 0.3986643850803375, "learning_rate": 0.0002, "epoch": 0.8550434201736807, "step": 640}, {"loss": 1.8198, "grad_norm": 0.3458964228630066, "learning_rate": 0.0002, "epoch": 0.8684034736138945, "step": 650}, {"loss": 1.803, "grad_norm": 0.3559381365776062, "learning_rate": 0.0002, "epoch": 0.8817635270541082, "step": 660}, {"loss": 1.7641, "grad_norm": 0.3612841069698334, "learning_rate": 0.0002, "epoch": 0.895123580494322, "step": 670}, {"loss": 1.7888, "grad_norm": 0.34771719574928284, "learning_rate": 0.0002, "epoch": 0.9084836339345357, "step": 680}, {"loss": 1.8204, "grad_norm": 0.3371497094631195, "learning_rate": 0.0002, "epoch": 0.9218436873747495, "step": 690}, {"loss": 1.7848, "grad_norm": 0.5596055388450623, "learning_rate": 0.0002, "epoch": 0.9352037408149633, "step": 700}, {"loss": 1.7794, "grad_norm": 0.311880499124527, "learning_rate": 0.0002, "epoch": 0.948563794255177, "step": 710}, {"loss": 1.8464, "grad_norm": 0.3462068736553192, "learning_rate": 0.0002, "epoch": 0.9619238476953907, "step": 720}, {"loss": 1.8197, "grad_norm": 0.29982393980026245, "learning_rate": 0.0002, "epoch": 0.9752839011356046, "step": 730}, {"loss": 1.8503, "grad_norm": 0.34606459736824036, "learning_rate": 0.0002, "epoch": 0.9886439545758183, "step": 740}, {"eval_loss": 1.8201380968093872, "eval_runtime": 38.6124, "eval_samples_per_second": 13.338, "eval_steps_per_second": 1.683, "epoch": 0.9993319973279893, "step": 748}, {"loss": 1.7786, "grad_norm": 0.32302048802375793, "learning_rate": 0.0002, "epoch": 1.002004008016032, "step": 750}, {"loss": 1.7297, "grad_norm": 0.37585633993148804, "learning_rate": 0.0002, "epoch": 1.0153640614562458, "step": 760}, {"loss": 1.7008, "grad_norm": 0.33826273679733276, "learning_rate": 0.0002, "epoch": 1.0287241148964597, "step": 770}, {"loss": 1.809, "grad_norm": 0.44682955741882324, "learning_rate": 0.0002, "epoch": 1.0420841683366733, "step": 780}, {"loss": 1.7092, "grad_norm": 0.422188401222229, "learning_rate": 0.0002, "epoch": 1.0554442217768871, "step": 790}, {"loss": 1.7765, "grad_norm": 0.3809906244277954, "learning_rate": 0.0002, "epoch": 1.0688042752171008, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3454349637031555, "learning_rate": 0.0002, "epoch": 1.0821643286573146, "step": 810}, {"loss": 1.7257, "grad_norm": 0.3767355978488922, "learning_rate": 0.0002, "epoch": 1.0955243820975284, "step": 820}, {"loss": 1.7224, "grad_norm": 0.3361407518386841, "learning_rate": 0.0002, "epoch": 1.108884435537742, "step": 830}, {"loss": 1.7509, "grad_norm": 0.3654632568359375, "learning_rate": 0.0002, "epoch": 1.122244488977956, "step": 840}, {"loss": 1.7151, "grad_norm": 0.3822861313819885, "learning_rate": 0.0002, "epoch": 1.1356045424181698, "step": 850}, {"loss": 1.7121, "grad_norm": 0.3853831887245178, "learning_rate": 0.0002, "epoch": 1.1489645958583834, "step": 860}, {"loss": 1.7685, "grad_norm": 0.35521796345710754, "learning_rate": 0.0002, "epoch": 1.1623246492985972, "step": 870}, {"loss": 1.7735, "grad_norm": 0.4107200503349304, "learning_rate": 0.0002, "epoch": 1.1756847027388109, "step": 880}, {"loss": 1.7484, "grad_norm": 0.33219534158706665, "learning_rate": 0.0002, "epoch": 1.1890447561790247, "step": 890}, {"loss": 1.7071, "grad_norm": 0.3559704124927521, "learning_rate": 0.0002, "epoch": 1.2024048096192386, "step": 900}, {"loss": 1.7535, "grad_norm": 0.3700537383556366, "learning_rate": 0.0002, "epoch": 1.2157648630594522, "step": 910}, {"loss": 1.7513, "grad_norm": 0.3771909475326538, "learning_rate": 0.0002, "epoch": 1.229124916499666, "step": 920}, {"loss": 1.7566, "grad_norm": 0.3136613965034485, "learning_rate": 0.0002, "epoch": 1.2424849699398797, "step": 930}, {"loss": 1.6783, "grad_norm": 0.3952099084854126, "learning_rate": 0.0002, "epoch": 1.2558450233800935, "step": 940}, {"loss": 1.7691, "grad_norm": 0.36534377932548523, "learning_rate": 0.0002, "epoch": 1.2692050768203074, "step": 950}, {"loss": 1.7127, "grad_norm": 0.3803492486476898, "learning_rate": 0.0002, "epoch": 1.282565130260521, "step": 960}, {"loss": 1.7896, "grad_norm": 0.3992428183555603, "learning_rate": 0.0002, "epoch": 1.2959251837007348, "step": 970}, {"loss": 1.7343, "grad_norm": 0.3627142906188965, "learning_rate": 0.0002, "epoch": 1.3092852371409487, "step": 980}, {"loss": 1.7598, "grad_norm": 0.4248180091381073, "learning_rate": 0.0002, "epoch": 1.3226452905811623, "step": 990}, {"loss": 1.6896, "grad_norm": 0.4060308039188385, "learning_rate": 0.0002, "epoch": 1.3360053440213762, "step": 1000}, {"loss": 1.7457, "grad_norm": 0.3788969814777374, "learning_rate": 0.0002, "epoch": 1.3493653974615898, "step": 1010}, {"loss": 1.7111, "grad_norm": 0.4174270033836365, "learning_rate": 0.0002, "epoch": 1.3627254509018036, "step": 1020}, {"loss": 1.7975, "grad_norm": 0.35500675439834595, "learning_rate": 0.0002, "epoch": 1.3760855043420173, "step": 1030}, {"loss": 1.724, "grad_norm": 0.3454059362411499, "learning_rate": 0.0002, "epoch": 1.389445557782231, "step": 1040}, {"loss": 1.8299, "grad_norm": 0.45807570219039917, "learning_rate": 0.0002, "epoch": 1.402805611222445, "step": 1050}, {"loss": 1.7425, "grad_norm": 0.39338022470474243, "learning_rate": 0.0002, "epoch": 1.4161656646626586, "step": 1060}, {"loss": 1.7457, "grad_norm": 0.3870709240436554, "learning_rate": 0.0002, "epoch": 1.4295257181028724, "step": 1070}, {"loss": 1.6565, "grad_norm": 0.40996190905570984, "learning_rate": 0.0002, "epoch": 1.4428857715430863, "step": 1080}, {"loss": 1.7324, "grad_norm": 0.38762837648391724, "learning_rate": 0.0002, "epoch": 1.4562458249833, "step": 1090}, {"loss": 1.7362, "grad_norm": 0.36756977438926697, "learning_rate": 0.0002, "epoch": 1.4696058784235138, "step": 1100}, {"loss": 1.7451, "grad_norm": 0.4087235927581787, "learning_rate": 0.0002, "epoch": 1.4829659318637274, "step": 1110}, {"loss": 1.7114, "grad_norm": 0.3357745110988617, "learning_rate": 0.0002, "epoch": 1.4963259853039412, "step": 1120}, {"loss": 1.6877, "grad_norm": 0.37486532330513, "learning_rate": 0.0002, "epoch": 1.5096860387441549, "step": 1130}, {"loss": 1.7252, "grad_norm": 0.3387809991836548, "learning_rate": 0.0002, "epoch": 1.5230460921843687, "step": 1140}, {"loss": 1.7169, "grad_norm": 0.37462118268013, "learning_rate": 0.0002, "epoch": 1.5364061456245826, "step": 1150}, {"loss": 1.6988, "grad_norm": 0.38575324416160583, "learning_rate": 0.0002, "epoch": 1.5497661990647962, "step": 1160}, {"loss": 1.7438, "grad_norm": 0.3515765964984894, "learning_rate": 0.0002, "epoch": 1.56312625250501, "step": 1170}, {"loss": 1.7524, "grad_norm": 0.39308643341064453, "learning_rate": 0.0002, "epoch": 1.5764863059452239, "step": 1180}, {"loss": 1.6422, "grad_norm": 0.3308864235877991, "learning_rate": 0.0002, "epoch": 1.5898463593854375, "step": 1190}, {"loss": 1.7566, "grad_norm": 0.3397478461265564, "learning_rate": 0.0002, "epoch": 1.6032064128256514, "step": 1200}, {"loss": 1.7871, "grad_norm": 0.3911525309085846, "learning_rate": 0.0002, "epoch": 1.6165664662658652, "step": 1210}, {"loss": 1.7443, "grad_norm": 0.3771969974040985, "learning_rate": 0.0002, "epoch": 1.6299265197060788, "step": 1220}, {"loss": 1.7631, "grad_norm": 0.35346856713294983, "learning_rate": 0.0002, "epoch": 1.6432865731462925, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.41736963391304016, "learning_rate": 0.0002, "epoch": 1.6566466265865063, "step": 1240}, {"loss": 1.7582, "grad_norm": 0.3375225067138672, "learning_rate": 0.0002, "epoch": 1.6700066800267201, "step": 1250}, {"loss": 1.6916, "grad_norm": 0.3779928982257843, "learning_rate": 0.0002, "epoch": 1.6833667334669338, "step": 1260}, {"loss": 1.728, "grad_norm": 0.35388994216918945, "learning_rate": 0.0002, "epoch": 1.6967267869071476, "step": 1270}, {"loss": 1.7461, "grad_norm": 0.33884134888648987, "learning_rate": 0.0002, "epoch": 1.7100868403473615, "step": 1280}, {"loss": 1.7083, "grad_norm": 0.35439756512641907, "learning_rate": 0.0002, "epoch": 1.723446893787575, "step": 1290}, {"loss": 1.7389, "grad_norm": 0.3766156733036041, "learning_rate": 0.0002, "epoch": 1.736806947227789, "step": 1300}, {"loss": 1.7847, "grad_norm": 0.36148911714553833, "learning_rate": 0.0002, "epoch": 1.7501670006680028, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.39687496423721313, "learning_rate": 0.0002, "epoch": 1.7635270541082164, "step": 1320}, {"loss": 1.7541, "grad_norm": 0.35639452934265137, "learning_rate": 0.0002, "epoch": 1.77688710754843, "step": 1330}, {"loss": 1.7254, "grad_norm": 0.38781628012657166, "learning_rate": 0.0002, "epoch": 1.7902471609886441, "step": 1340}, {"loss": 1.7867, "grad_norm": 0.42784637212753296, "learning_rate": 0.0002, "epoch": 1.8036072144288577, "step": 1350}, {"loss": 1.7336, "grad_norm": 0.40258511900901794, "learning_rate": 0.0002, "epoch": 1.8169672678690714, "step": 1360}, {"loss": 1.7771, "grad_norm": 0.36674195528030396, "learning_rate": 0.0002, "epoch": 1.8303273213092852, "step": 1370}, {"loss": 1.7425, "grad_norm": 0.4064558446407318, "learning_rate": 0.0002, "epoch": 1.843687374749499, "step": 1380}, {"loss": 1.7425, "grad_norm": 0.3669849932193756, "learning_rate": 0.0002, "epoch": 1.8570474281897127, "step": 1390}, {"loss": 1.7924, "grad_norm": 0.37569567561149597, "learning_rate": 0.0002, "epoch": 1.8704074816299265, "step": 1400}, {"loss": 1.7885, "grad_norm": 0.37307995557785034, "learning_rate": 0.0002, "epoch": 1.8837675350701404, "step": 1410}, {"loss": 1.7548, "grad_norm": 0.3772695064544678, "learning_rate": 0.0002, "epoch": 1.897127588510354, "step": 1420}, {"loss": 1.7682, "grad_norm": 0.36993589997291565, "learning_rate": 0.0002, "epoch": 1.9104876419505676, "step": 1430}, {"loss": 1.7249, "grad_norm": 0.3490557372570038, "learning_rate": 0.0002, "epoch": 1.9238476953907817, "step": 1440}, {"loss": 1.7979, "grad_norm": 0.3716149628162384, "learning_rate": 0.0002, "epoch": 1.9372077488309953, "step": 1450}, {"loss": 1.6664, "grad_norm": 0.39236098527908325, "learning_rate": 0.0002, "epoch": 1.950567802271209, "step": 1460}, {"loss": 1.6852, "grad_norm": 0.37258651852607727, "learning_rate": 0.0002, "epoch": 1.9639278557114228, "step": 1470}, {"loss": 1.7427, "grad_norm": 0.36183077096939087, "learning_rate": 0.0002, "epoch": 1.9772879091516367, "step": 1480}, {"loss": 1.7055, "grad_norm": 0.3956947326660156, "learning_rate": 0.0002, "epoch": 1.9906479625918503, "step": 1490}, {"eval_loss": 1.8132041692733765, "eval_runtime": 38.6287, "eval_samples_per_second": 13.332, "eval_steps_per_second": 1.683, "epoch": 2.0, "step": 1497}, {"loss": 1.6791, "grad_norm": 0.34480565786361694, "learning_rate": 0.0002, "epoch": 2.004008016032064, "step": 1500}, {"loss": 1.6367, "grad_norm": 0.3418028652667999, "learning_rate": 0.0002, "epoch": 2.017368069472278, "step": 1510}, {"loss": 1.5827, "grad_norm": 0.4514467716217041, "learning_rate": 0.0002, "epoch": 2.0307281229124916, "step": 1520}, {"loss": 1.6365, "grad_norm": 0.4197506606578827, "learning_rate": 0.0002, "epoch": 2.0440881763527052, "step": 1530}, {"loss": 1.6221, "grad_norm": 0.4134170711040497, "learning_rate": 0.0002, "epoch": 2.0574482297929193, "step": 1540}, {"loss": 1.6876, "grad_norm": 0.43709826469421387, "learning_rate": 0.0002, "epoch": 2.070808283233133, "step": 1550}, {"loss": 1.5779, "grad_norm": 0.4703378677368164, "learning_rate": 0.0002, "epoch": 2.0841683366733466, "step": 1560}, {"loss": 1.599, "grad_norm": 0.4538188576698303, "learning_rate": 0.0002, "epoch": 2.0975283901135606, "step": 1570}, {"loss": 1.6464, "grad_norm": 0.4649668037891388, "learning_rate": 0.0002, "epoch": 2.1108884435537743, "step": 1580}, {"loss": 1.6348, "grad_norm": 0.42669883370399475, "learning_rate": 0.0002, "epoch": 2.124248496993988, "step": 1590}, {"loss": 1.5838, "grad_norm": 0.43162038922309875, "learning_rate": 0.0002, "epoch": 2.1376085504342015, "step": 1600}, {"loss": 1.6673, "grad_norm": 0.4294586479663849, "learning_rate": 0.0002, "epoch": 2.1509686038744156, "step": 1610}, {"loss": 1.6024, "grad_norm": 0.4669102132320404, "learning_rate": 0.0002, "epoch": 2.164328657314629, "step": 1620}, {"loss": 1.659, "grad_norm": 0.4188412129878998, "learning_rate": 0.0002, "epoch": 2.177688710754843, "step": 1630}, {"loss": 1.625, "grad_norm": 0.4662680923938751, "learning_rate": 0.0002, "epoch": 2.191048764195057, "step": 1640}, {"loss": 1.6699, "grad_norm": 0.4020286500453949, "learning_rate": 0.0002, "epoch": 2.2044088176352705, "step": 1650}, {"loss": 1.6284, "grad_norm": 0.41919606924057007, "learning_rate": 0.0002, "epoch": 2.217768871075484, "step": 1660}, {"loss": 1.6776, "grad_norm": 0.4644531309604645, "learning_rate": 0.0002, "epoch": 2.231128924515698, "step": 1670}, {"loss": 1.6711, "grad_norm": 0.4526427984237671, "learning_rate": 0.0002, "epoch": 2.244488977955912, "step": 1680}, {"loss": 1.6058, "grad_norm": 0.45953166484832764, "learning_rate": 0.0002, "epoch": 2.2578490313961255, "step": 1690}, {"loss": 1.5979, "grad_norm": 0.4701860249042511, "learning_rate": 0.0002, "epoch": 2.2712090848363395, "step": 1700}, {"loss": 1.6183, "grad_norm": 0.4749310612678528, "learning_rate": 0.0002, "epoch": 2.284569138276553, "step": 1710}, {"loss": 1.6703, "grad_norm": 0.45026102662086487, "learning_rate": 0.0002, "epoch": 2.297929191716767, "step": 1720}, {"loss": 1.6386, "grad_norm": 0.4755004048347473, "learning_rate": 0.0002, "epoch": 2.3112892451569804, "step": 1730}, {"loss": 1.6365, "grad_norm": 0.4505726993083954, "learning_rate": 0.0002, "epoch": 2.3246492985971945, "step": 1740}, {"loss": 1.589, "grad_norm": 0.44464054703712463, "learning_rate": 0.0002, "epoch": 2.338009352037408, "step": 1750}, {"loss": 1.6139, "grad_norm": 0.4449476897716522, "learning_rate": 0.0002, "epoch": 2.3513694054776217, "step": 1760}, {"loss": 1.7195, "grad_norm": 0.4216482937335968, "learning_rate": 0.0002, "epoch": 2.364729458917836, "step": 1770}, {"loss": 1.7075, "grad_norm": 0.4379308521747589, "learning_rate": 0.0002, "epoch": 2.3780895123580494, "step": 1780}, {"loss": 1.7024, "grad_norm": 0.41670042276382446, "learning_rate": 0.0002, "epoch": 2.391449565798263, "step": 1790}, {"loss": 1.5989, "grad_norm": 0.48089510202407837, "learning_rate": 0.0002, "epoch": 2.404809619238477, "step": 1800}, {"loss": 1.6313, "grad_norm": 0.4389738142490387, "learning_rate": 0.0002, "epoch": 2.4181696726786908, "step": 1810}, {"loss": 1.5841, "grad_norm": 0.45293036103248596, "learning_rate": 0.0002, "epoch": 2.4315297261189044, "step": 1820}, {"loss": 1.6887, "grad_norm": 0.5211683511734009, "learning_rate": 0.0002, "epoch": 2.4448897795591185, "step": 1830}, {"loss": 1.6599, "grad_norm": 0.4631884694099426, "learning_rate": 0.0002, "epoch": 2.458249832999332, "step": 1840}, {"loss": 1.6537, "grad_norm": 0.4276818335056305, "learning_rate": 0.0002, "epoch": 2.4716098864395457, "step": 1850}, {"loss": 1.6836, "grad_norm": 0.477524071931839, "learning_rate": 0.0002, "epoch": 2.4849699398797593, "step": 1860}, {"loss": 1.66, "grad_norm": 0.44860973954200745, "learning_rate": 0.0002, "epoch": 2.4983299933199734, "step": 1870}, {"loss": 1.6308, "grad_norm": 0.46413546800613403, "learning_rate": 0.0002, "epoch": 2.511690046760187, "step": 1880}, {"loss": 1.6225, "grad_norm": 0.42487645149230957, "learning_rate": 0.0002, "epoch": 2.5250501002004007, "step": 1890}, {"loss": 1.6268, "grad_norm": 0.4778307378292084, "learning_rate": 0.0002, "epoch": 2.5384101536406147, "step": 1900}, {"loss": 1.6143, "grad_norm": 0.45307061076164246, "learning_rate": 0.0002, "epoch": 2.5517702070808284, "step": 1910}, {"loss": 1.7279, "grad_norm": 0.47886642813682556, "learning_rate": 0.0002, "epoch": 2.565130260521042, "step": 1920}, {"loss": 1.5931, "grad_norm": 0.4839435815811157, "learning_rate": 0.0002, "epoch": 2.5784903139612556, "step": 1930}, {"loss": 1.6089, "grad_norm": 0.4388359785079956, "learning_rate": 0.0002, "epoch": 2.5918503674014697, "step": 1940}, {"loss": 1.6828, "grad_norm": 0.47859734296798706, "learning_rate": 0.0002, "epoch": 2.6052104208416833, "step": 1950}, {"loss": 1.6014, "grad_norm": 0.5526517033576965, "learning_rate": 0.0002, "epoch": 2.6185704742818974, "step": 1960}, {"loss": 1.6889, "grad_norm": 0.5449170470237732, "learning_rate": 0.0002, "epoch": 2.631930527722111, "step": 1970}, {"loss": 1.6481, "grad_norm": 0.48521968722343445, "learning_rate": 0.0002, "epoch": 2.6452905811623246, "step": 1980}, {"loss": 1.6741, "grad_norm": 0.4733737111091614, "learning_rate": 0.0002, "epoch": 2.6586506346025383, "step": 1990}, {"loss": 1.662, "grad_norm": 0.507118284702301, "learning_rate": 0.0002, "epoch": 2.6720106880427523, "step": 2000}, {"loss": 1.6419, "grad_norm": 0.4508971571922302, "learning_rate": 0.0002, "epoch": 2.685370741482966, "step": 2010}, {"loss": 1.7052, "grad_norm": 0.4657728672027588, "learning_rate": 0.0002, "epoch": 2.6987307949231796, "step": 2020}, {"loss": 1.6261, "grad_norm": 0.48647549748420715, "learning_rate": 0.0002, "epoch": 2.7120908483633936, "step": 2030}, {"loss": 1.5638, "grad_norm": 0.49525555968284607, "learning_rate": 0.0002, "epoch": 2.7254509018036073, "step": 2040}, {"loss": 1.658, "grad_norm": 0.4712379276752472, "learning_rate": 0.0002, "epoch": 2.738810955243821, "step": 2050}, {"loss": 1.6464, "grad_norm": 0.4846591055393219, "learning_rate": 0.0002, "epoch": 2.7521710086840345, "step": 2060}, {"loss": 1.5641, "grad_norm": 0.4823240041732788, "learning_rate": 0.0002, "epoch": 2.7655310621242486, "step": 2070}, {"loss": 1.6701, "grad_norm": 0.4546685516834259, "learning_rate": 0.0002, "epoch": 2.778891115564462, "step": 2080}, {"loss": 1.7015, "grad_norm": 0.45542681217193604, "learning_rate": 0.0002, "epoch": 2.7922511690046763, "step": 2090}, {"loss": 1.6398, "grad_norm": 0.42137566208839417, "learning_rate": 0.0002, "epoch": 2.80561122244489, "step": 2100}, {"loss": 1.6526, "grad_norm": 0.6143282055854797, "learning_rate": 0.0002, "epoch": 2.8189712758851035, "step": 2110}, {"loss": 1.6955, "grad_norm": 0.4828081727027893, "learning_rate": 0.0002, "epoch": 2.832331329325317, "step": 2120}, {"loss": 1.744, "grad_norm": 0.4319005608558655, "learning_rate": 0.0002, "epoch": 2.845691382765531, "step": 2130}, {"loss": 1.6717, "grad_norm": 0.4297086298465729, "learning_rate": 0.0002, "epoch": 2.859051436205745, "step": 2140}, {"loss": 1.5968, "grad_norm": 0.5011981129646301, "learning_rate": 0.0002, "epoch": 2.8724114896459585, "step": 2150}, {"loss": 1.7181, "grad_norm": 0.4401548504829407, "learning_rate": 0.0002, "epoch": 2.8857715430861726, "step": 2160}, {"loss": 1.5722, "grad_norm": 0.48090746998786926, "learning_rate": 0.0002, "epoch": 2.899131596526386, "step": 2170}, {"loss": 1.6596, "grad_norm": 0.4740385413169861, "learning_rate": 0.0002, "epoch": 2.9124916499666, "step": 2180}, {"loss": 1.6501, "grad_norm": 0.5337260365486145, "learning_rate": 0.0002, "epoch": 2.9258517034068134, "step": 2190}, {"loss": 1.6802, "grad_norm": 0.4420052766799927, "learning_rate": 0.0002, "epoch": 2.9392117568470275, "step": 2200}, {"loss": 1.5474, "grad_norm": 0.477512389421463, "learning_rate": 0.0002, "epoch": 2.952571810287241, "step": 2210}, {"loss": 1.6544, "grad_norm": 0.5344052910804749, "learning_rate": 0.0002, "epoch": 2.9659318637274548, "step": 2220}, {"loss": 1.6866, "grad_norm": 0.4483940303325653, "learning_rate": 0.0002, "epoch": 2.979291917167669, "step": 2230}, {"loss": 1.6477, "grad_norm": 0.4366597831249237, "learning_rate": 0.0002, "epoch": 2.9926519706078825, "step": 2240}, {"eval_loss": 1.834012746810913, "eval_runtime": 38.5659, "eval_samples_per_second": 13.354, "eval_steps_per_second": 1.685, "epoch": 2.9993319973279893, "step": 2245}, {"loss": 1.5582, "grad_norm": 0.428824245929718, "learning_rate": 0.0002, "epoch": 3.006012024048096, "step": 2250}, {"loss": 1.499, "grad_norm": 0.4870174825191498, "learning_rate": 0.0002, "epoch": 3.01937207748831, "step": 2260}, {"loss": 1.4872, "grad_norm": 0.4684266149997711, "learning_rate": 0.0002, "epoch": 3.032732130928524, "step": 2270}, {"loss": 1.5284, "grad_norm": 0.581604540348053, "learning_rate": 0.0002, "epoch": 3.0460921843687374, "step": 2280}, {"loss": 1.4549, "grad_norm": 0.5561677813529968, "learning_rate": 0.0002, "epoch": 3.059452237808951, "step": 2290}, {"loss": 1.4903, "grad_norm": 0.5750220417976379, "learning_rate": 0.0002, "epoch": 3.072812291249165, "step": 2300}, {"loss": 1.5903, "grad_norm": 0.5704626441001892, "learning_rate": 0.0002, "epoch": 3.0861723446893787, "step": 2310}, {"loss": 1.4292, "grad_norm": 0.6242083311080933, "learning_rate": 0.0002, "epoch": 3.0995323981295924, "step": 2320}, {"loss": 1.5092, "grad_norm": 0.5174121260643005, "learning_rate": 0.0002, "epoch": 3.1128924515698064, "step": 2330}, {"loss": 1.5106, "grad_norm": 0.5697633028030396, "learning_rate": 0.0002, "epoch": 3.12625250501002, "step": 2340}, {"loss": 1.5156, "grad_norm": 0.5969541072845459, "learning_rate": 0.0002, "epoch": 3.1396125584502337, "step": 2350}, {"loss": 1.52, "grad_norm": 0.6244304180145264, "learning_rate": 0.0002, "epoch": 3.1529726118904478, "step": 2360}, {"loss": 1.5244, "grad_norm": 0.5561705827713013, "learning_rate": 0.0002, "epoch": 3.1663326653306614, "step": 2370}, {"loss": 1.6169, "grad_norm": 0.5401188135147095, "learning_rate": 0.0002, "epoch": 3.179692718770875, "step": 2380}, {"loss": 1.5387, "grad_norm": 0.6450421810150146, "learning_rate": 0.0002, "epoch": 3.1930527722110886, "step": 2390}, {"loss": 1.4839, "grad_norm": 0.5741903185844421, "learning_rate": 0.0002, "epoch": 3.2064128256513027, "step": 2400}, {"loss": 1.5584, "grad_norm": 0.6337407231330872, "learning_rate": 0.0002, "epoch": 3.2197728790915163, "step": 2410}, {"loss": 1.5025, "grad_norm": 0.6493517160415649, "learning_rate": 0.0002, "epoch": 3.23313293253173, "step": 2420}, {"loss": 1.5168, "grad_norm": 0.6230176091194153, "learning_rate": 0.0002, "epoch": 3.246492985971944, "step": 2430}, {"loss": 1.5408, "grad_norm": 0.680704653263092, "learning_rate": 0.0002, "epoch": 3.2598530394121576, "step": 2440}, {"loss": 1.6005, "grad_norm": 0.5279417037963867, "learning_rate": 0.0002, "epoch": 3.2732130928523713, "step": 2450}, {"loss": 1.5231, "grad_norm": 0.5601515173912048, "learning_rate": 0.0002, "epoch": 3.2865731462925853, "step": 2460}, {"loss": 1.4949, "grad_norm": 0.5591090321540833, "learning_rate": 0.0002, "epoch": 3.299933199732799, "step": 2470}, {"loss": 1.5181, "grad_norm": 0.6596529483795166, "learning_rate": 0.0002, "epoch": 3.3132932531730126, "step": 2480}, {"loss": 1.5259, "grad_norm": 0.6115918755531311, "learning_rate": 0.0002, "epoch": 3.3266533066132267, "step": 2490}, {"loss": 1.5344, "grad_norm": 0.6443548202514648, "learning_rate": 0.0002, "epoch": 3.3400133600534403, "step": 2500}, {"loss": 1.5037, "grad_norm": 0.5504242181777954, "learning_rate": 0.0002, "epoch": 3.353373413493654, "step": 2510}, {"loss": 1.5049, "grad_norm": 0.6104483604431152, "learning_rate": 0.0002, "epoch": 3.3667334669338675, "step": 2520}, {"loss": 1.587, "grad_norm": 0.8387531638145447, "learning_rate": 0.0002, "epoch": 3.3800935203740816, "step": 2530}, {"loss": 1.5227, "grad_norm": 0.6346094012260437, "learning_rate": 0.0002, "epoch": 3.3934535738142952, "step": 2540}, {"loss": 1.4855, "grad_norm": 0.6261265873908997, "learning_rate": 0.0002, "epoch": 3.406813627254509, "step": 2550}, {"loss": 1.5233, "grad_norm": 0.5960372090339661, "learning_rate": 0.0002, "epoch": 3.420173680694723, "step": 2560}, {"loss": 1.5153, "grad_norm": 0.5291280746459961, "learning_rate": 0.0002, "epoch": 3.4335337341349366, "step": 2570}, {"loss": 1.5152, "grad_norm": 0.6133161783218384, "learning_rate": 0.0002, "epoch": 3.44689378757515, "step": 2580}, {"loss": 1.5533, "grad_norm": 0.623573362827301, "learning_rate": 0.0002, "epoch": 3.460253841015364, "step": 2590}, {"loss": 1.4935, "grad_norm": 0.5959834456443787, "learning_rate": 0.0002, "epoch": 3.473613894455578, "step": 2600}, {"loss": 1.5792, "grad_norm": 0.583332359790802, "learning_rate": 0.0002, "epoch": 3.4869739478957915, "step": 2610}, {"loss": 1.5229, "grad_norm": 0.6003559231758118, "learning_rate": 0.0002, "epoch": 3.5003340013360056, "step": 2620}, {"loss": 1.4901, "grad_norm": 0.5832992196083069, "learning_rate": 0.0002, "epoch": 3.513694054776219, "step": 2630}, {"loss": 1.5005, "grad_norm": 0.5942609906196594, "learning_rate": 0.0002, "epoch": 3.527054108216433, "step": 2640}, {"loss": 1.5213, "grad_norm": 0.6087163686752319, "learning_rate": 0.0002, "epoch": 3.5404141616566465, "step": 2650}, {"loss": 1.5826, "grad_norm": 0.631948709487915, "learning_rate": 0.0002, "epoch": 3.5537742150968605, "step": 2660}, {"loss": 1.5844, "grad_norm": 0.6450803279876709, "learning_rate": 0.0002, "epoch": 3.567134268537074, "step": 2670}, {"loss": 1.4981, "grad_norm": 0.6507797837257385, "learning_rate": 0.0002, "epoch": 3.580494321977288, "step": 2680}, {"loss": 1.5826, "grad_norm": 0.5778017044067383, "learning_rate": 0.0002, "epoch": 3.593854375417502, "step": 2690}, {"loss": 1.4688, "grad_norm": 0.6214032173156738, "learning_rate": 0.0002, "epoch": 3.6072144288577155, "step": 2700}, {"loss": 1.5084, "grad_norm": 0.5681133270263672, "learning_rate": 0.0002, "epoch": 3.620574482297929, "step": 2710}, {"loss": 1.471, "grad_norm": 0.6074244976043701, "learning_rate": 0.0002, "epoch": 3.6339345357381427, "step": 2720}, {"loss": 1.5243, "grad_norm": 0.5900560617446899, "learning_rate": 0.0002, "epoch": 3.647294589178357, "step": 2730}, {"loss": 1.5074, "grad_norm": 0.5817505717277527, "learning_rate": 0.0002, "epoch": 3.6606546426185704, "step": 2740}, {"loss": 1.5117, "grad_norm": 0.6095547676086426, "learning_rate": 0.0002, "epoch": 3.6740146960587845, "step": 2750}, {"loss": 1.5117, "grad_norm": 0.612790584564209, "learning_rate": 0.0002, "epoch": 3.687374749498998, "step": 2760}, {"loss": 1.4976, "grad_norm": 0.6574140787124634, "learning_rate": 0.0002, "epoch": 3.7007348029392118, "step": 2770}, {"loss": 1.5306, "grad_norm": 0.5643761157989502, "learning_rate": 0.0002, "epoch": 3.7140948563794254, "step": 2780}, {"loss": 1.5751, "grad_norm": 0.5652621388435364, "learning_rate": 0.0002, "epoch": 3.727454909819639, "step": 2790}, {"loss": 1.5262, "grad_norm": 0.5604206323623657, "learning_rate": 0.0002, "epoch": 3.740814963259853, "step": 2800}, {"loss": 1.5013, "grad_norm": 3.911022663116455, "learning_rate": 0.0002, "epoch": 3.7541750167000667, "step": 2810}, {"loss": 1.5793, "grad_norm": 0.6148333549499512, "learning_rate": 0.0002, "epoch": 3.7675350701402808, "step": 2820}, {"loss": 1.5122, "grad_norm": 0.5605677962303162, "learning_rate": 0.0002, "epoch": 3.7808951235804944, "step": 2830}, {"loss": 1.5659, "grad_norm": 0.6101965308189392, "learning_rate": 0.0002, "epoch": 3.794255177020708, "step": 2840}, {"loss": 1.5618, "grad_norm": 0.5387342572212219, "learning_rate": 0.0002, "epoch": 3.8076152304609217, "step": 2850}, {"loss": 1.5193, "grad_norm": 0.5733087062835693, "learning_rate": 0.0002, "epoch": 3.8209752839011357, "step": 2860}, {"loss": 1.5545, "grad_norm": 0.6538485884666443, "learning_rate": 0.0002, "epoch": 3.8343353373413493, "step": 2870}, {"loss": 1.523, "grad_norm": 0.6247632503509521, "learning_rate": 0.0002, "epoch": 3.847695390781563, "step": 2880}, {"loss": 1.5591, "grad_norm": 0.5745735764503479, "learning_rate": 0.0002, "epoch": 3.861055444221777, "step": 2890}, {"loss": 1.5706, "grad_norm": 0.5942763686180115, "learning_rate": 0.0002, "epoch": 3.8744154976619907, "step": 2900}, {"loss": 1.564, "grad_norm": 0.7086281776428223, "learning_rate": 0.0002, "epoch": 3.8877755511022043, "step": 2910}, {"loss": 1.5526, "grad_norm": 0.8825129866600037, "learning_rate": 0.0002, "epoch": 3.901135604542418, "step": 2920}, {"loss": 1.4519, "grad_norm": 0.6260842680931091, "learning_rate": 0.0002, "epoch": 3.914495657982632, "step": 2930}, {"loss": 1.5433, "grad_norm": 0.6015968322753906, "learning_rate": 0.0002, "epoch": 3.9278557114228456, "step": 2940}, {"loss": 1.4931, "grad_norm": 0.7042809128761292, "learning_rate": 0.0002, "epoch": 3.9412157648630597, "step": 2950}, {"loss": 1.5596, "grad_norm": 0.5860083699226379, "learning_rate": 0.0002, "epoch": 3.9545758183032733, "step": 2960}, {"loss": 1.565, "grad_norm": 0.5939757823944092, "learning_rate": 0.0002, "epoch": 3.967935871743487, "step": 2970}, {"loss": 1.408, "grad_norm": 0.5523964166641235, "learning_rate": 0.0002, "epoch": 3.9812959251837006, "step": 2980}, {"loss": 1.5629, "grad_norm": 0.6380264759063721, "learning_rate": 0.0002, "epoch": 3.9946559786239146, "step": 2990}, {"eval_loss": 1.8875294923782349, "eval_runtime": 38.5837, "eval_samples_per_second": 13.348, "eval_steps_per_second": 1.685, "epoch": 4.0, "step": 2994}, {"loss": 1.4002, "grad_norm": 0.5478564500808716, "learning_rate": 0.0002, "epoch": 4.008016032064128, "step": 3000}, {"loss": 1.436, "grad_norm": 0.9384379982948303, "learning_rate": 0.0002, "epoch": 4.021376085504342, "step": 3010}, {"loss": 1.4127, "grad_norm": 0.7819344401359558, "learning_rate": 0.0002, "epoch": 4.034736138944556, "step": 3020}, {"loss": 1.326, "grad_norm": 0.7737417817115784, "learning_rate": 0.0002, "epoch": 4.04809619238477, "step": 3030}, {"loss": 1.3203, "grad_norm": 0.8893805742263794, "learning_rate": 0.0002, "epoch": 4.061456245824983, "step": 3040}, {"loss": 1.3913, "grad_norm": 0.7759843468666077, "learning_rate": 0.0002, "epoch": 4.074816299265197, "step": 3050}, {"loss": 1.2941, "grad_norm": 0.642654538154602, "learning_rate": 0.0002, "epoch": 4.0881763527054105, "step": 3060}, {"loss": 1.3204, "grad_norm": 0.8515549302101135, "learning_rate": 0.0002, "epoch": 4.101536406145625, "step": 3070}, {"loss": 1.3683, "grad_norm": 0.7033658623695374, "learning_rate": 0.0002, "epoch": 4.114896459585839, "step": 3080}, {"loss": 1.4159, "grad_norm": 0.7063882946968079, "learning_rate": 0.0002, "epoch": 4.128256513026052, "step": 3090}, {"loss": 1.384, "grad_norm": 0.6946853995323181, "learning_rate": 0.0002, "epoch": 4.141616566466266, "step": 3100}, {"loss": 1.3689, "grad_norm": 0.7286741137504578, "learning_rate": 0.0002, "epoch": 4.1549766199064795, "step": 3110}, {"loss": 1.3061, "grad_norm": 0.7894193530082703, "learning_rate": 0.0002, "epoch": 4.168336673346693, "step": 3120}, {"loss": 1.3346, "grad_norm": 0.7005895376205444, "learning_rate": 0.0002, "epoch": 4.181696726786907, "step": 3130}, {"loss": 1.3834, "grad_norm": 0.799567461013794, "learning_rate": 0.0002, "epoch": 4.195056780227121, "step": 3140}, {"loss": 1.3813, "grad_norm": 0.7010157108306885, "learning_rate": 0.0002, "epoch": 4.208416833667335, "step": 3150}, {"loss": 1.3637, "grad_norm": 0.7489650249481201, "learning_rate": 0.0002, "epoch": 4.2217768871075485, "step": 3160}, {"loss": 1.3546, "grad_norm": 0.7908048629760742, "learning_rate": 0.0002, "epoch": 4.235136940547762, "step": 3170}, {"loss": 1.3073, "grad_norm": 0.7002180814743042, "learning_rate": 0.0002, "epoch": 4.248496993987976, "step": 3180}, {"loss": 1.4525, "grad_norm": 0.8339495062828064, "learning_rate": 0.0002, "epoch": 4.261857047428189, "step": 3190}, {"loss": 1.3471, "grad_norm": 0.7884618043899536, "learning_rate": 0.0002, "epoch": 4.275217100868403, "step": 3200}, {"loss": 1.4261, "grad_norm": 0.7964122295379639, "learning_rate": 0.0002, "epoch": 4.2885771543086175, "step": 3210}, {"loss": 1.3506, "grad_norm": 0.838646650314331, "learning_rate": 0.0002, "epoch": 4.301937207748831, "step": 3220}, {"loss": 1.3738, "grad_norm": 0.8063107132911682, "learning_rate": 0.0002, "epoch": 4.315297261189045, "step": 3230}, {"loss": 1.3769, "grad_norm": 0.8147385120391846, "learning_rate": 0.0002, "epoch": 4.328657314629258, "step": 3240}, {"loss": 1.4118, "grad_norm": 0.7636798620223999, "learning_rate": 0.0002, "epoch": 4.342017368069472, "step": 3250}, {"loss": 1.3698, "grad_norm": 0.7530609965324402, "learning_rate": 0.0002, "epoch": 4.355377421509686, "step": 3260}, {"loss": 1.3507, "grad_norm": 0.8853573799133301, "learning_rate": 0.0002, "epoch": 4.3687374749499, "step": 3270}, {"loss": 1.3614, "grad_norm": 0.7180975675582886, "learning_rate": 0.0002, "epoch": 4.382097528390114, "step": 3280}, {"loss": 1.4119, "grad_norm": 0.837150514125824, "learning_rate": 0.0002, "epoch": 4.395457581830327, "step": 3290}, {"loss": 1.461, "grad_norm": 0.8370638489723206, "learning_rate": 0.0002, "epoch": 4.408817635270541, "step": 3300}, {"loss": 1.4478, "grad_norm": 0.7738229036331177, "learning_rate": 0.0002, "epoch": 4.422177688710755, "step": 3310}, {"loss": 1.4195, "grad_norm": 0.7665290832519531, "learning_rate": 0.0002, "epoch": 4.435537742150968, "step": 3320}, {"loss": 1.3308, "grad_norm": 0.7547745704650879, "learning_rate": 0.0002, "epoch": 4.448897795591183, "step": 3330}, {"loss": 1.4165, "grad_norm": 0.7421861290931702, "learning_rate": 0.0002, "epoch": 4.462257849031396, "step": 3340}, {"loss": 1.4244, "grad_norm": 0.8042104244232178, "learning_rate": 0.0002, "epoch": 4.47561790247161, "step": 3350}, {"loss": 1.365, "grad_norm": 0.8111839890480042, "learning_rate": 0.0002, "epoch": 4.488977955911824, "step": 3360}, {"loss": 1.3537, "grad_norm": 0.7998340129852295, "learning_rate": 0.0002, "epoch": 4.502338009352037, "step": 3370}, {"loss": 1.3812, "grad_norm": 0.7668877243995667, "learning_rate": 0.0002, "epoch": 4.515698062792251, "step": 3380}, {"loss": 1.3972, "grad_norm": 0.7986718416213989, "learning_rate": 0.0002, "epoch": 4.529058116232465, "step": 3390}, {"loss": 1.3582, "grad_norm": 0.6806602478027344, "learning_rate": 0.0002, "epoch": 4.542418169672679, "step": 3400}, {"loss": 1.3942, "grad_norm": 0.8788819909095764, "learning_rate": 0.0002, "epoch": 4.555778223112893, "step": 3410}, {"loss": 1.3379, "grad_norm": 0.7499664425849915, "learning_rate": 0.0002, "epoch": 4.569138276553106, "step": 3420}, {"loss": 1.3823, "grad_norm": 0.7967109084129333, "learning_rate": 0.0002, "epoch": 4.58249832999332, "step": 3430}, {"loss": 1.3531, "grad_norm": 0.759639322757721, "learning_rate": 0.0002, "epoch": 4.595858383433534, "step": 3440}, {"loss": 1.3517, "grad_norm": 0.8327916264533997, "learning_rate": 0.0002, "epoch": 4.609218436873747, "step": 3450}, {"loss": 1.4619, "grad_norm": 0.7400892376899719, "learning_rate": 0.0002, "epoch": 4.622578490313961, "step": 3460}, {"loss": 1.3374, "grad_norm": 0.8116602301597595, "learning_rate": 0.0002, "epoch": 4.635938543754175, "step": 3470}, {"loss": 1.4445, "grad_norm": 0.7604362368583679, "learning_rate": 0.0002, "epoch": 4.649298597194389, "step": 3480}, {"loss": 1.3724, "grad_norm": 0.7397996783256531, "learning_rate": 0.0002, "epoch": 4.662658650634603, "step": 3490}, {"loss": 1.4048, "grad_norm": 0.869293749332428, "learning_rate": 0.0002, "epoch": 4.676018704074816, "step": 3500}, {"loss": 1.3873, "grad_norm": 0.6854358315467834, "learning_rate": 0.0002, "epoch": 4.68937875751503, "step": 3510}, {"loss": 1.3413, "grad_norm": 0.8326661586761475, "learning_rate": 0.0002, "epoch": 4.7027388109552435, "step": 3520}, {"loss": 1.3666, "grad_norm": 0.6887506246566772, "learning_rate": 0.0002, "epoch": 4.716098864395457, "step": 3530}, {"loss": 1.4508, "grad_norm": 3.837689161300659, "learning_rate": 0.0002, "epoch": 4.729458917835672, "step": 3540}, {"loss": 1.3775, "grad_norm": 0.6874563694000244, "learning_rate": 0.0002, "epoch": 4.742818971275885, "step": 3550}, {"loss": 1.3643, "grad_norm": 0.8340407609939575, "learning_rate": 0.0002, "epoch": 4.756179024716099, "step": 3560}, {"loss": 1.3556, "grad_norm": 0.7286418676376343, "learning_rate": 0.0002, "epoch": 4.7695390781563125, "step": 3570}, {"loss": 1.4338, "grad_norm": 0.7239373326301575, "learning_rate": 0.0002, "epoch": 4.782899131596526, "step": 3580}, {"loss": 1.4697, "grad_norm": 0.831310510635376, "learning_rate": 0.0002, "epoch": 4.796259185036741, "step": 3590}, {"loss": 1.4146, "grad_norm": 0.767715573310852, "learning_rate": 0.0002, "epoch": 4.809619238476954, "step": 3600}, {"loss": 1.4199, "grad_norm": 0.9013199210166931, "learning_rate": 0.0002, "epoch": 4.822979291917168, "step": 3610}, {"loss": 1.4513, "grad_norm": 0.7543512582778931, "learning_rate": 0.0002, "epoch": 4.8363393453573815, "step": 3620}, {"loss": 1.4218, "grad_norm": 0.7626057267189026, "learning_rate": 0.0002, "epoch": 4.849699398797595, "step": 3630}, {"loss": 1.4102, "grad_norm": 0.847079336643219, "learning_rate": 0.0002, "epoch": 4.863059452237809, "step": 3640}, {"loss": 1.5014, "grad_norm": 0.8273295760154724, "learning_rate": 0.0002, "epoch": 4.876419505678022, "step": 3650}, {"loss": 1.3806, "grad_norm": 0.7675244808197021, "learning_rate": 0.0002, "epoch": 4.889779559118237, "step": 3660}, {"loss": 1.4894, "grad_norm": 0.9560356736183167, "learning_rate": 0.0002, "epoch": 4.9031396125584505, "step": 3670}, {"loss": 1.4044, "grad_norm": 0.7682451605796814, "learning_rate": 0.0002, "epoch": 4.916499665998664, "step": 3680}, {"loss": 1.342, "grad_norm": 0.8113830089569092, "learning_rate": 0.0002, "epoch": 4.929859719438878, "step": 3690}, {"loss": 1.3559, "grad_norm": 0.7642542719841003, "learning_rate": 0.0002, "epoch": 4.943219772879091, "step": 3700}, {"loss": 1.403, "grad_norm": 0.823863685131073, "learning_rate": 0.0002, "epoch": 4.956579826319305, "step": 3710}, {"loss": 1.464, "grad_norm": 0.8287797570228577, "learning_rate": 0.0002, "epoch": 4.969939879759519, "step": 3720}, {"loss": 1.4139, "grad_norm": 0.778170108795166, "learning_rate": 0.0002, "epoch": 4.983299933199733, "step": 3730}, {"loss": 1.4218, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 4.996659986639947, "step": 3740}, {"eval_loss": 1.9638569355010986, "eval_runtime": 38.5725, "eval_samples_per_second": 13.351, "eval_steps_per_second": 1.685, "epoch": 4.999331997327989, "step": 3742}, {"loss": 1.225, "grad_norm": 0.8864085078239441, "learning_rate": 0.0002, "epoch": 5.01002004008016, "step": 3750}, {"loss": 1.2056, "grad_norm": 0.9191637635231018, "learning_rate": 0.0002, "epoch": 5.023380093520374, "step": 3760}, {"loss": 1.2453, "grad_norm": 0.749519407749176, "learning_rate": 0.0002, "epoch": 5.036740146960588, "step": 3770}, {"loss": 1.1959, "grad_norm": 0.7916892170906067, "learning_rate": 0.0002, "epoch": 5.050100200400801, "step": 3780}, {"loss": 1.2279, "grad_norm": 1.0318909883499146, "learning_rate": 0.0002, "epoch": 5.063460253841015, "step": 3790}, {"loss": 1.2585, "grad_norm": 1.028586745262146, "learning_rate": 0.0002, "epoch": 5.0768203072812295, "step": 3800}, {"loss": 1.1769, "grad_norm": 1.0568538904190063, "learning_rate": 0.0002, "epoch": 5.090180360721443, "step": 3810}, {"loss": 1.263, "grad_norm": 0.9780595302581787, "learning_rate": 0.0002, "epoch": 5.103540414161657, "step": 3820}, {"loss": 1.2019, "grad_norm": 1.10311758518219, "learning_rate": 0.0002, "epoch": 5.11690046760187, "step": 3830}, {"loss": 1.2133, "grad_norm": 0.9497154355049133, "learning_rate": 0.0002, "epoch": 5.130260521042084, "step": 3840}, {"loss": 1.1718, "grad_norm": 0.948279857635498, "learning_rate": 0.0002, "epoch": 5.143620574482298, "step": 3850}, {"loss": 1.2108, "grad_norm": 0.9497880339622498, "learning_rate": 0.0002, "epoch": 5.156980627922512, "step": 3860}, {"loss": 1.1876, "grad_norm": 1.3213258981704712, "learning_rate": 0.0002, "epoch": 5.170340681362726, "step": 3870}, {"loss": 1.2327, "grad_norm": 0.9835752248764038, "learning_rate": 0.0002, "epoch": 5.183700734802939, "step": 3880}, {"loss": 1.2256, "grad_norm": 0.8426132202148438, "learning_rate": 0.0002, "epoch": 5.197060788243153, "step": 3890}, {"loss": 1.2066, "grad_norm": 1.0343470573425293, "learning_rate": 0.0002, "epoch": 5.210420841683367, "step": 3900}, {"loss": 1.2596, "grad_norm": 1.0771924257278442, "learning_rate": 0.0002, "epoch": 5.22378089512358, "step": 3910}, {"loss": 1.2371, "grad_norm": 0.8542634844779968, "learning_rate": 0.0002, "epoch": 5.237140948563794, "step": 3920}, {"loss": 1.2264, "grad_norm": 1.1021966934204102, "learning_rate": 0.0002, "epoch": 5.250501002004008, "step": 3930}, {"loss": 1.2097, "grad_norm": 1.170011281967163, "learning_rate": 0.0002, "epoch": 5.263861055444222, "step": 3940}, {"loss": 1.2101, "grad_norm": 0.9787653684616089, "learning_rate": 0.0002, "epoch": 5.277221108884436, "step": 3950}, {"loss": 1.24, "grad_norm": 0.914513885974884, "learning_rate": 0.0002, "epoch": 5.290581162324649, "step": 3960}, {"loss": 1.1641, "grad_norm": 1.0831562280654907, "learning_rate": 0.0002, "epoch": 5.303941215764863, "step": 3970}, {"loss": 1.2609, "grad_norm": 0.9810112714767456, "learning_rate": 0.0002, "epoch": 5.3173012692050765, "step": 3980}, {"loss": 1.1825, "grad_norm": 0.9624066948890686, "learning_rate": 0.0002, "epoch": 5.330661322645291, "step": 3990}, {"loss": 1.273, "grad_norm": 1.2296923398971558, "learning_rate": 0.0002, "epoch": 5.344021376085505, "step": 4000}, {"loss": 1.2452, "grad_norm": 1.011299967765808, "learning_rate": 0.0002, "epoch": 5.357381429525718, "step": 4010}, {"loss": 1.2539, "grad_norm": 0.9144132733345032, "learning_rate": 0.0002, "epoch": 5.370741482965932, "step": 4020}, {"loss": 1.2914, "grad_norm": 1.0573601722717285, "learning_rate": 0.0002, "epoch": 5.3841015364061455, "step": 4030}, {"loss": 1.2295, "grad_norm": 1.1667137145996094, "learning_rate": 0.0002, "epoch": 5.397461589846359, "step": 4040}, {"loss": 1.2541, "grad_norm": 1.072070598602295, "learning_rate": 0.0002, "epoch": 5.410821643286573, "step": 4050}, {"loss": 1.2448, "grad_norm": 1.1005792617797852, "learning_rate": 0.0002, "epoch": 5.424181696726787, "step": 4060}, {"loss": 1.2604, "grad_norm": 1.033581018447876, "learning_rate": 0.0002, "epoch": 5.437541750167001, "step": 4070}, {"loss": 1.2552, "grad_norm": 0.9537439942359924, "learning_rate": 0.0002, "epoch": 5.4509018036072145, "step": 4080}, {"loss": 1.2985, "grad_norm": 1.0502177476882935, "learning_rate": 0.0002, "epoch": 5.464261857047428, "step": 4090}, {"loss": 1.2424, "grad_norm": 0.9098296761512756, "learning_rate": 0.0002, "epoch": 5.477621910487642, "step": 4100}, {"loss": 1.2262, "grad_norm": 0.9551953077316284, "learning_rate": 0.0002, "epoch": 5.490981963927855, "step": 4110}, {"loss": 1.2848, "grad_norm": 0.9169427156448364, "learning_rate": 0.0002, "epoch": 5.504342017368069, "step": 4120}, {"loss": 1.2572, "grad_norm": 0.9430235624313354, "learning_rate": 0.0002, "epoch": 5.517702070808284, "step": 4130}, {"loss": 1.2618, "grad_norm": 0.817259669303894, "learning_rate": 0.0002, "epoch": 5.531062124248497, "step": 4140}, {"loss": 1.3012, "grad_norm": 1.124152660369873, "learning_rate": 0.0002, "epoch": 5.544422177688711, "step": 4150}, {"loss": 1.2508, "grad_norm": 0.9250756502151489, "learning_rate": 0.0002, "epoch": 5.557782231128924, "step": 4160}, {"loss": 1.2492, "grad_norm": 0.9582970142364502, "learning_rate": 0.0002, "epoch": 5.571142284569138, "step": 4170}, {"loss": 1.2804, "grad_norm": 1.0078704357147217, "learning_rate": 0.0002, "epoch": 5.584502338009352, "step": 4180}, {"loss": 1.1961, "grad_norm": 0.9585610032081604, "learning_rate": 0.0002, "epoch": 5.597862391449565, "step": 4190}, {"loss": 1.2522, "grad_norm": 1.0150971412658691, "learning_rate": 0.0002, "epoch": 5.61122244488978, "step": 4200}, {"loss": 1.2275, "grad_norm": 0.9943351149559021, "learning_rate": 0.0002, "epoch": 5.6245824983299935, "step": 4210}, {"loss": 1.2928, "grad_norm": 0.8880936503410339, "learning_rate": 0.0002, "epoch": 5.637942551770207, "step": 4220}, {"loss": 1.2323, "grad_norm": 0.9873887896537781, "learning_rate": 0.0002, "epoch": 5.651302605210421, "step": 4230}, {"loss": 1.3391, "grad_norm": 0.9185152649879456, "learning_rate": 0.0002, "epoch": 5.664662658650634, "step": 4240}, {"loss": 1.2511, "grad_norm": 1.0706779956817627, "learning_rate": 0.0002, "epoch": 5.678022712090849, "step": 4250}, {"loss": 1.2737, "grad_norm": 0.9660224914550781, "learning_rate": 0.0002, "epoch": 5.6913827655310625, "step": 4260}, {"loss": 1.2815, "grad_norm": 0.8685019612312317, "learning_rate": 0.0002, "epoch": 5.704742818971276, "step": 4270}, {"loss": 1.1559, "grad_norm": 1.0390565395355225, "learning_rate": 0.0002, "epoch": 5.71810287241149, "step": 4280}, {"loss": 1.3134, "grad_norm": 0.9290478825569153, "learning_rate": 0.0002, "epoch": 5.731462925851703, "step": 4290}, {"loss": 1.2426, "grad_norm": 1.0361281633377075, "learning_rate": 0.0002, "epoch": 5.744822979291917, "step": 4300}, {"loss": 1.2688, "grad_norm": 0.8804615139961243, "learning_rate": 0.0002, "epoch": 5.758183032732131, "step": 4310}, {"loss": 1.2479, "grad_norm": 1.0051425695419312, "learning_rate": 0.0002, "epoch": 5.771543086172345, "step": 4320}, {"loss": 1.1946, "grad_norm": 1.0051119327545166, "learning_rate": 0.0002, "epoch": 5.784903139612559, "step": 4330}, {"loss": 1.2571, "grad_norm": 0.9961661100387573, "learning_rate": 0.0002, "epoch": 5.798263193052772, "step": 4340}, {"loss": 1.2179, "grad_norm": 1.0229419469833374, "learning_rate": 0.0002, "epoch": 5.811623246492986, "step": 4350}, {"loss": 1.2984, "grad_norm": 1.1129552125930786, "learning_rate": 0.0002, "epoch": 5.8249832999332, "step": 4360}, {"loss": 1.2692, "grad_norm": 1.18964421749115, "learning_rate": 0.0002, "epoch": 5.838343353373413, "step": 4370}, {"loss": 1.1996, "grad_norm": 0.9490230083465576, "learning_rate": 0.0002, "epoch": 5.851703406813627, "step": 4380}, {"loss": 1.3177, "grad_norm": 0.8734540343284607, "learning_rate": 0.0002, "epoch": 5.865063460253841, "step": 4390}, {"loss": 1.3131, "grad_norm": 1.0017802715301514, "learning_rate": 0.0002, "epoch": 5.878423513694055, "step": 4400}, {"loss": 1.2649, "grad_norm": 0.953556478023529, "learning_rate": 0.0002, "epoch": 5.891783567134269, "step": 4410}, {"loss": 1.2684, "grad_norm": 0.8915258646011353, "learning_rate": 0.0002, "epoch": 5.905143620574482, "step": 4420}, {"loss": 1.2843, "grad_norm": 0.9715141654014587, "learning_rate": 0.0002, "epoch": 5.918503674014696, "step": 4430}, {"loss": 1.2769, "grad_norm": 0.9432152509689331, "learning_rate": 0.0002, "epoch": 5.9318637274549095, "step": 4440}, {"loss": 1.233, "grad_norm": 0.9473979473114014, "learning_rate": 0.0002, "epoch": 5.945223780895123, "step": 4450}, {"loss": 1.3209, "grad_norm": 1.104871392250061, "learning_rate": 0.0002, "epoch": 5.958583834335338, "step": 4460}, {"loss": 1.3427, "grad_norm": 1.0308905839920044, "learning_rate": 0.0002, "epoch": 5.971943887775551, "step": 4470}, {"loss": 1.1808, "grad_norm": 0.8895487189292908, "learning_rate": 0.0002, "epoch": 5.985303941215765, "step": 4480}, {"loss": 1.2634, "grad_norm": 1.0148485898971558, "learning_rate": 0.0002, "epoch": 5.9986639946559785, "step": 4490}, {"eval_loss": 2.0830726623535156, "eval_runtime": 38.5442, "eval_samples_per_second": 13.361, "eval_steps_per_second": 1.686, "epoch": 6.0, "step": 4491}, {"loss": 1.1106, "grad_norm": 1.1640599966049194, "learning_rate": 0.0002, "epoch": 6.012024048096192, "step": 4500}, {"loss": 1.0436, "grad_norm": 1.213204264640808, "learning_rate": 0.0002, "epoch": 6.025384101536406, "step": 4510}, {"loss": 1.0606, "grad_norm": 1.1694388389587402, "learning_rate": 0.0002, "epoch": 6.03874415497662, "step": 4520}, {"loss": 1.0274, "grad_norm": 1.1044062376022339, "learning_rate": 0.0002, "epoch": 6.052104208416834, "step": 4530}, {"loss": 1.0552, "grad_norm": 1.0701100826263428, "learning_rate": 0.0002, "epoch": 6.065464261857048, "step": 4540}, {"loss": 1.0018, "grad_norm": 1.360065221786499, "learning_rate": 0.0002, "epoch": 6.078824315297261, "step": 4550}, {"loss": 1.0189, "grad_norm": 1.0648503303527832, "learning_rate": 0.0002, "epoch": 6.092184368737475, "step": 4560}, {"loss": 1.008, "grad_norm": 1.066245198249817, "learning_rate": 0.0002, "epoch": 6.1055444221776884, "step": 4570}, {"loss": 1.099, "grad_norm": 1.1483700275421143, "learning_rate": 0.0002, "epoch": 6.118904475617902, "step": 4580}, {"loss": 1.1043, "grad_norm": 1.334275722503662, "learning_rate": 0.0002, "epoch": 6.132264529058117, "step": 4590}, {"loss": 1.0783, "grad_norm": 1.2141029834747314, "learning_rate": 0.0002, "epoch": 6.14562458249833, "step": 4600}, {"loss": 1.0891, "grad_norm": 1.2284387350082397, "learning_rate": 0.0002, "epoch": 6.158984635938544, "step": 4610}, {"loss": 1.122, "grad_norm": 1.2326734066009521, "learning_rate": 0.0002, "epoch": 6.1723446893787575, "step": 4620}, {"loss": 1.1069, "grad_norm": 1.245004653930664, "learning_rate": 0.0002, "epoch": 6.185704742818971, "step": 4630}, {"loss": 1.0821, "grad_norm": 0.9685266017913818, "learning_rate": 0.0002, "epoch": 6.199064796259185, "step": 4640}, {"loss": 1.0659, "grad_norm": 1.141634464263916, "learning_rate": 0.0002, "epoch": 6.212424849699399, "step": 4650}, {"loss": 1.0971, "grad_norm": 1.4279003143310547, "learning_rate": 0.0002, "epoch": 6.225784903139613, "step": 4660}, {"loss": 1.093, "grad_norm": 1.186668872833252, "learning_rate": 0.0002, "epoch": 6.2391449565798265, "step": 4670}, {"loss": 1.0522, "grad_norm": 1.2656606435775757, "learning_rate": 0.0002, "epoch": 6.25250501002004, "step": 4680}, {"loss": 1.1138, "grad_norm": 1.1122987270355225, "learning_rate": 0.0002, "epoch": 6.265865063460254, "step": 4690}, {"loss": 1.0906, "grad_norm": 1.190050482749939, "learning_rate": 0.0002, "epoch": 6.279225116900467, "step": 4700}, {"loss": 1.1095, "grad_norm": 1.3683340549468994, "learning_rate": 0.0002, "epoch": 6.292585170340681, "step": 4710}, {"loss": 1.0663, "grad_norm": 1.1787203550338745, "learning_rate": 0.0002, "epoch": 6.3059452237808955, "step": 4720}, {"loss": 1.0856, "grad_norm": 1.3502576351165771, "learning_rate": 0.0002, "epoch": 6.319305277221109, "step": 4730}, {"loss": 1.0999, "grad_norm": 1.1958597898483276, "learning_rate": 0.0002, "epoch": 6.332665330661323, "step": 4740}, {"loss": 1.021, "grad_norm": 1.0918327569961548, "learning_rate": 0.0002, "epoch": 6.346025384101536, "step": 4750}, {"loss": 1.0484, "grad_norm": 1.2624558210372925, "learning_rate": 0.0002, "epoch": 6.35938543754175, "step": 4760}, {"loss": 1.0785, "grad_norm": 1.1390577554702759, "learning_rate": 0.0002, "epoch": 6.372745490981964, "step": 4770}, {"loss": 1.0979, "grad_norm": 1.041666865348816, "learning_rate": 0.0002, "epoch": 6.386105544422177, "step": 4780}, {"loss": 1.1026, "grad_norm": 1.4209141731262207, "learning_rate": 0.0002, "epoch": 6.399465597862392, "step": 4790}, {"loss": 1.119, "grad_norm": 1.1001079082489014, "learning_rate": 0.0002, "epoch": 6.412825651302605, "step": 4800}, {"loss": 1.1082, "grad_norm": 1.3324936628341675, "learning_rate": 0.0002, "epoch": 6.426185704742819, "step": 4810}, {"loss": 1.0785, "grad_norm": 1.1270194053649902, "learning_rate": 0.0002, "epoch": 6.439545758183033, "step": 4820}, {"loss": 1.1338, "grad_norm": 1.1961387395858765, "learning_rate": 0.0002, "epoch": 6.452905811623246, "step": 4830}, {"loss": 1.0967, "grad_norm": 1.255366563796997, "learning_rate": 0.0002, "epoch": 6.46626586506346, "step": 4840}, {"loss": 1.1226, "grad_norm": 1.343855381011963, "learning_rate": 0.0002, "epoch": 6.479625918503674, "step": 4850}, {"loss": 1.1118, "grad_norm": 1.3216257095336914, "learning_rate": 0.0002, "epoch": 6.492985971943888, "step": 4860}, {"loss": 1.1664, "grad_norm": 1.5244755744934082, "learning_rate": 0.0002, "epoch": 6.506346025384102, "step": 4870}, {"loss": 1.0403, "grad_norm": 1.1585701704025269, "learning_rate": 0.0002, "epoch": 6.519706078824315, "step": 4880}, {"loss": 1.1344, "grad_norm": 1.0301100015640259, "learning_rate": 0.0002, "epoch": 6.533066132264529, "step": 4890}, {"loss": 1.1304, "grad_norm": 1.5772714614868164, "learning_rate": 0.0002, "epoch": 6.5464261857047426, "step": 4900}, {"loss": 1.0953, "grad_norm": 1.2015259265899658, "learning_rate": 0.0002, "epoch": 6.559786239144957, "step": 4910}, {"loss": 1.1283, "grad_norm": 1.4365423917770386, "learning_rate": 0.0002, "epoch": 6.573146292585171, "step": 4920}, {"loss": 1.0717, "grad_norm": 1.2534470558166504, "learning_rate": 0.0002, "epoch": 6.586506346025384, "step": 4930}, {"loss": 1.1099, "grad_norm": 1.216138482093811, "learning_rate": 0.0002, "epoch": 6.599866399465598, "step": 4940}, {"loss": 1.1744, "grad_norm": 1.144316554069519, "learning_rate": 0.0002, "epoch": 6.613226452905812, "step": 4950}, {"loss": 1.0548, "grad_norm": 1.1127740144729614, "learning_rate": 0.0002, "epoch": 6.626586506346025, "step": 4960}, {"loss": 1.0686, "grad_norm": 1.1925606727600098, "learning_rate": 0.0002, "epoch": 6.639946559786239, "step": 4970}, {"loss": 1.0684, "grad_norm": 1.2500451803207397, "learning_rate": 0.0002, "epoch": 6.653306613226453, "step": 4980}, {"loss": 1.1642, "grad_norm": 1.16154944896698, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 4990}, {"loss": 1.1003, "grad_norm": 1.1921433210372925, "learning_rate": 0.0002, "epoch": 6.680026720106881, "step": 5000}, {"loss": 1.1278, "grad_norm": 1.1561170816421509, "learning_rate": 0.0002, "epoch": 6.693386773547094, "step": 5010}, {"loss": 1.0766, "grad_norm": 1.2988990545272827, "learning_rate": 0.0002, "epoch": 6.706746826987308, "step": 5020}, {"loss": 1.1131, "grad_norm": 0.9620341062545776, "learning_rate": 0.0002, "epoch": 6.7201068804275215, "step": 5030}, {"loss": 1.109, "grad_norm": 1.084228515625, "learning_rate": 0.0002, "epoch": 6.733466933867735, "step": 5040}, {"loss": 1.1474, "grad_norm": 1.1119431257247925, "learning_rate": 0.0002, "epoch": 6.74682698730795, "step": 5050}, {"loss": 1.179, "grad_norm": 1.1365628242492676, "learning_rate": 0.0002, "epoch": 6.760187040748163, "step": 5060}, {"loss": 1.0988, "grad_norm": 1.0989075899124146, "learning_rate": 0.0002, "epoch": 6.773547094188377, "step": 5070}, {"loss": 1.127, "grad_norm": 1.040647268295288, "learning_rate": 0.0002, "epoch": 6.7869071476285905, "step": 5080}, {"loss": 1.0793, "grad_norm": 1.1083087921142578, "learning_rate": 0.0002, "epoch": 6.800267201068804, "step": 5090}, {"loss": 1.1081, "grad_norm": 1.3434782028198242, "learning_rate": 0.0002, "epoch": 6.813627254509018, "step": 5100}, {"loss": 1.1243, "grad_norm": 1.2493442296981812, "learning_rate": 0.0002, "epoch": 6.826987307949231, "step": 5110}, {"loss": 1.0633, "grad_norm": 1.0672307014465332, "learning_rate": 0.0002, "epoch": 6.840347361389446, "step": 5120}, {"loss": 1.1344, "grad_norm": 1.068350911140442, "learning_rate": 0.0002, "epoch": 6.8537074148296595, "step": 5130}, {"loss": 1.1942, "grad_norm": 1.2880923748016357, "learning_rate": 0.0002, "epoch": 6.867067468269873, "step": 5140}, {"loss": 1.1445, "grad_norm": 1.0895041227340698, "learning_rate": 0.0002, "epoch": 6.880427521710087, "step": 5150}, {"loss": 1.1535, "grad_norm": 1.2383300065994263, "learning_rate": 0.0002, "epoch": 6.8937875751503, "step": 5160}, {"loss": 1.1653, "grad_norm": 1.5274227857589722, "learning_rate": 0.0002, "epoch": 6.907147628590514, "step": 5170}, {"loss": 1.1112, "grad_norm": 1.1453371047973633, "learning_rate": 0.0002, "epoch": 6.920507682030728, "step": 5180}, {"loss": 1.1808, "grad_norm": 1.171336054801941, "learning_rate": 0.0002, "epoch": 6.933867735470942, "step": 5190}, {"loss": 1.1142, "grad_norm": 1.1946955919265747, "learning_rate": 0.0002, "epoch": 6.947227788911156, "step": 5200}, {"loss": 1.1386, "grad_norm": 1.2290117740631104, "learning_rate": 0.0002, "epoch": 6.960587842351369, "step": 5210}, {"loss": 1.1573, "grad_norm": 1.3134533166885376, "learning_rate": 0.0002, "epoch": 6.973947895791583, "step": 5220}, {"loss": 1.1687, "grad_norm": 1.1500377655029297, "learning_rate": 0.0002, "epoch": 6.987307949231797, "step": 5230}, {"eval_loss": 2.2211341857910156, "eval_runtime": 38.5729, "eval_samples_per_second": 13.351, "eval_steps_per_second": 1.685, "epoch": 6.999331997327989, "step": 5239}, {"loss": 1.1235, "grad_norm": 1.1143344640731812, "learning_rate": 0.0002, "epoch": 7.00066800267201, "step": 5240}, {"loss": 0.9203, "grad_norm": 1.5164896249771118, "learning_rate": 0.0002, "epoch": 7.014028056112225, "step": 5250}, {"loss": 0.9605, "grad_norm": 1.3737165927886963, "learning_rate": 0.0002, "epoch": 7.027388109552438, "step": 5260}, {"loss": 0.8599, "grad_norm": 1.2159202098846436, "learning_rate": 0.0002, "epoch": 7.040748162992652, "step": 5270}, {"loss": 0.9469, "grad_norm": 1.4183212518692017, "learning_rate": 0.0002, "epoch": 7.054108216432866, "step": 5280}, {"loss": 0.9188, "grad_norm": 1.4752920866012573, "learning_rate": 0.0002, "epoch": 7.067468269873079, "step": 5290}, {"loss": 0.873, "grad_norm": 1.398065447807312, "learning_rate": 0.0002, "epoch": 7.080828323313293, "step": 5300}, {"loss": 0.9434, "grad_norm": 1.4385913610458374, "learning_rate": 0.0002, "epoch": 7.094188376753507, "step": 5310}, {"loss": 0.8527, "grad_norm": 1.3779526948928833, "learning_rate": 0.0002, "epoch": 7.107548430193721, "step": 5320}, {"loss": 0.9139, "grad_norm": 1.5290347337722778, "learning_rate": 0.0002, "epoch": 7.120908483633935, "step": 5330}, {"loss": 0.8769, "grad_norm": 1.2389367818832397, "learning_rate": 0.0002, "epoch": 7.134268537074148, "step": 5340}, {"loss": 0.9396, "grad_norm": 1.4514659643173218, "learning_rate": 0.0002, "epoch": 7.147628590514362, "step": 5350}, {"loss": 0.9303, "grad_norm": 1.3247307538986206, "learning_rate": 0.0002, "epoch": 7.160988643954576, "step": 5360}, {"loss": 0.9218, "grad_norm": 1.1711286306381226, "learning_rate": 0.0002, "epoch": 7.174348697394789, "step": 5370}, {"loss": 0.944, "grad_norm": 1.4408347606658936, "learning_rate": 0.0002, "epoch": 7.187708750835004, "step": 5380}, {"loss": 0.9509, "grad_norm": 1.4405876398086548, "learning_rate": 0.0002, "epoch": 7.201068804275217, "step": 5390}, {"loss": 0.9428, "grad_norm": 1.233242154121399, "learning_rate": 0.0002, "epoch": 7.214428857715431, "step": 5400}, {"loss": 0.9401, "grad_norm": 1.734960675239563, "learning_rate": 0.0002, "epoch": 7.227788911155645, "step": 5410}, {"loss": 0.9232, "grad_norm": 1.5165163278579712, "learning_rate": 0.0002, "epoch": 7.241148964595858, "step": 5420}, {"loss": 0.949, "grad_norm": 1.4353035688400269, "learning_rate": 0.0002, "epoch": 7.254509018036072, "step": 5430}, {"loss": 0.9029, "grad_norm": 1.4540636539459229, "learning_rate": 0.0002, "epoch": 7.2678690714762855, "step": 5440}, {"loss": 0.9117, "grad_norm": 1.2676037549972534, "learning_rate": 0.0002, "epoch": 7.2812291249165, "step": 5450}, {"loss": 1.0148, "grad_norm": 1.2626118659973145, "learning_rate": 0.0002, "epoch": 7.294589178356714, "step": 5460}, {"loss": 0.9373, "grad_norm": 1.4866795539855957, "learning_rate": 0.0002, "epoch": 7.307949231796927, "step": 5470}, {"loss": 0.9474, "grad_norm": 1.2464289665222168, "learning_rate": 0.0002, "epoch": 7.321309285237141, "step": 5480}, {"loss": 0.896, "grad_norm": 1.2815988063812256, "learning_rate": 0.0002, "epoch": 7.3346693386773545, "step": 5490}, {"loss": 0.9733, "grad_norm": 1.282402753829956, "learning_rate": 0.0002, "epoch": 7.348029392117568, "step": 5500}, {"loss": 0.9427, "grad_norm": 1.5422425270080566, "learning_rate": 0.0002, "epoch": 7.361389445557783, "step": 5510}, {"loss": 1.0067, "grad_norm": 1.4137073755264282, "learning_rate": 0.0002, "epoch": 7.374749498997996, "step": 5520}, {"loss": 0.8669, "grad_norm": 1.4875508546829224, "learning_rate": 0.0002, "epoch": 7.38810955243821, "step": 5530}, {"loss": 0.9576, "grad_norm": 1.292340874671936, "learning_rate": 0.0002, "epoch": 7.4014696058784235, "step": 5540}, {"loss": 0.9031, "grad_norm": 1.5553388595581055, "learning_rate": 0.0002, "epoch": 7.414829659318637, "step": 5550}, {"loss": 1.005, "grad_norm": 1.394142746925354, "learning_rate": 0.0002, "epoch": 7.428189712758851, "step": 5560}, {"loss": 0.9493, "grad_norm": 1.3249385356903076, "learning_rate": 0.0002, "epoch": 7.441549766199064, "step": 5570}, {"loss": 0.9806, "grad_norm": 1.3204814195632935, "learning_rate": 0.0002, "epoch": 7.454909819639279, "step": 5580}, {"loss": 0.9182, "grad_norm": 1.4062745571136475, "learning_rate": 0.0002, "epoch": 7.4682698730794925, "step": 5590}, {"loss": 0.9429, "grad_norm": 1.2828562259674072, "learning_rate": 0.0002, "epoch": 7.481629926519706, "step": 5600}, {"loss": 0.9498, "grad_norm": 1.440412998199463, "learning_rate": 0.0002, "epoch": 7.49498997995992, "step": 5610}, {"loss": 1.0005, "grad_norm": 1.4771733283996582, "learning_rate": 0.0002, "epoch": 7.508350033400133, "step": 5620}, {"loss": 0.9472, "grad_norm": 1.329460620880127, "learning_rate": 0.0002, "epoch": 7.521710086840347, "step": 5630}, {"loss": 0.9635, "grad_norm": 1.2443828582763672, "learning_rate": 0.0002, "epoch": 7.5350701402805615, "step": 5640}, {"loss": 0.9785, "grad_norm": 1.3739941120147705, "learning_rate": 0.0002, "epoch": 7.548430193720775, "step": 5650}, {"loss": 0.9538, "grad_norm": 1.5168178081512451, "learning_rate": 0.0002, "epoch": 7.561790247160989, "step": 5660}, {"loss": 0.9381, "grad_norm": 1.3648325204849243, "learning_rate": 0.0002, "epoch": 7.575150300601202, "step": 5670}, {"loss": 0.9696, "grad_norm": 1.308164119720459, "learning_rate": 0.0002, "epoch": 7.588510354041416, "step": 5680}, {"loss": 0.9889, "grad_norm": 1.3583498001098633, "learning_rate": 0.0002, "epoch": 7.60187040748163, "step": 5690}, {"loss": 0.9566, "grad_norm": 1.4746732711791992, "learning_rate": 0.0002, "epoch": 7.615230460921843, "step": 5700}, {"loss": 0.9462, "grad_norm": 1.3042285442352295, "learning_rate": 0.0002, "epoch": 7.628590514362058, "step": 5710}, {"loss": 1.0146, "grad_norm": 1.5272791385650635, "learning_rate": 0.0002, "epoch": 7.641950567802271, "step": 5720}, {"loss": 0.9795, "grad_norm": 1.1505831480026245, "learning_rate": 0.0002, "epoch": 7.655310621242485, "step": 5730}, {"loss": 0.9758, "grad_norm": 1.3690030574798584, "learning_rate": 0.0002, "epoch": 7.668670674682699, "step": 5740}, {"loss": 0.9565, "grad_norm": 1.3092460632324219, "learning_rate": 0.0002, "epoch": 7.682030728122912, "step": 5750}, {"loss": 1.0147, "grad_norm": 1.7011737823486328, "learning_rate": 0.0002, "epoch": 7.695390781563126, "step": 5760}, {"loss": 1.0261, "grad_norm": 1.4010082483291626, "learning_rate": 0.0002, "epoch": 7.70875083500334, "step": 5770}, {"loss": 1.0288, "grad_norm": 1.63649582862854, "learning_rate": 0.0002, "epoch": 7.722110888443554, "step": 5780}, {"loss": 0.9496, "grad_norm": 1.5091519355773926, "learning_rate": 0.0002, "epoch": 7.735470941883768, "step": 5790}, {"loss": 0.9627, "grad_norm": 1.345441460609436, "learning_rate": 0.0002, "epoch": 7.748830995323981, "step": 5800}, {"loss": 0.9518, "grad_norm": 1.461037278175354, "learning_rate": 0.0002, "epoch": 7.762191048764195, "step": 5810}, {"loss": 0.9544, "grad_norm": 1.3914507627487183, "learning_rate": 0.0002, "epoch": 7.775551102204409, "step": 5820}, {"loss": 0.9946, "grad_norm": 1.293625831604004, "learning_rate": 0.0002, "epoch": 7.788911155644622, "step": 5830}, {"loss": 0.9732, "grad_norm": 1.5641531944274902, "learning_rate": 0.0002, "epoch": 7.802271209084836, "step": 5840}, {"loss": 0.9952, "grad_norm": 1.2400811910629272, "learning_rate": 0.0002, "epoch": 7.81563126252505, "step": 5850}, {"loss": 0.9841, "grad_norm": 1.335532546043396, "learning_rate": 0.0002, "epoch": 7.828991315965264, "step": 5860}, {"loss": 0.9609, "grad_norm": 1.4629961252212524, "learning_rate": 0.0002, "epoch": 7.842351369405478, "step": 5870}, {"loss": 1.0813, "grad_norm": 1.3005847930908203, "learning_rate": 0.0002, "epoch": 7.855711422845691, "step": 5880}, {"loss": 1.0466, "grad_norm": 1.6970791816711426, "learning_rate": 0.0002, "epoch": 7.869071476285905, "step": 5890}, {"loss": 0.9399, "grad_norm": 1.6887991428375244, "learning_rate": 0.0002, "epoch": 7.882431529726119, "step": 5900}, {"loss": 1.0364, "grad_norm": 1.4156445264816284, "learning_rate": 0.0002, "epoch": 7.895791583166333, "step": 5910}, {"loss": 0.9856, "grad_norm": 1.2695387601852417, "learning_rate": 0.0002, "epoch": 7.909151636606547, "step": 5920}, {"loss": 0.9902, "grad_norm": 1.4491169452667236, "learning_rate": 0.0002, "epoch": 7.92251169004676, "step": 5930}, {"loss": 1.047, "grad_norm": 1.4262619018554688, "learning_rate": 0.0002, "epoch": 7.935871743486974, "step": 5940}, {"loss": 1.0889, "grad_norm": 1.5128049850463867, "learning_rate": 0.0002, "epoch": 7.9492317969271875, "step": 5950}, {"loss": 0.9721, "grad_norm": 1.3630818128585815, "learning_rate": 0.0002, "epoch": 7.962591850367401, "step": 5960}, {"loss": 1.0154, "grad_norm": 1.410461187362671, "learning_rate": 0.0002, "epoch": 7.975951903807616, "step": 5970}, {"loss": 1.0192, "grad_norm": 1.4158549308776855, "learning_rate": 0.0002, "epoch": 7.989311957247829, "step": 5980}]}