diff --git a/.gitattributes b/.gitattributes index 700cf25fb8571363d7f9ac2f4af5e9de3a5deece..e83ad3820935b85cfcffa07caff9878724eb17ab 100644 --- a/.gitattributes +++ b/.gitattributes @@ -415,3 +415,12 @@ Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq- Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-1029-sd-42/checkpoint-731/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-1029-sd-42/checkpoint-832/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-1029-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..56e5920d80526c364d2343fccebf2d2c51f835ba --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29626cfdf5fb74581691c2f8a4959a8bfacf991c59dce48b2a56c151db65b924 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..56e5920d80526c364d2343fccebf2d2c51f835ba --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29626cfdf5fb74581691c2f8a4959a8bfacf991c59dce48b2a56c151db65b924 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb818173cf135c3e668e7516a6fe114b5b7471f1 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08fa544b6efd9931c1508e7dd19e79d63a011f1892a4a75470d8ad0d6f14126a +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..39bb72984a6ad4164dd3526fe83785c4a6a5494e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd56b345acd204cffc51ffbb4bb8808fb29c4b818fc35e6e1b2c7fc45e0218a0 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4c4fa34d66b60508f0ec3a56ffe95306a5a4281 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b30b8f3b26b2ef091d45bfcbfd165d1a71788a954adc2f9846020c65a033a2 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0da0541d5ff4bf87646b50eee5d002fea4c99a19 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/trainer_state.json @@ -0,0 +1,1183 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1622, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.331199540887552e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..82f2dc4b3b1ab8fb339e854a201871ef3e9e02f2 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98412178b503491fd12c8c313e08b1435a95e5943c2c6b15cbb06e57d9f25072 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c69255b3aad0a45315b4c32cb44a5f3b10ec2c2c --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c7487133ca2b3ee4617865295c95819332404e64aaf4584cd360c373857d38 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b026e1358e6c3f0e63f43563997a3675389511b0 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf62e9954e54ce4054ffe246197c1e5cf5ca07594c06a3d167d20fca185f2ac1 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1195b691dbde4fc073b7fe5c6b663876891add01 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd9f5f69932f2af95a428f4d1e97bce98fdafea6df5245c7b87d7819fd124b6 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..86e51f628a0582d9d73e4b71162b9d76a39b3a3a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/trainer_state.json @@ -0,0 +1,1758 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2433, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2496799311331328e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-2433/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3396826926c797985a101e13a42e897a206698bb --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc38cb62d14fbd06e9bdb04a1e84d0455b3987e1fa9b83f322fc2df44acc8b7 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..99a8264523d4d282cf0eb84f66c9bbde70b38f3f --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b55169fe31e37a2e2dde32ccf2a2fdcd1cc7b30c0a06fd77dd13ffc4fa4fb97 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cc69645c774895d9d3cf59924729f721ada04388 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7b5fa4db2ec9f1dd83c22e31ef62ef4da0a2145da06b57bfd033599abd353ad +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa72a11a725fca0e62be390038a08eb9c1db7907 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d807bbd0c95f2d5fe6d3a34fd392cb32d62db708c74f56525909e7e3cc107e +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..de29095b7c91f8b12544a22f6bc6e04d22574b06 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/trainer_state.json @@ -0,0 +1,2333 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + }, + { + "epoch": 3.008631319358816, + "grad_norm": 0.5562372803688049, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 2440 + }, + { + "epoch": 3.020961775585697, + "grad_norm": 1.133402705192566, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 2450 + }, + { + "epoch": 3.033292231812577, + "grad_norm": 0.6480470299720764, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 2460 + }, + { + "epoch": 3.0456226880394572, + "grad_norm": 0.8989138007164001, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 2470 + }, + { + "epoch": 3.057953144266338, + "grad_norm": 0.8257461786270142, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 3.070283600493218, + "grad_norm": 0.6813381910324097, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 2490 + }, + { + "epoch": 3.082614056720099, + "grad_norm": 0.6989586353302002, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2500 + }, + { + "epoch": 3.094944512946979, + "grad_norm": 0.7992092967033386, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 2510 + }, + { + "epoch": 3.1072749691738593, + "grad_norm": 0.698077917098999, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 2520 + }, + { + "epoch": 3.11960542540074, + "grad_norm": 0.5699033141136169, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2530 + }, + { + "epoch": 3.13193588162762, + "grad_norm": 0.6142355799674988, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 2540 + }, + { + "epoch": 3.144266337854501, + "grad_norm": 0.7089933753013611, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 2550 + }, + { + "epoch": 3.156596794081381, + "grad_norm": 1.0107015371322632, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 2560 + }, + { + "epoch": 3.1689272503082613, + "grad_norm": 0.568138837814331, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 2570 + }, + { + "epoch": 3.181257706535142, + "grad_norm": 0.9960416555404663, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 2580 + }, + { + "epoch": 3.193588162762022, + "grad_norm": 0.6277595162391663, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 2590 + }, + { + "epoch": 3.2059186189889024, + "grad_norm": 0.681083619594574, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 2600 + }, + { + "epoch": 3.218249075215783, + "grad_norm": 0.5816057324409485, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 2610 + }, + { + "epoch": 3.2305795314426633, + "grad_norm": 0.7197734117507935, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 2620 + }, + { + "epoch": 3.242909987669544, + "grad_norm": 0.6524068117141724, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 2630 + }, + { + "epoch": 3.255240443896424, + "grad_norm": 1.273668646812439, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 3.2675709001233044, + "grad_norm": 0.6950451731681824, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 2650 + }, + { + "epoch": 3.279901356350185, + "grad_norm": 0.8029071688652039, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 2660 + }, + { + "epoch": 3.2922318125770653, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 2670 + }, + { + "epoch": 3.304562268803946, + "grad_norm": 0.8342001438140869, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 2680 + }, + { + "epoch": 3.316892725030826, + "grad_norm": 0.5629868507385254, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 2690 + }, + { + "epoch": 3.3292231812577064, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 2700 + }, + { + "epoch": 3.341553637484587, + "grad_norm": 1.0271371603012085, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 2710 + }, + { + "epoch": 3.3538840937114673, + "grad_norm": 0.9608535170555115, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 2720 + }, + { + "epoch": 3.3662145499383476, + "grad_norm": 0.7796488404273987, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 2730 + }, + { + "epoch": 3.3785450061652282, + "grad_norm": 0.5666437149047852, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2740 + }, + { + "epoch": 3.3908754623921085, + "grad_norm": 0.5462956428527832, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 2750 + }, + { + "epoch": 3.4032059186189887, + "grad_norm": 1.289099097251892, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 2760 + }, + { + "epoch": 3.4155363748458694, + "grad_norm": 0.825566828250885, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 2770 + }, + { + "epoch": 3.4278668310727496, + "grad_norm": 0.8366670608520508, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 2780 + }, + { + "epoch": 3.4401972872996303, + "grad_norm": 1.0931549072265625, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 2790 + }, + { + "epoch": 3.4525277435265105, + "grad_norm": 0.9228858351707458, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 3.4648581997533907, + "grad_norm": 1.3182806968688965, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 2810 + }, + { + "epoch": 3.4771886559802714, + "grad_norm": 0.8366976380348206, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 2820 + }, + { + "epoch": 3.4895191122071516, + "grad_norm": 0.8067695498466492, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 2830 + }, + { + "epoch": 3.5018495684340323, + "grad_norm": 1.1163437366485596, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 2840 + }, + { + "epoch": 3.5141800246609125, + "grad_norm": 1.7196556329727173, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 2850 + }, + { + "epoch": 3.5265104808877927, + "grad_norm": 1.1267012357711792, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 2860 + }, + { + "epoch": 3.5388409371146734, + "grad_norm": 0.7220137119293213, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 2870 + }, + { + "epoch": 3.5511713933415536, + "grad_norm": 0.914114773273468, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2880 + }, + { + "epoch": 3.563501849568434, + "grad_norm": 0.6193503141403198, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 2890 + }, + { + "epoch": 3.5758323057953145, + "grad_norm": 0.6060135960578918, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 2900 + }, + { + "epoch": 3.5881627620221948, + "grad_norm": 1.0177327394485474, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 2910 + }, + { + "epoch": 3.600493218249075, + "grad_norm": 0.5994468331336975, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 2920 + }, + { + "epoch": 3.6128236744759556, + "grad_norm": 0.7450457215309143, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 2930 + }, + { + "epoch": 3.625154130702836, + "grad_norm": 0.5825870037078857, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 2940 + }, + { + "epoch": 3.6374845869297165, + "grad_norm": 0.6289743781089783, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 2950 + }, + { + "epoch": 3.6498150431565968, + "grad_norm": 0.7801929116249084, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 2960 + }, + { + "epoch": 3.6621454993834774, + "grad_norm": 1.1206634044647217, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 2970 + }, + { + "epoch": 3.6744759556103577, + "grad_norm": 0.6738817691802979, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2980 + }, + { + "epoch": 3.686806411837238, + "grad_norm": 1.1917344331741333, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 2990 + }, + { + "epoch": 3.6991368680641186, + "grad_norm": 1.3738657236099243, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 3000 + }, + { + "epoch": 3.711467324290999, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3010 + }, + { + "epoch": 3.723797780517879, + "grad_norm": 0.9030995965003967, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 3020 + }, + { + "epoch": 3.7361282367447597, + "grad_norm": 1.0203914642333984, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 3030 + }, + { + "epoch": 3.74845869297164, + "grad_norm": 0.648394763469696, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 3040 + }, + { + "epoch": 3.76078914919852, + "grad_norm": 0.6304570436477661, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 3050 + }, + { + "epoch": 3.773119605425401, + "grad_norm": 0.8286601901054382, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 3060 + }, + { + "epoch": 3.785450061652281, + "grad_norm": 0.906444251537323, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.7977805178791613, + "grad_norm": 1.4212149381637573, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 3080 + }, + { + "epoch": 3.810110974106042, + "grad_norm": 0.7574319839477539, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3090 + }, + { + "epoch": 3.822441430332922, + "grad_norm": 0.6534451246261597, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 3100 + }, + { + "epoch": 3.834771886559803, + "grad_norm": 0.7525447010993958, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 3110 + }, + { + "epoch": 3.847102342786683, + "grad_norm": 0.6513990759849548, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 3120 + }, + { + "epoch": 3.8594327990135637, + "grad_norm": 0.7782694697380066, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 3130 + }, + { + "epoch": 3.871763255240444, + "grad_norm": 0.7998530268669128, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3140 + }, + { + "epoch": 3.884093711467324, + "grad_norm": 0.8045353293418884, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 3150 + }, + { + "epoch": 3.896424167694205, + "grad_norm": 0.8242645263671875, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 3160 + }, + { + "epoch": 3.908754623921085, + "grad_norm": 0.8302360773086548, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3170 + }, + { + "epoch": 3.9210850801479653, + "grad_norm": 0.8653109073638916, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 3180 + }, + { + "epoch": 3.933415536374846, + "grad_norm": 0.6461338996887207, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 3190 + }, + { + "epoch": 3.945745992601726, + "grad_norm": 0.8267415165901184, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 3200 + }, + { + "epoch": 3.9580764488286064, + "grad_norm": 1.1963194608688354, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 3210 + }, + { + "epoch": 3.970406905055487, + "grad_norm": 0.7101966142654419, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3220 + }, + { + "epoch": 3.9827373612823673, + "grad_norm": 0.5931660532951355, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 3230 + }, + { + "epoch": 3.995067817509248, + "grad_norm": 0.7465988993644714, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 3240 + }, + { + "epoch": 4.0, + "eval_loss": 1.4066498279571533, + "eval_runtime": 95.7145, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.575, + "step": 3244 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6662399081775104e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-3244/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9d32ce67bf19f9ae0f1f748ab6885e8fd375ae59 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9786380ee461ab167b48ed45d7cab6fe7094739f47f6a289d7af4b8dbce5f32 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2418a195bae7a0e5c9e7414002a84e91e188bdf --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b540c0bafda6b1b36ff3606a04dbc7c435a3c90649787c75efe0834bc6bfa2 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d180450c89011f65f5e86d532745005d7b2a98c --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e8376d55b7ad0f26d2ad80687f603a0e7227268397353a249ca7fb285073fa +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..444d4820854e1010c6e443f3b82a9df3f3968356 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edf44369560a6c20c2f557bcf58956ff63bc4bddae5bb9da652c28530e4c47b5 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7815ab576e5e2478c6394f873e8ce71cdbdeb276 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/trainer_state.json @@ -0,0 +1,2908 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 4055, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + }, + { + "epoch": 3.008631319358816, + "grad_norm": 0.5562372803688049, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 2440 + }, + { + "epoch": 3.020961775585697, + "grad_norm": 1.133402705192566, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 2450 + }, + { + "epoch": 3.033292231812577, + "grad_norm": 0.6480470299720764, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 2460 + }, + { + "epoch": 3.0456226880394572, + "grad_norm": 0.8989138007164001, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 2470 + }, + { + "epoch": 3.057953144266338, + "grad_norm": 0.8257461786270142, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 3.070283600493218, + "grad_norm": 0.6813381910324097, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 2490 + }, + { + "epoch": 3.082614056720099, + "grad_norm": 0.6989586353302002, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2500 + }, + { + "epoch": 3.094944512946979, + "grad_norm": 0.7992092967033386, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 2510 + }, + { + "epoch": 3.1072749691738593, + "grad_norm": 0.698077917098999, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 2520 + }, + { + "epoch": 3.11960542540074, + "grad_norm": 0.5699033141136169, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2530 + }, + { + "epoch": 3.13193588162762, + "grad_norm": 0.6142355799674988, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 2540 + }, + { + "epoch": 3.144266337854501, + "grad_norm": 0.7089933753013611, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 2550 + }, + { + "epoch": 3.156596794081381, + "grad_norm": 1.0107015371322632, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 2560 + }, + { + "epoch": 3.1689272503082613, + "grad_norm": 0.568138837814331, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 2570 + }, + { + "epoch": 3.181257706535142, + "grad_norm": 0.9960416555404663, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 2580 + }, + { + "epoch": 3.193588162762022, + "grad_norm": 0.6277595162391663, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 2590 + }, + { + "epoch": 3.2059186189889024, + "grad_norm": 0.681083619594574, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 2600 + }, + { + "epoch": 3.218249075215783, + "grad_norm": 0.5816057324409485, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 2610 + }, + { + "epoch": 3.2305795314426633, + "grad_norm": 0.7197734117507935, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 2620 + }, + { + "epoch": 3.242909987669544, + "grad_norm": 0.6524068117141724, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 2630 + }, + { + "epoch": 3.255240443896424, + "grad_norm": 1.273668646812439, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 3.2675709001233044, + "grad_norm": 0.6950451731681824, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 2650 + }, + { + "epoch": 3.279901356350185, + "grad_norm": 0.8029071688652039, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 2660 + }, + { + "epoch": 3.2922318125770653, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 2670 + }, + { + "epoch": 3.304562268803946, + "grad_norm": 0.8342001438140869, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 2680 + }, + { + "epoch": 3.316892725030826, + "grad_norm": 0.5629868507385254, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 2690 + }, + { + "epoch": 3.3292231812577064, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 2700 + }, + { + "epoch": 3.341553637484587, + "grad_norm": 1.0271371603012085, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 2710 + }, + { + "epoch": 3.3538840937114673, + "grad_norm": 0.9608535170555115, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 2720 + }, + { + "epoch": 3.3662145499383476, + "grad_norm": 0.7796488404273987, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 2730 + }, + { + "epoch": 3.3785450061652282, + "grad_norm": 0.5666437149047852, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2740 + }, + { + "epoch": 3.3908754623921085, + "grad_norm": 0.5462956428527832, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 2750 + }, + { + "epoch": 3.4032059186189887, + "grad_norm": 1.289099097251892, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 2760 + }, + { + "epoch": 3.4155363748458694, + "grad_norm": 0.825566828250885, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 2770 + }, + { + "epoch": 3.4278668310727496, + "grad_norm": 0.8366670608520508, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 2780 + }, + { + "epoch": 3.4401972872996303, + "grad_norm": 1.0931549072265625, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 2790 + }, + { + "epoch": 3.4525277435265105, + "grad_norm": 0.9228858351707458, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 3.4648581997533907, + "grad_norm": 1.3182806968688965, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 2810 + }, + { + "epoch": 3.4771886559802714, + "grad_norm": 0.8366976380348206, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 2820 + }, + { + "epoch": 3.4895191122071516, + "grad_norm": 0.8067695498466492, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 2830 + }, + { + "epoch": 3.5018495684340323, + "grad_norm": 1.1163437366485596, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 2840 + }, + { + "epoch": 3.5141800246609125, + "grad_norm": 1.7196556329727173, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 2850 + }, + { + "epoch": 3.5265104808877927, + "grad_norm": 1.1267012357711792, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 2860 + }, + { + "epoch": 3.5388409371146734, + "grad_norm": 0.7220137119293213, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 2870 + }, + { + "epoch": 3.5511713933415536, + "grad_norm": 0.914114773273468, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2880 + }, + { + "epoch": 3.563501849568434, + "grad_norm": 0.6193503141403198, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 2890 + }, + { + "epoch": 3.5758323057953145, + "grad_norm": 0.6060135960578918, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 2900 + }, + { + "epoch": 3.5881627620221948, + "grad_norm": 1.0177327394485474, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 2910 + }, + { + "epoch": 3.600493218249075, + "grad_norm": 0.5994468331336975, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 2920 + }, + { + "epoch": 3.6128236744759556, + "grad_norm": 0.7450457215309143, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 2930 + }, + { + "epoch": 3.625154130702836, + "grad_norm": 0.5825870037078857, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 2940 + }, + { + "epoch": 3.6374845869297165, + "grad_norm": 0.6289743781089783, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 2950 + }, + { + "epoch": 3.6498150431565968, + "grad_norm": 0.7801929116249084, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 2960 + }, + { + "epoch": 3.6621454993834774, + "grad_norm": 1.1206634044647217, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 2970 + }, + { + "epoch": 3.6744759556103577, + "grad_norm": 0.6738817691802979, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2980 + }, + { + "epoch": 3.686806411837238, + "grad_norm": 1.1917344331741333, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 2990 + }, + { + "epoch": 3.6991368680641186, + "grad_norm": 1.3738657236099243, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 3000 + }, + { + "epoch": 3.711467324290999, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3010 + }, + { + "epoch": 3.723797780517879, + "grad_norm": 0.9030995965003967, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 3020 + }, + { + "epoch": 3.7361282367447597, + "grad_norm": 1.0203914642333984, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 3030 + }, + { + "epoch": 3.74845869297164, + "grad_norm": 0.648394763469696, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 3040 + }, + { + "epoch": 3.76078914919852, + "grad_norm": 0.6304570436477661, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 3050 + }, + { + "epoch": 3.773119605425401, + "grad_norm": 0.8286601901054382, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 3060 + }, + { + "epoch": 3.785450061652281, + "grad_norm": 0.906444251537323, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.7977805178791613, + "grad_norm": 1.4212149381637573, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 3080 + }, + { + "epoch": 3.810110974106042, + "grad_norm": 0.7574319839477539, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3090 + }, + { + "epoch": 3.822441430332922, + "grad_norm": 0.6534451246261597, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 3100 + }, + { + "epoch": 3.834771886559803, + "grad_norm": 0.7525447010993958, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 3110 + }, + { + "epoch": 3.847102342786683, + "grad_norm": 0.6513990759849548, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 3120 + }, + { + "epoch": 3.8594327990135637, + "grad_norm": 0.7782694697380066, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 3130 + }, + { + "epoch": 3.871763255240444, + "grad_norm": 0.7998530268669128, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3140 + }, + { + "epoch": 3.884093711467324, + "grad_norm": 0.8045353293418884, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 3150 + }, + { + "epoch": 3.896424167694205, + "grad_norm": 0.8242645263671875, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 3160 + }, + { + "epoch": 3.908754623921085, + "grad_norm": 0.8302360773086548, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3170 + }, + { + "epoch": 3.9210850801479653, + "grad_norm": 0.8653109073638916, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 3180 + }, + { + "epoch": 3.933415536374846, + "grad_norm": 0.6461338996887207, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 3190 + }, + { + "epoch": 3.945745992601726, + "grad_norm": 0.8267415165901184, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 3200 + }, + { + "epoch": 3.9580764488286064, + "grad_norm": 1.1963194608688354, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 3210 + }, + { + "epoch": 3.970406905055487, + "grad_norm": 0.7101966142654419, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3220 + }, + { + "epoch": 3.9827373612823673, + "grad_norm": 0.5931660532951355, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 3230 + }, + { + "epoch": 3.995067817509248, + "grad_norm": 0.7465988993644714, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 3240 + }, + { + "epoch": 4.0, + "eval_loss": 1.4066498279571533, + "eval_runtime": 95.7145, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.575, + "step": 3244 + }, + { + "epoch": 4.007398273736128, + "grad_norm": 0.9478800296783447, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 3250 + }, + { + "epoch": 4.019728729963009, + "grad_norm": 1.207059621810913, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 3260 + }, + { + "epoch": 4.032059186189889, + "grad_norm": 0.8984074592590332, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 3270 + }, + { + "epoch": 4.044389642416769, + "grad_norm": 0.8104140758514404, + "learning_rate": 0.0002, + "loss": 0.3798, + "step": 3280 + }, + { + "epoch": 4.05672009864365, + "grad_norm": 1.0875468254089355, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 3290 + }, + { + "epoch": 4.06905055487053, + "grad_norm": 0.8520309329032898, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 3300 + }, + { + "epoch": 4.0813810110974105, + "grad_norm": 1.076735496520996, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 3310 + }, + { + "epoch": 4.093711467324291, + "grad_norm": 0.7789369821548462, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 3320 + }, + { + "epoch": 4.106041923551172, + "grad_norm": 0.916862964630127, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3330 + }, + { + "epoch": 4.118372379778052, + "grad_norm": 1.1251654624938965, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3340 + }, + { + "epoch": 4.130702836004932, + "grad_norm": 0.9373420476913452, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 3350 + }, + { + "epoch": 4.143033292231813, + "grad_norm": 1.03253972530365, + "learning_rate": 0.0002, + "loss": 0.384, + "step": 3360 + }, + { + "epoch": 4.155363748458693, + "grad_norm": 0.947023332118988, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 3370 + }, + { + "epoch": 4.167694204685573, + "grad_norm": 0.8709157109260559, + "learning_rate": 0.0002, + "loss": 0.4018, + "step": 3380 + }, + { + "epoch": 4.180024660912454, + "grad_norm": 0.930983304977417, + "learning_rate": 0.0002, + "loss": 0.3754, + "step": 3390 + }, + { + "epoch": 4.192355117139334, + "grad_norm": 1.092809796333313, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 3400 + }, + { + "epoch": 4.2046855733662145, + "grad_norm": 0.8454303741455078, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 3410 + }, + { + "epoch": 4.217016029593095, + "grad_norm": 0.957210123538971, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3420 + }, + { + "epoch": 4.229346485819975, + "grad_norm": 0.854333758354187, + "learning_rate": 0.0002, + "loss": 0.3743, + "step": 3430 + }, + { + "epoch": 4.241676942046856, + "grad_norm": 1.0457639694213867, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 3440 + }, + { + "epoch": 4.254007398273736, + "grad_norm": 0.8972977995872498, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3450 + }, + { + "epoch": 4.266337854500616, + "grad_norm": 1.0438238382339478, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 3460 + }, + { + "epoch": 4.278668310727497, + "grad_norm": 0.7000405192375183, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 3470 + }, + { + "epoch": 4.290998766954377, + "grad_norm": 1.0451240539550781, + "learning_rate": 0.0002, + "loss": 0.3718, + "step": 3480 + }, + { + "epoch": 4.303329223181258, + "grad_norm": 1.3339767456054688, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 3490 + }, + { + "epoch": 4.315659679408138, + "grad_norm": 0.7503946423530579, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 3500 + }, + { + "epoch": 4.3279901356350186, + "grad_norm": 0.8443584442138672, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 3510 + }, + { + "epoch": 4.340320591861899, + "grad_norm": 1.1681201457977295, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 3520 + }, + { + "epoch": 4.352651048088779, + "grad_norm": 1.078883171081543, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 3530 + }, + { + "epoch": 4.36498150431566, + "grad_norm": 0.6894834041595459, + "learning_rate": 0.0002, + "loss": 0.4216, + "step": 3540 + }, + { + "epoch": 4.37731196054254, + "grad_norm": 0.7059480547904968, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 3550 + }, + { + "epoch": 4.38964241676942, + "grad_norm": 1.1807256937026978, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 3560 + }, + { + "epoch": 4.401972872996301, + "grad_norm": 0.8341359496116638, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 3570 + }, + { + "epoch": 4.4143033292231815, + "grad_norm": 1.0273033380508423, + "learning_rate": 0.0002, + "loss": 0.4123, + "step": 3580 + }, + { + "epoch": 4.426633785450061, + "grad_norm": 0.6916454434394836, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 3590 + }, + { + "epoch": 4.438964241676942, + "grad_norm": 0.8210113644599915, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 3600 + }, + { + "epoch": 4.451294697903823, + "grad_norm": 1.0309500694274902, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 3610 + }, + { + "epoch": 4.463625154130703, + "grad_norm": 0.8847399353981018, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3620 + }, + { + "epoch": 4.475955610357583, + "grad_norm": 1.668636679649353, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3630 + }, + { + "epoch": 4.488286066584464, + "grad_norm": 1.3087958097457886, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 3640 + }, + { + "epoch": 4.500616522811344, + "grad_norm": 0.837852418422699, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 3650 + }, + { + "epoch": 4.512946979038224, + "grad_norm": 9.7662353515625, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 3660 + }, + { + "epoch": 4.525277435265105, + "grad_norm": 1.125719428062439, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 3670 + }, + { + "epoch": 4.5376078914919855, + "grad_norm": 0.7755377292633057, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 3680 + }, + { + "epoch": 4.549938347718865, + "grad_norm": 0.7185089588165283, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 3690 + }, + { + "epoch": 4.562268803945746, + "grad_norm": 1.182063102722168, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 3700 + }, + { + "epoch": 4.574599260172627, + "grad_norm": 1.001197338104248, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3710 + }, + { + "epoch": 4.586929716399506, + "grad_norm": 0.9705429077148438, + "learning_rate": 0.0002, + "loss": 0.4493, + "step": 3720 + }, + { + "epoch": 4.599260172626387, + "grad_norm": 0.7136746048927307, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 3730 + }, + { + "epoch": 4.611590628853268, + "grad_norm": 1.0004864931106567, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 3740 + }, + { + "epoch": 4.623921085080148, + "grad_norm": 1.3193715810775757, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 3750 + }, + { + "epoch": 4.636251541307028, + "grad_norm": 0.6945042014122009, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3760 + }, + { + "epoch": 4.648581997533909, + "grad_norm": 0.8903936743736267, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 3770 + }, + { + "epoch": 4.660912453760789, + "grad_norm": 0.7960889339447021, + "learning_rate": 0.0002, + "loss": 0.3582, + "step": 3780 + }, + { + "epoch": 4.673242909987669, + "grad_norm": 1.0439172983169556, + "learning_rate": 0.0002, + "loss": 0.3864, + "step": 3790 + }, + { + "epoch": 4.68557336621455, + "grad_norm": 1.4546219110488892, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 3800 + }, + { + "epoch": 4.697903822441431, + "grad_norm": 0.8194343447685242, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 3810 + }, + { + "epoch": 4.7102342786683105, + "grad_norm": 1.0727602243423462, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 3820 + }, + { + "epoch": 4.722564734895191, + "grad_norm": 0.7785195708274841, + "learning_rate": 0.0002, + "loss": 0.4021, + "step": 3830 + }, + { + "epoch": 4.734895191122072, + "grad_norm": 0.846783459186554, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 3840 + }, + { + "epoch": 4.747225647348952, + "grad_norm": 1.0481648445129395, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3850 + }, + { + "epoch": 4.759556103575832, + "grad_norm": 0.7324008941650391, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3860 + }, + { + "epoch": 4.771886559802713, + "grad_norm": 1.06382417678833, + "learning_rate": 0.0002, + "loss": 0.3831, + "step": 3870 + }, + { + "epoch": 4.784217016029594, + "grad_norm": 0.9851241111755371, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3880 + }, + { + "epoch": 4.796547472256473, + "grad_norm": 0.8215277791023254, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 3890 + }, + { + "epoch": 4.808877928483354, + "grad_norm": 0.9901723861694336, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 3900 + }, + { + "epoch": 4.821208384710234, + "grad_norm": 0.9149112701416016, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3910 + }, + { + "epoch": 4.8335388409371145, + "grad_norm": 0.9772973656654358, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3920 + }, + { + "epoch": 4.845869297163995, + "grad_norm": 0.8889636397361755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 3930 + }, + { + "epoch": 4.858199753390876, + "grad_norm": 1.3032807111740112, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 3940 + }, + { + "epoch": 4.870530209617756, + "grad_norm": 0.8575899600982666, + "learning_rate": 0.0002, + "loss": 0.434, + "step": 3950 + }, + { + "epoch": 4.882860665844636, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3960 + }, + { + "epoch": 4.895191122071517, + "grad_norm": 1.041210651397705, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 3970 + }, + { + "epoch": 4.907521578298397, + "grad_norm": 0.9113056063652039, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 3980 + }, + { + "epoch": 4.919852034525277, + "grad_norm": 1.019347906112671, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 3990 + }, + { + "epoch": 4.932182490752158, + "grad_norm": 0.7709218859672546, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 4000 + }, + { + "epoch": 4.944512946979038, + "grad_norm": 0.8891775608062744, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 4010 + }, + { + "epoch": 4.9568434032059185, + "grad_norm": 1.0396920442581177, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 4020 + }, + { + "epoch": 4.969173859432799, + "grad_norm": 0.9239833354949951, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 4030 + }, + { + "epoch": 4.981504315659679, + "grad_norm": 1.801400065422058, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 4040 + }, + { + "epoch": 4.99383477188656, + "grad_norm": 0.6194164752960205, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 4050 + }, + { + "epoch": 5.0, + "eval_loss": 1.544758915901184, + "eval_runtime": 96.2573, + "eval_samples_per_second": 4.53, + "eval_steps_per_second": 0.571, + "step": 4055 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.082799885221888e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4055/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a0267c8163302f579a50e891f2ce9e79ff79446 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1abb743a3afc40061af249d32efbaa49e8d42853ef57a9ec9768d8a351ca7832 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e2a88aa78451f454727122fd93f979b8f1888f9 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:354d119b98533a9c8d5ab57f273c8ecc866ad44dea1afbd90e7f91cff4f44fac +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1618f54ce3d4debb08fc1e47dbb0d0dd3635c7a2 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c570874288ce5fe516c21989469aa2e53d25b4d879077e9e7762412bf3f1bd +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..365254da1f0cca2244b245ed9d7bf33826738212 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad6e8aba39276d3c67a0c338dee56450410d9e0eddf4f04cbe6bde7c669d697 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9f637cd8759c966b4773976802310e3e4aae1965 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/trainer_state.json @@ -0,0 +1,3483 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4866, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + }, + { + "epoch": 3.008631319358816, + "grad_norm": 0.5562372803688049, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 2440 + }, + { + "epoch": 3.020961775585697, + "grad_norm": 1.133402705192566, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 2450 + }, + { + "epoch": 3.033292231812577, + "grad_norm": 0.6480470299720764, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 2460 + }, + { + "epoch": 3.0456226880394572, + "grad_norm": 0.8989138007164001, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 2470 + }, + { + "epoch": 3.057953144266338, + "grad_norm": 0.8257461786270142, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 3.070283600493218, + "grad_norm": 0.6813381910324097, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 2490 + }, + { + "epoch": 3.082614056720099, + "grad_norm": 0.6989586353302002, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2500 + }, + { + "epoch": 3.094944512946979, + "grad_norm": 0.7992092967033386, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 2510 + }, + { + "epoch": 3.1072749691738593, + "grad_norm": 0.698077917098999, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 2520 + }, + { + "epoch": 3.11960542540074, + "grad_norm": 0.5699033141136169, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2530 + }, + { + "epoch": 3.13193588162762, + "grad_norm": 0.6142355799674988, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 2540 + }, + { + "epoch": 3.144266337854501, + "grad_norm": 0.7089933753013611, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 2550 + }, + { + "epoch": 3.156596794081381, + "grad_norm": 1.0107015371322632, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 2560 + }, + { + "epoch": 3.1689272503082613, + "grad_norm": 0.568138837814331, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 2570 + }, + { + "epoch": 3.181257706535142, + "grad_norm": 0.9960416555404663, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 2580 + }, + { + "epoch": 3.193588162762022, + "grad_norm": 0.6277595162391663, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 2590 + }, + { + "epoch": 3.2059186189889024, + "grad_norm": 0.681083619594574, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 2600 + }, + { + "epoch": 3.218249075215783, + "grad_norm": 0.5816057324409485, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 2610 + }, + { + "epoch": 3.2305795314426633, + "grad_norm": 0.7197734117507935, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 2620 + }, + { + "epoch": 3.242909987669544, + "grad_norm": 0.6524068117141724, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 2630 + }, + { + "epoch": 3.255240443896424, + "grad_norm": 1.273668646812439, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 3.2675709001233044, + "grad_norm": 0.6950451731681824, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 2650 + }, + { + "epoch": 3.279901356350185, + "grad_norm": 0.8029071688652039, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 2660 + }, + { + "epoch": 3.2922318125770653, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 2670 + }, + { + "epoch": 3.304562268803946, + "grad_norm": 0.8342001438140869, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 2680 + }, + { + "epoch": 3.316892725030826, + "grad_norm": 0.5629868507385254, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 2690 + }, + { + "epoch": 3.3292231812577064, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 2700 + }, + { + "epoch": 3.341553637484587, + "grad_norm": 1.0271371603012085, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 2710 + }, + { + "epoch": 3.3538840937114673, + "grad_norm": 0.9608535170555115, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 2720 + }, + { + "epoch": 3.3662145499383476, + "grad_norm": 0.7796488404273987, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 2730 + }, + { + "epoch": 3.3785450061652282, + "grad_norm": 0.5666437149047852, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2740 + }, + { + "epoch": 3.3908754623921085, + "grad_norm": 0.5462956428527832, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 2750 + }, + { + "epoch": 3.4032059186189887, + "grad_norm": 1.289099097251892, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 2760 + }, + { + "epoch": 3.4155363748458694, + "grad_norm": 0.825566828250885, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 2770 + }, + { + "epoch": 3.4278668310727496, + "grad_norm": 0.8366670608520508, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 2780 + }, + { + "epoch": 3.4401972872996303, + "grad_norm": 1.0931549072265625, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 2790 + }, + { + "epoch": 3.4525277435265105, + "grad_norm": 0.9228858351707458, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 3.4648581997533907, + "grad_norm": 1.3182806968688965, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 2810 + }, + { + "epoch": 3.4771886559802714, + "grad_norm": 0.8366976380348206, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 2820 + }, + { + "epoch": 3.4895191122071516, + "grad_norm": 0.8067695498466492, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 2830 + }, + { + "epoch": 3.5018495684340323, + "grad_norm": 1.1163437366485596, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 2840 + }, + { + "epoch": 3.5141800246609125, + "grad_norm": 1.7196556329727173, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 2850 + }, + { + "epoch": 3.5265104808877927, + "grad_norm": 1.1267012357711792, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 2860 + }, + { + "epoch": 3.5388409371146734, + "grad_norm": 0.7220137119293213, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 2870 + }, + { + "epoch": 3.5511713933415536, + "grad_norm": 0.914114773273468, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2880 + }, + { + "epoch": 3.563501849568434, + "grad_norm": 0.6193503141403198, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 2890 + }, + { + "epoch": 3.5758323057953145, + "grad_norm": 0.6060135960578918, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 2900 + }, + { + "epoch": 3.5881627620221948, + "grad_norm": 1.0177327394485474, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 2910 + }, + { + "epoch": 3.600493218249075, + "grad_norm": 0.5994468331336975, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 2920 + }, + { + "epoch": 3.6128236744759556, + "grad_norm": 0.7450457215309143, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 2930 + }, + { + "epoch": 3.625154130702836, + "grad_norm": 0.5825870037078857, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 2940 + }, + { + "epoch": 3.6374845869297165, + "grad_norm": 0.6289743781089783, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 2950 + }, + { + "epoch": 3.6498150431565968, + "grad_norm": 0.7801929116249084, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 2960 + }, + { + "epoch": 3.6621454993834774, + "grad_norm": 1.1206634044647217, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 2970 + }, + { + "epoch": 3.6744759556103577, + "grad_norm": 0.6738817691802979, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2980 + }, + { + "epoch": 3.686806411837238, + "grad_norm": 1.1917344331741333, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 2990 + }, + { + "epoch": 3.6991368680641186, + "grad_norm": 1.3738657236099243, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 3000 + }, + { + "epoch": 3.711467324290999, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3010 + }, + { + "epoch": 3.723797780517879, + "grad_norm": 0.9030995965003967, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 3020 + }, + { + "epoch": 3.7361282367447597, + "grad_norm": 1.0203914642333984, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 3030 + }, + { + "epoch": 3.74845869297164, + "grad_norm": 0.648394763469696, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 3040 + }, + { + "epoch": 3.76078914919852, + "grad_norm": 0.6304570436477661, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 3050 + }, + { + "epoch": 3.773119605425401, + "grad_norm": 0.8286601901054382, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 3060 + }, + { + "epoch": 3.785450061652281, + "grad_norm": 0.906444251537323, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.7977805178791613, + "grad_norm": 1.4212149381637573, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 3080 + }, + { + "epoch": 3.810110974106042, + "grad_norm": 0.7574319839477539, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3090 + }, + { + "epoch": 3.822441430332922, + "grad_norm": 0.6534451246261597, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 3100 + }, + { + "epoch": 3.834771886559803, + "grad_norm": 0.7525447010993958, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 3110 + }, + { + "epoch": 3.847102342786683, + "grad_norm": 0.6513990759849548, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 3120 + }, + { + "epoch": 3.8594327990135637, + "grad_norm": 0.7782694697380066, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 3130 + }, + { + "epoch": 3.871763255240444, + "grad_norm": 0.7998530268669128, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3140 + }, + { + "epoch": 3.884093711467324, + "grad_norm": 0.8045353293418884, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 3150 + }, + { + "epoch": 3.896424167694205, + "grad_norm": 0.8242645263671875, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 3160 + }, + { + "epoch": 3.908754623921085, + "grad_norm": 0.8302360773086548, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3170 + }, + { + "epoch": 3.9210850801479653, + "grad_norm": 0.8653109073638916, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 3180 + }, + { + "epoch": 3.933415536374846, + "grad_norm": 0.6461338996887207, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 3190 + }, + { + "epoch": 3.945745992601726, + "grad_norm": 0.8267415165901184, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 3200 + }, + { + "epoch": 3.9580764488286064, + "grad_norm": 1.1963194608688354, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 3210 + }, + { + "epoch": 3.970406905055487, + "grad_norm": 0.7101966142654419, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3220 + }, + { + "epoch": 3.9827373612823673, + "grad_norm": 0.5931660532951355, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 3230 + }, + { + "epoch": 3.995067817509248, + "grad_norm": 0.7465988993644714, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 3240 + }, + { + "epoch": 4.0, + "eval_loss": 1.4066498279571533, + "eval_runtime": 95.7145, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.575, + "step": 3244 + }, + { + "epoch": 4.007398273736128, + "grad_norm": 0.9478800296783447, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 3250 + }, + { + "epoch": 4.019728729963009, + "grad_norm": 1.207059621810913, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 3260 + }, + { + "epoch": 4.032059186189889, + "grad_norm": 0.8984074592590332, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 3270 + }, + { + "epoch": 4.044389642416769, + "grad_norm": 0.8104140758514404, + "learning_rate": 0.0002, + "loss": 0.3798, + "step": 3280 + }, + { + "epoch": 4.05672009864365, + "grad_norm": 1.0875468254089355, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 3290 + }, + { + "epoch": 4.06905055487053, + "grad_norm": 0.8520309329032898, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 3300 + }, + { + "epoch": 4.0813810110974105, + "grad_norm": 1.076735496520996, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 3310 + }, + { + "epoch": 4.093711467324291, + "grad_norm": 0.7789369821548462, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 3320 + }, + { + "epoch": 4.106041923551172, + "grad_norm": 0.916862964630127, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3330 + }, + { + "epoch": 4.118372379778052, + "grad_norm": 1.1251654624938965, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3340 + }, + { + "epoch": 4.130702836004932, + "grad_norm": 0.9373420476913452, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 3350 + }, + { + "epoch": 4.143033292231813, + "grad_norm": 1.03253972530365, + "learning_rate": 0.0002, + "loss": 0.384, + "step": 3360 + }, + { + "epoch": 4.155363748458693, + "grad_norm": 0.947023332118988, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 3370 + }, + { + "epoch": 4.167694204685573, + "grad_norm": 0.8709157109260559, + "learning_rate": 0.0002, + "loss": 0.4018, + "step": 3380 + }, + { + "epoch": 4.180024660912454, + "grad_norm": 0.930983304977417, + "learning_rate": 0.0002, + "loss": 0.3754, + "step": 3390 + }, + { + "epoch": 4.192355117139334, + "grad_norm": 1.092809796333313, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 3400 + }, + { + "epoch": 4.2046855733662145, + "grad_norm": 0.8454303741455078, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 3410 + }, + { + "epoch": 4.217016029593095, + "grad_norm": 0.957210123538971, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3420 + }, + { + "epoch": 4.229346485819975, + "grad_norm": 0.854333758354187, + "learning_rate": 0.0002, + "loss": 0.3743, + "step": 3430 + }, + { + "epoch": 4.241676942046856, + "grad_norm": 1.0457639694213867, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 3440 + }, + { + "epoch": 4.254007398273736, + "grad_norm": 0.8972977995872498, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3450 + }, + { + "epoch": 4.266337854500616, + "grad_norm": 1.0438238382339478, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 3460 + }, + { + "epoch": 4.278668310727497, + "grad_norm": 0.7000405192375183, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 3470 + }, + { + "epoch": 4.290998766954377, + "grad_norm": 1.0451240539550781, + "learning_rate": 0.0002, + "loss": 0.3718, + "step": 3480 + }, + { + "epoch": 4.303329223181258, + "grad_norm": 1.3339767456054688, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 3490 + }, + { + "epoch": 4.315659679408138, + "grad_norm": 0.7503946423530579, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 3500 + }, + { + "epoch": 4.3279901356350186, + "grad_norm": 0.8443584442138672, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 3510 + }, + { + "epoch": 4.340320591861899, + "grad_norm": 1.1681201457977295, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 3520 + }, + { + "epoch": 4.352651048088779, + "grad_norm": 1.078883171081543, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 3530 + }, + { + "epoch": 4.36498150431566, + "grad_norm": 0.6894834041595459, + "learning_rate": 0.0002, + "loss": 0.4216, + "step": 3540 + }, + { + "epoch": 4.37731196054254, + "grad_norm": 0.7059480547904968, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 3550 + }, + { + "epoch": 4.38964241676942, + "grad_norm": 1.1807256937026978, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 3560 + }, + { + "epoch": 4.401972872996301, + "grad_norm": 0.8341359496116638, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 3570 + }, + { + "epoch": 4.4143033292231815, + "grad_norm": 1.0273033380508423, + "learning_rate": 0.0002, + "loss": 0.4123, + "step": 3580 + }, + { + "epoch": 4.426633785450061, + "grad_norm": 0.6916454434394836, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 3590 + }, + { + "epoch": 4.438964241676942, + "grad_norm": 0.8210113644599915, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 3600 + }, + { + "epoch": 4.451294697903823, + "grad_norm": 1.0309500694274902, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 3610 + }, + { + "epoch": 4.463625154130703, + "grad_norm": 0.8847399353981018, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3620 + }, + { + "epoch": 4.475955610357583, + "grad_norm": 1.668636679649353, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3630 + }, + { + "epoch": 4.488286066584464, + "grad_norm": 1.3087958097457886, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 3640 + }, + { + "epoch": 4.500616522811344, + "grad_norm": 0.837852418422699, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 3650 + }, + { + "epoch": 4.512946979038224, + "grad_norm": 9.7662353515625, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 3660 + }, + { + "epoch": 4.525277435265105, + "grad_norm": 1.125719428062439, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 3670 + }, + { + "epoch": 4.5376078914919855, + "grad_norm": 0.7755377292633057, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 3680 + }, + { + "epoch": 4.549938347718865, + "grad_norm": 0.7185089588165283, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 3690 + }, + { + "epoch": 4.562268803945746, + "grad_norm": 1.182063102722168, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 3700 + }, + { + "epoch": 4.574599260172627, + "grad_norm": 1.001197338104248, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3710 + }, + { + "epoch": 4.586929716399506, + "grad_norm": 0.9705429077148438, + "learning_rate": 0.0002, + "loss": 0.4493, + "step": 3720 + }, + { + "epoch": 4.599260172626387, + "grad_norm": 0.7136746048927307, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 3730 + }, + { + "epoch": 4.611590628853268, + "grad_norm": 1.0004864931106567, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 3740 + }, + { + "epoch": 4.623921085080148, + "grad_norm": 1.3193715810775757, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 3750 + }, + { + "epoch": 4.636251541307028, + "grad_norm": 0.6945042014122009, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3760 + }, + { + "epoch": 4.648581997533909, + "grad_norm": 0.8903936743736267, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 3770 + }, + { + "epoch": 4.660912453760789, + "grad_norm": 0.7960889339447021, + "learning_rate": 0.0002, + "loss": 0.3582, + "step": 3780 + }, + { + "epoch": 4.673242909987669, + "grad_norm": 1.0439172983169556, + "learning_rate": 0.0002, + "loss": 0.3864, + "step": 3790 + }, + { + "epoch": 4.68557336621455, + "grad_norm": 1.4546219110488892, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 3800 + }, + { + "epoch": 4.697903822441431, + "grad_norm": 0.8194343447685242, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 3810 + }, + { + "epoch": 4.7102342786683105, + "grad_norm": 1.0727602243423462, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 3820 + }, + { + "epoch": 4.722564734895191, + "grad_norm": 0.7785195708274841, + "learning_rate": 0.0002, + "loss": 0.4021, + "step": 3830 + }, + { + "epoch": 4.734895191122072, + "grad_norm": 0.846783459186554, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 3840 + }, + { + "epoch": 4.747225647348952, + "grad_norm": 1.0481648445129395, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3850 + }, + { + "epoch": 4.759556103575832, + "grad_norm": 0.7324008941650391, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3860 + }, + { + "epoch": 4.771886559802713, + "grad_norm": 1.06382417678833, + "learning_rate": 0.0002, + "loss": 0.3831, + "step": 3870 + }, + { + "epoch": 4.784217016029594, + "grad_norm": 0.9851241111755371, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3880 + }, + { + "epoch": 4.796547472256473, + "grad_norm": 0.8215277791023254, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 3890 + }, + { + "epoch": 4.808877928483354, + "grad_norm": 0.9901723861694336, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 3900 + }, + { + "epoch": 4.821208384710234, + "grad_norm": 0.9149112701416016, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3910 + }, + { + "epoch": 4.8335388409371145, + "grad_norm": 0.9772973656654358, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3920 + }, + { + "epoch": 4.845869297163995, + "grad_norm": 0.8889636397361755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 3930 + }, + { + "epoch": 4.858199753390876, + "grad_norm": 1.3032807111740112, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 3940 + }, + { + "epoch": 4.870530209617756, + "grad_norm": 0.8575899600982666, + "learning_rate": 0.0002, + "loss": 0.434, + "step": 3950 + }, + { + "epoch": 4.882860665844636, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3960 + }, + { + "epoch": 4.895191122071517, + "grad_norm": 1.041210651397705, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 3970 + }, + { + "epoch": 4.907521578298397, + "grad_norm": 0.9113056063652039, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 3980 + }, + { + "epoch": 4.919852034525277, + "grad_norm": 1.019347906112671, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 3990 + }, + { + "epoch": 4.932182490752158, + "grad_norm": 0.7709218859672546, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 4000 + }, + { + "epoch": 4.944512946979038, + "grad_norm": 0.8891775608062744, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 4010 + }, + { + "epoch": 4.9568434032059185, + "grad_norm": 1.0396920442581177, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 4020 + }, + { + "epoch": 4.969173859432799, + "grad_norm": 0.9239833354949951, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 4030 + }, + { + "epoch": 4.981504315659679, + "grad_norm": 1.801400065422058, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 4040 + }, + { + "epoch": 4.99383477188656, + "grad_norm": 0.6194164752960205, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 4050 + }, + { + "epoch": 5.0, + "eval_loss": 1.544758915901184, + "eval_runtime": 96.2573, + "eval_samples_per_second": 4.53, + "eval_steps_per_second": 0.571, + "step": 4055 + }, + { + "epoch": 5.00616522811344, + "grad_norm": 0.9918256998062134, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 4060 + }, + { + "epoch": 5.018495684340321, + "grad_norm": 1.4851351976394653, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 4070 + }, + { + "epoch": 5.030826140567201, + "grad_norm": 0.9237686395645142, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4080 + }, + { + "epoch": 5.0431565967940815, + "grad_norm": 1.2180852890014648, + "learning_rate": 0.0002, + "loss": 0.3072, + "step": 4090 + }, + { + "epoch": 5.055487053020962, + "grad_norm": 1.1247979402542114, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 4100 + }, + { + "epoch": 5.067817509247842, + "grad_norm": 1.2969884872436523, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4110 + }, + { + "epoch": 5.080147965474723, + "grad_norm": 1.0183063745498657, + "learning_rate": 0.0002, + "loss": 0.2858, + "step": 4120 + }, + { + "epoch": 5.092478421701603, + "grad_norm": 1.121330738067627, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 4130 + }, + { + "epoch": 5.104808877928483, + "grad_norm": 1.0748186111450195, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 4140 + }, + { + "epoch": 5.117139334155364, + "grad_norm": 1.103474736213684, + "learning_rate": 0.0002, + "loss": 0.3414, + "step": 4150 + }, + { + "epoch": 5.129469790382244, + "grad_norm": 1.2251166105270386, + "learning_rate": 0.0002, + "loss": 0.305, + "step": 4160 + }, + { + "epoch": 5.141800246609124, + "grad_norm": 0.920898973941803, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 4170 + }, + { + "epoch": 5.154130702836005, + "grad_norm": 1.327542781829834, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 4180 + }, + { + "epoch": 5.1664611590628855, + "grad_norm": 1.0677192211151123, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 4190 + }, + { + "epoch": 5.178791615289766, + "grad_norm": 0.897241473197937, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 4200 + }, + { + "epoch": 5.191122071516646, + "grad_norm": 0.977457582950592, + "learning_rate": 0.0002, + "loss": 0.2967, + "step": 4210 + }, + { + "epoch": 5.203452527743527, + "grad_norm": 1.4115267992019653, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 4220 + }, + { + "epoch": 5.215782983970407, + "grad_norm": 1.097743034362793, + "learning_rate": 0.0002, + "loss": 0.3279, + "step": 4230 + }, + { + "epoch": 5.228113440197287, + "grad_norm": 1.1095269918441772, + "learning_rate": 0.0002, + "loss": 0.293, + "step": 4240 + }, + { + "epoch": 5.240443896424168, + "grad_norm": 1.3785479068756104, + "learning_rate": 0.0002, + "loss": 0.3544, + "step": 4250 + }, + { + "epoch": 5.252774352651048, + "grad_norm": 1.0298776626586914, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4260 + }, + { + "epoch": 5.265104808877928, + "grad_norm": 1.1592111587524414, + "learning_rate": 0.0002, + "loss": 0.296, + "step": 4270 + }, + { + "epoch": 5.277435265104809, + "grad_norm": 1.2355743646621704, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 4280 + }, + { + "epoch": 5.2897657213316895, + "grad_norm": 0.8543112874031067, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 4290 + }, + { + "epoch": 5.302096177558569, + "grad_norm": 1.2953215837478638, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4300 + }, + { + "epoch": 5.31442663378545, + "grad_norm": 1.1001787185668945, + "learning_rate": 0.0002, + "loss": 0.2912, + "step": 4310 + }, + { + "epoch": 5.326757090012331, + "grad_norm": 0.7476816773414612, + "learning_rate": 0.0002, + "loss": 0.3003, + "step": 4320 + }, + { + "epoch": 5.3390875462392104, + "grad_norm": 0.8195574283599854, + "learning_rate": 0.0002, + "loss": 0.3247, + "step": 4330 + }, + { + "epoch": 5.351418002466091, + "grad_norm": 0.9490262866020203, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 4340 + }, + { + "epoch": 5.363748458692972, + "grad_norm": 1.2201412916183472, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 4350 + }, + { + "epoch": 5.376078914919852, + "grad_norm": 1.0311479568481445, + "learning_rate": 0.0002, + "loss": 0.2644, + "step": 4360 + }, + { + "epoch": 5.388409371146732, + "grad_norm": 1.2097488641738892, + "learning_rate": 0.0002, + "loss": 0.3104, + "step": 4370 + }, + { + "epoch": 5.400739827373613, + "grad_norm": 1.140942096710205, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 4380 + }, + { + "epoch": 5.413070283600494, + "grad_norm": 0.8091890811920166, + "learning_rate": 0.0002, + "loss": 0.2975, + "step": 4390 + }, + { + "epoch": 5.425400739827373, + "grad_norm": 1.4467964172363281, + "learning_rate": 0.0002, + "loss": 0.3727, + "step": 4400 + }, + { + "epoch": 5.437731196054254, + "grad_norm": 1.0836058855056763, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 4410 + }, + { + "epoch": 5.450061652281135, + "grad_norm": 1.0515433549880981, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 4420 + }, + { + "epoch": 5.4623921085080145, + "grad_norm": 0.9603073000907898, + "learning_rate": 0.0002, + "loss": 0.315, + "step": 4430 + }, + { + "epoch": 5.474722564734895, + "grad_norm": 1.234609842300415, + "learning_rate": 0.0002, + "loss": 0.3166, + "step": 4440 + }, + { + "epoch": 5.487053020961776, + "grad_norm": 0.8881428837776184, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 4450 + }, + { + "epoch": 5.499383477188656, + "grad_norm": 1.1817275285720825, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 4460 + }, + { + "epoch": 5.511713933415536, + "grad_norm": 1.213993787765503, + "learning_rate": 0.0002, + "loss": 0.2944, + "step": 4470 + }, + { + "epoch": 5.524044389642417, + "grad_norm": 1.0501725673675537, + "learning_rate": 0.0002, + "loss": 0.3136, + "step": 4480 + }, + { + "epoch": 5.536374845869297, + "grad_norm": 1.5061579942703247, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 4490 + }, + { + "epoch": 5.548705302096177, + "grad_norm": 1.1171475648880005, + "learning_rate": 0.0002, + "loss": 0.3226, + "step": 4500 + }, + { + "epoch": 5.561035758323058, + "grad_norm": 1.1147594451904297, + "learning_rate": 0.0002, + "loss": 0.3624, + "step": 4510 + }, + { + "epoch": 5.573366214549939, + "grad_norm": 1.0600544214248657, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4520 + }, + { + "epoch": 5.5856966707768185, + "grad_norm": 1.247870922088623, + "learning_rate": 0.0002, + "loss": 0.3268, + "step": 4530 + }, + { + "epoch": 5.598027127003699, + "grad_norm": 0.9425561428070068, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 4540 + }, + { + "epoch": 5.61035758323058, + "grad_norm": 1.1111550331115723, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 4550 + }, + { + "epoch": 5.62268803945746, + "grad_norm": 1.743268609046936, + "learning_rate": 0.0002, + "loss": 0.3389, + "step": 4560 + }, + { + "epoch": 5.63501849568434, + "grad_norm": 1.3522645235061646, + "learning_rate": 0.0002, + "loss": 0.31, + "step": 4570 + }, + { + "epoch": 5.647348951911221, + "grad_norm": 0.7354221343994141, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 4580 + }, + { + "epoch": 5.659679408138101, + "grad_norm": 1.050743818283081, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4590 + }, + { + "epoch": 5.6720098643649814, + "grad_norm": 1.1302396059036255, + "learning_rate": 0.0002, + "loss": 0.3449, + "step": 4600 + }, + { + "epoch": 5.684340320591862, + "grad_norm": 0.8774183392524719, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4610 + }, + { + "epoch": 5.696670776818742, + "grad_norm": 1.090781569480896, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4620 + }, + { + "epoch": 5.709001233045623, + "grad_norm": 0.9177733063697815, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4630 + }, + { + "epoch": 5.721331689272503, + "grad_norm": 0.9985341429710388, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4640 + }, + { + "epoch": 5.733662145499384, + "grad_norm": 1.0230613946914673, + "learning_rate": 0.0002, + "loss": 0.3323, + "step": 4650 + }, + { + "epoch": 5.745992601726264, + "grad_norm": 0.944656252861023, + "learning_rate": 0.0002, + "loss": 0.3525, + "step": 4660 + }, + { + "epoch": 5.758323057953144, + "grad_norm": 0.8162471652030945, + "learning_rate": 0.0002, + "loss": 0.3191, + "step": 4670 + }, + { + "epoch": 5.770653514180025, + "grad_norm": 1.0500398874282837, + "learning_rate": 0.0002, + "loss": 0.4011, + "step": 4680 + }, + { + "epoch": 5.782983970406905, + "grad_norm": 0.9487981796264648, + "learning_rate": 0.0002, + "loss": 0.3452, + "step": 4690 + }, + { + "epoch": 5.7953144266337855, + "grad_norm": 1.1856540441513062, + "learning_rate": 0.0002, + "loss": 0.2942, + "step": 4700 + }, + { + "epoch": 5.807644882860666, + "grad_norm": 1.2583396434783936, + "learning_rate": 0.0002, + "loss": 0.3107, + "step": 4710 + }, + { + "epoch": 5.819975339087546, + "grad_norm": 1.2532602548599243, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 4720 + }, + { + "epoch": 5.832305795314427, + "grad_norm": 1.115236520767212, + "learning_rate": 0.0002, + "loss": 0.3253, + "step": 4730 + }, + { + "epoch": 5.844636251541307, + "grad_norm": 1.2245537042617798, + "learning_rate": 0.0002, + "loss": 0.3539, + "step": 4740 + }, + { + "epoch": 5.856966707768187, + "grad_norm": 1.1964094638824463, + "learning_rate": 0.0002, + "loss": 0.3171, + "step": 4750 + }, + { + "epoch": 5.869297163995068, + "grad_norm": 1.0833805799484253, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4760 + }, + { + "epoch": 5.881627620221948, + "grad_norm": 1.0694046020507812, + "learning_rate": 0.0002, + "loss": 0.3511, + "step": 4770 + }, + { + "epoch": 5.893958076448829, + "grad_norm": 0.9947936534881592, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4780 + }, + { + "epoch": 5.906288532675709, + "grad_norm": 1.175716519355774, + "learning_rate": 0.0002, + "loss": 0.316, + "step": 4790 + }, + { + "epoch": 5.9186189889025895, + "grad_norm": 0.7717352509498596, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 4800 + }, + { + "epoch": 5.930949445129469, + "grad_norm": 1.2906442880630493, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 4810 + }, + { + "epoch": 5.94327990135635, + "grad_norm": 1.2416284084320068, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 4820 + }, + { + "epoch": 5.955610357583231, + "grad_norm": 1.3066956996917725, + "learning_rate": 0.0002, + "loss": 0.337, + "step": 4830 + }, + { + "epoch": 5.967940813810111, + "grad_norm": 1.0872026681900024, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 4840 + }, + { + "epoch": 5.980271270036991, + "grad_norm": 1.1941101551055908, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 4850 + }, + { + "epoch": 5.992601726263872, + "grad_norm": 1.1126095056533813, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 4860 + }, + { + "epoch": 6.0, + "eval_loss": 1.748323917388916, + "eval_runtime": 97.7488, + "eval_samples_per_second": 4.46, + "eval_steps_per_second": 0.563, + "step": 4866 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4993598622662656e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-4866/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..032cd672ebb1fed63311496c2060def719e1aa2f --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1cef35ec572a9817b838d442ed757139ea6c6872cb537a7c44ccd3fd1115d6 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..264f0fbd5eb0aa48762c27d57279143860172c86 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ee121268f7b4ba2af88c2f87f9b558e7bd230c7839d55dd4e1ca774a4bbb9a +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..36bec44e886dca7704f7ac183f987f04b9fe549a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b665d283293cba13847b5d8e25bd9ac4b7f3658cd54aec8d20689ec4e87cf99b +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92c35a87a729e89be8430721defd697b07ab7118 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08c63c3119d1e66c8e4572d55ec3986f2909422152daac522b536300baca280f +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4f9fdafa4f1fa0a6fe7c7a7d08374485a95e1c34 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/trainer_state.json @@ -0,0 +1,4058 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 5677, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + }, + { + "epoch": 3.008631319358816, + "grad_norm": 0.5562372803688049, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 2440 + }, + { + "epoch": 3.020961775585697, + "grad_norm": 1.133402705192566, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 2450 + }, + { + "epoch": 3.033292231812577, + "grad_norm": 0.6480470299720764, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 2460 + }, + { + "epoch": 3.0456226880394572, + "grad_norm": 0.8989138007164001, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 2470 + }, + { + "epoch": 3.057953144266338, + "grad_norm": 0.8257461786270142, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 3.070283600493218, + "grad_norm": 0.6813381910324097, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 2490 + }, + { + "epoch": 3.082614056720099, + "grad_norm": 0.6989586353302002, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2500 + }, + { + "epoch": 3.094944512946979, + "grad_norm": 0.7992092967033386, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 2510 + }, + { + "epoch": 3.1072749691738593, + "grad_norm": 0.698077917098999, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 2520 + }, + { + "epoch": 3.11960542540074, + "grad_norm": 0.5699033141136169, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2530 + }, + { + "epoch": 3.13193588162762, + "grad_norm": 0.6142355799674988, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 2540 + }, + { + "epoch": 3.144266337854501, + "grad_norm": 0.7089933753013611, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 2550 + }, + { + "epoch": 3.156596794081381, + "grad_norm": 1.0107015371322632, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 2560 + }, + { + "epoch": 3.1689272503082613, + "grad_norm": 0.568138837814331, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 2570 + }, + { + "epoch": 3.181257706535142, + "grad_norm": 0.9960416555404663, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 2580 + }, + { + "epoch": 3.193588162762022, + "grad_norm": 0.6277595162391663, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 2590 + }, + { + "epoch": 3.2059186189889024, + "grad_norm": 0.681083619594574, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 2600 + }, + { + "epoch": 3.218249075215783, + "grad_norm": 0.5816057324409485, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 2610 + }, + { + "epoch": 3.2305795314426633, + "grad_norm": 0.7197734117507935, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 2620 + }, + { + "epoch": 3.242909987669544, + "grad_norm": 0.6524068117141724, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 2630 + }, + { + "epoch": 3.255240443896424, + "grad_norm": 1.273668646812439, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 3.2675709001233044, + "grad_norm": 0.6950451731681824, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 2650 + }, + { + "epoch": 3.279901356350185, + "grad_norm": 0.8029071688652039, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 2660 + }, + { + "epoch": 3.2922318125770653, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 2670 + }, + { + "epoch": 3.304562268803946, + "grad_norm": 0.8342001438140869, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 2680 + }, + { + "epoch": 3.316892725030826, + "grad_norm": 0.5629868507385254, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 2690 + }, + { + "epoch": 3.3292231812577064, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 2700 + }, + { + "epoch": 3.341553637484587, + "grad_norm": 1.0271371603012085, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 2710 + }, + { + "epoch": 3.3538840937114673, + "grad_norm": 0.9608535170555115, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 2720 + }, + { + "epoch": 3.3662145499383476, + "grad_norm": 0.7796488404273987, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 2730 + }, + { + "epoch": 3.3785450061652282, + "grad_norm": 0.5666437149047852, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2740 + }, + { + "epoch": 3.3908754623921085, + "grad_norm": 0.5462956428527832, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 2750 + }, + { + "epoch": 3.4032059186189887, + "grad_norm": 1.289099097251892, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 2760 + }, + { + "epoch": 3.4155363748458694, + "grad_norm": 0.825566828250885, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 2770 + }, + { + "epoch": 3.4278668310727496, + "grad_norm": 0.8366670608520508, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 2780 + }, + { + "epoch": 3.4401972872996303, + "grad_norm": 1.0931549072265625, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 2790 + }, + { + "epoch": 3.4525277435265105, + "grad_norm": 0.9228858351707458, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 3.4648581997533907, + "grad_norm": 1.3182806968688965, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 2810 + }, + { + "epoch": 3.4771886559802714, + "grad_norm": 0.8366976380348206, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 2820 + }, + { + "epoch": 3.4895191122071516, + "grad_norm": 0.8067695498466492, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 2830 + }, + { + "epoch": 3.5018495684340323, + "grad_norm": 1.1163437366485596, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 2840 + }, + { + "epoch": 3.5141800246609125, + "grad_norm": 1.7196556329727173, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 2850 + }, + { + "epoch": 3.5265104808877927, + "grad_norm": 1.1267012357711792, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 2860 + }, + { + "epoch": 3.5388409371146734, + "grad_norm": 0.7220137119293213, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 2870 + }, + { + "epoch": 3.5511713933415536, + "grad_norm": 0.914114773273468, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2880 + }, + { + "epoch": 3.563501849568434, + "grad_norm": 0.6193503141403198, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 2890 + }, + { + "epoch": 3.5758323057953145, + "grad_norm": 0.6060135960578918, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 2900 + }, + { + "epoch": 3.5881627620221948, + "grad_norm": 1.0177327394485474, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 2910 + }, + { + "epoch": 3.600493218249075, + "grad_norm": 0.5994468331336975, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 2920 + }, + { + "epoch": 3.6128236744759556, + "grad_norm": 0.7450457215309143, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 2930 + }, + { + "epoch": 3.625154130702836, + "grad_norm": 0.5825870037078857, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 2940 + }, + { + "epoch": 3.6374845869297165, + "grad_norm": 0.6289743781089783, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 2950 + }, + { + "epoch": 3.6498150431565968, + "grad_norm": 0.7801929116249084, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 2960 + }, + { + "epoch": 3.6621454993834774, + "grad_norm": 1.1206634044647217, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 2970 + }, + { + "epoch": 3.6744759556103577, + "grad_norm": 0.6738817691802979, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2980 + }, + { + "epoch": 3.686806411837238, + "grad_norm": 1.1917344331741333, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 2990 + }, + { + "epoch": 3.6991368680641186, + "grad_norm": 1.3738657236099243, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 3000 + }, + { + "epoch": 3.711467324290999, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3010 + }, + { + "epoch": 3.723797780517879, + "grad_norm": 0.9030995965003967, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 3020 + }, + { + "epoch": 3.7361282367447597, + "grad_norm": 1.0203914642333984, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 3030 + }, + { + "epoch": 3.74845869297164, + "grad_norm": 0.648394763469696, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 3040 + }, + { + "epoch": 3.76078914919852, + "grad_norm": 0.6304570436477661, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 3050 + }, + { + "epoch": 3.773119605425401, + "grad_norm": 0.8286601901054382, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 3060 + }, + { + "epoch": 3.785450061652281, + "grad_norm": 0.906444251537323, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.7977805178791613, + "grad_norm": 1.4212149381637573, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 3080 + }, + { + "epoch": 3.810110974106042, + "grad_norm": 0.7574319839477539, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3090 + }, + { + "epoch": 3.822441430332922, + "grad_norm": 0.6534451246261597, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 3100 + }, + { + "epoch": 3.834771886559803, + "grad_norm": 0.7525447010993958, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 3110 + }, + { + "epoch": 3.847102342786683, + "grad_norm": 0.6513990759849548, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 3120 + }, + { + "epoch": 3.8594327990135637, + "grad_norm": 0.7782694697380066, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 3130 + }, + { + "epoch": 3.871763255240444, + "grad_norm": 0.7998530268669128, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3140 + }, + { + "epoch": 3.884093711467324, + "grad_norm": 0.8045353293418884, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 3150 + }, + { + "epoch": 3.896424167694205, + "grad_norm": 0.8242645263671875, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 3160 + }, + { + "epoch": 3.908754623921085, + "grad_norm": 0.8302360773086548, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3170 + }, + { + "epoch": 3.9210850801479653, + "grad_norm": 0.8653109073638916, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 3180 + }, + { + "epoch": 3.933415536374846, + "grad_norm": 0.6461338996887207, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 3190 + }, + { + "epoch": 3.945745992601726, + "grad_norm": 0.8267415165901184, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 3200 + }, + { + "epoch": 3.9580764488286064, + "grad_norm": 1.1963194608688354, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 3210 + }, + { + "epoch": 3.970406905055487, + "grad_norm": 0.7101966142654419, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3220 + }, + { + "epoch": 3.9827373612823673, + "grad_norm": 0.5931660532951355, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 3230 + }, + { + "epoch": 3.995067817509248, + "grad_norm": 0.7465988993644714, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 3240 + }, + { + "epoch": 4.0, + "eval_loss": 1.4066498279571533, + "eval_runtime": 95.7145, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.575, + "step": 3244 + }, + { + "epoch": 4.007398273736128, + "grad_norm": 0.9478800296783447, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 3250 + }, + { + "epoch": 4.019728729963009, + "grad_norm": 1.207059621810913, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 3260 + }, + { + "epoch": 4.032059186189889, + "grad_norm": 0.8984074592590332, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 3270 + }, + { + "epoch": 4.044389642416769, + "grad_norm": 0.8104140758514404, + "learning_rate": 0.0002, + "loss": 0.3798, + "step": 3280 + }, + { + "epoch": 4.05672009864365, + "grad_norm": 1.0875468254089355, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 3290 + }, + { + "epoch": 4.06905055487053, + "grad_norm": 0.8520309329032898, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 3300 + }, + { + "epoch": 4.0813810110974105, + "grad_norm": 1.076735496520996, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 3310 + }, + { + "epoch": 4.093711467324291, + "grad_norm": 0.7789369821548462, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 3320 + }, + { + "epoch": 4.106041923551172, + "grad_norm": 0.916862964630127, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3330 + }, + { + "epoch": 4.118372379778052, + "grad_norm": 1.1251654624938965, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3340 + }, + { + "epoch": 4.130702836004932, + "grad_norm": 0.9373420476913452, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 3350 + }, + { + "epoch": 4.143033292231813, + "grad_norm": 1.03253972530365, + "learning_rate": 0.0002, + "loss": 0.384, + "step": 3360 + }, + { + "epoch": 4.155363748458693, + "grad_norm": 0.947023332118988, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 3370 + }, + { + "epoch": 4.167694204685573, + "grad_norm": 0.8709157109260559, + "learning_rate": 0.0002, + "loss": 0.4018, + "step": 3380 + }, + { + "epoch": 4.180024660912454, + "grad_norm": 0.930983304977417, + "learning_rate": 0.0002, + "loss": 0.3754, + "step": 3390 + }, + { + "epoch": 4.192355117139334, + "grad_norm": 1.092809796333313, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 3400 + }, + { + "epoch": 4.2046855733662145, + "grad_norm": 0.8454303741455078, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 3410 + }, + { + "epoch": 4.217016029593095, + "grad_norm": 0.957210123538971, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3420 + }, + { + "epoch": 4.229346485819975, + "grad_norm": 0.854333758354187, + "learning_rate": 0.0002, + "loss": 0.3743, + "step": 3430 + }, + { + "epoch": 4.241676942046856, + "grad_norm": 1.0457639694213867, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 3440 + }, + { + "epoch": 4.254007398273736, + "grad_norm": 0.8972977995872498, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3450 + }, + { + "epoch": 4.266337854500616, + "grad_norm": 1.0438238382339478, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 3460 + }, + { + "epoch": 4.278668310727497, + "grad_norm": 0.7000405192375183, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 3470 + }, + { + "epoch": 4.290998766954377, + "grad_norm": 1.0451240539550781, + "learning_rate": 0.0002, + "loss": 0.3718, + "step": 3480 + }, + { + "epoch": 4.303329223181258, + "grad_norm": 1.3339767456054688, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 3490 + }, + { + "epoch": 4.315659679408138, + "grad_norm": 0.7503946423530579, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 3500 + }, + { + "epoch": 4.3279901356350186, + "grad_norm": 0.8443584442138672, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 3510 + }, + { + "epoch": 4.340320591861899, + "grad_norm": 1.1681201457977295, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 3520 + }, + { + "epoch": 4.352651048088779, + "grad_norm": 1.078883171081543, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 3530 + }, + { + "epoch": 4.36498150431566, + "grad_norm": 0.6894834041595459, + "learning_rate": 0.0002, + "loss": 0.4216, + "step": 3540 + }, + { + "epoch": 4.37731196054254, + "grad_norm": 0.7059480547904968, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 3550 + }, + { + "epoch": 4.38964241676942, + "grad_norm": 1.1807256937026978, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 3560 + }, + { + "epoch": 4.401972872996301, + "grad_norm": 0.8341359496116638, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 3570 + }, + { + "epoch": 4.4143033292231815, + "grad_norm": 1.0273033380508423, + "learning_rate": 0.0002, + "loss": 0.4123, + "step": 3580 + }, + { + "epoch": 4.426633785450061, + "grad_norm": 0.6916454434394836, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 3590 + }, + { + "epoch": 4.438964241676942, + "grad_norm": 0.8210113644599915, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 3600 + }, + { + "epoch": 4.451294697903823, + "grad_norm": 1.0309500694274902, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 3610 + }, + { + "epoch": 4.463625154130703, + "grad_norm": 0.8847399353981018, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3620 + }, + { + "epoch": 4.475955610357583, + "grad_norm": 1.668636679649353, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3630 + }, + { + "epoch": 4.488286066584464, + "grad_norm": 1.3087958097457886, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 3640 + }, + { + "epoch": 4.500616522811344, + "grad_norm": 0.837852418422699, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 3650 + }, + { + "epoch": 4.512946979038224, + "grad_norm": 9.7662353515625, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 3660 + }, + { + "epoch": 4.525277435265105, + "grad_norm": 1.125719428062439, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 3670 + }, + { + "epoch": 4.5376078914919855, + "grad_norm": 0.7755377292633057, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 3680 + }, + { + "epoch": 4.549938347718865, + "grad_norm": 0.7185089588165283, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 3690 + }, + { + "epoch": 4.562268803945746, + "grad_norm": 1.182063102722168, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 3700 + }, + { + "epoch": 4.574599260172627, + "grad_norm": 1.001197338104248, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3710 + }, + { + "epoch": 4.586929716399506, + "grad_norm": 0.9705429077148438, + "learning_rate": 0.0002, + "loss": 0.4493, + "step": 3720 + }, + { + "epoch": 4.599260172626387, + "grad_norm": 0.7136746048927307, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 3730 + }, + { + "epoch": 4.611590628853268, + "grad_norm": 1.0004864931106567, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 3740 + }, + { + "epoch": 4.623921085080148, + "grad_norm": 1.3193715810775757, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 3750 + }, + { + "epoch": 4.636251541307028, + "grad_norm": 0.6945042014122009, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3760 + }, + { + "epoch": 4.648581997533909, + "grad_norm": 0.8903936743736267, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 3770 + }, + { + "epoch": 4.660912453760789, + "grad_norm": 0.7960889339447021, + "learning_rate": 0.0002, + "loss": 0.3582, + "step": 3780 + }, + { + "epoch": 4.673242909987669, + "grad_norm": 1.0439172983169556, + "learning_rate": 0.0002, + "loss": 0.3864, + "step": 3790 + }, + { + "epoch": 4.68557336621455, + "grad_norm": 1.4546219110488892, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 3800 + }, + { + "epoch": 4.697903822441431, + "grad_norm": 0.8194343447685242, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 3810 + }, + { + "epoch": 4.7102342786683105, + "grad_norm": 1.0727602243423462, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 3820 + }, + { + "epoch": 4.722564734895191, + "grad_norm": 0.7785195708274841, + "learning_rate": 0.0002, + "loss": 0.4021, + "step": 3830 + }, + { + "epoch": 4.734895191122072, + "grad_norm": 0.846783459186554, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 3840 + }, + { + "epoch": 4.747225647348952, + "grad_norm": 1.0481648445129395, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3850 + }, + { + "epoch": 4.759556103575832, + "grad_norm": 0.7324008941650391, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3860 + }, + { + "epoch": 4.771886559802713, + "grad_norm": 1.06382417678833, + "learning_rate": 0.0002, + "loss": 0.3831, + "step": 3870 + }, + { + "epoch": 4.784217016029594, + "grad_norm": 0.9851241111755371, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3880 + }, + { + "epoch": 4.796547472256473, + "grad_norm": 0.8215277791023254, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 3890 + }, + { + "epoch": 4.808877928483354, + "grad_norm": 0.9901723861694336, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 3900 + }, + { + "epoch": 4.821208384710234, + "grad_norm": 0.9149112701416016, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3910 + }, + { + "epoch": 4.8335388409371145, + "grad_norm": 0.9772973656654358, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3920 + }, + { + "epoch": 4.845869297163995, + "grad_norm": 0.8889636397361755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 3930 + }, + { + "epoch": 4.858199753390876, + "grad_norm": 1.3032807111740112, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 3940 + }, + { + "epoch": 4.870530209617756, + "grad_norm": 0.8575899600982666, + "learning_rate": 0.0002, + "loss": 0.434, + "step": 3950 + }, + { + "epoch": 4.882860665844636, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3960 + }, + { + "epoch": 4.895191122071517, + "grad_norm": 1.041210651397705, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 3970 + }, + { + "epoch": 4.907521578298397, + "grad_norm": 0.9113056063652039, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 3980 + }, + { + "epoch": 4.919852034525277, + "grad_norm": 1.019347906112671, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 3990 + }, + { + "epoch": 4.932182490752158, + "grad_norm": 0.7709218859672546, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 4000 + }, + { + "epoch": 4.944512946979038, + "grad_norm": 0.8891775608062744, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 4010 + }, + { + "epoch": 4.9568434032059185, + "grad_norm": 1.0396920442581177, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 4020 + }, + { + "epoch": 4.969173859432799, + "grad_norm": 0.9239833354949951, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 4030 + }, + { + "epoch": 4.981504315659679, + "grad_norm": 1.801400065422058, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 4040 + }, + { + "epoch": 4.99383477188656, + "grad_norm": 0.6194164752960205, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 4050 + }, + { + "epoch": 5.0, + "eval_loss": 1.544758915901184, + "eval_runtime": 96.2573, + "eval_samples_per_second": 4.53, + "eval_steps_per_second": 0.571, + "step": 4055 + }, + { + "epoch": 5.00616522811344, + "grad_norm": 0.9918256998062134, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 4060 + }, + { + "epoch": 5.018495684340321, + "grad_norm": 1.4851351976394653, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 4070 + }, + { + "epoch": 5.030826140567201, + "grad_norm": 0.9237686395645142, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4080 + }, + { + "epoch": 5.0431565967940815, + "grad_norm": 1.2180852890014648, + "learning_rate": 0.0002, + "loss": 0.3072, + "step": 4090 + }, + { + "epoch": 5.055487053020962, + "grad_norm": 1.1247979402542114, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 4100 + }, + { + "epoch": 5.067817509247842, + "grad_norm": 1.2969884872436523, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4110 + }, + { + "epoch": 5.080147965474723, + "grad_norm": 1.0183063745498657, + "learning_rate": 0.0002, + "loss": 0.2858, + "step": 4120 + }, + { + "epoch": 5.092478421701603, + "grad_norm": 1.121330738067627, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 4130 + }, + { + "epoch": 5.104808877928483, + "grad_norm": 1.0748186111450195, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 4140 + }, + { + "epoch": 5.117139334155364, + "grad_norm": 1.103474736213684, + "learning_rate": 0.0002, + "loss": 0.3414, + "step": 4150 + }, + { + "epoch": 5.129469790382244, + "grad_norm": 1.2251166105270386, + "learning_rate": 0.0002, + "loss": 0.305, + "step": 4160 + }, + { + "epoch": 5.141800246609124, + "grad_norm": 0.920898973941803, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 4170 + }, + { + "epoch": 5.154130702836005, + "grad_norm": 1.327542781829834, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 4180 + }, + { + "epoch": 5.1664611590628855, + "grad_norm": 1.0677192211151123, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 4190 + }, + { + "epoch": 5.178791615289766, + "grad_norm": 0.897241473197937, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 4200 + }, + { + "epoch": 5.191122071516646, + "grad_norm": 0.977457582950592, + "learning_rate": 0.0002, + "loss": 0.2967, + "step": 4210 + }, + { + "epoch": 5.203452527743527, + "grad_norm": 1.4115267992019653, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 4220 + }, + { + "epoch": 5.215782983970407, + "grad_norm": 1.097743034362793, + "learning_rate": 0.0002, + "loss": 0.3279, + "step": 4230 + }, + { + "epoch": 5.228113440197287, + "grad_norm": 1.1095269918441772, + "learning_rate": 0.0002, + "loss": 0.293, + "step": 4240 + }, + { + "epoch": 5.240443896424168, + "grad_norm": 1.3785479068756104, + "learning_rate": 0.0002, + "loss": 0.3544, + "step": 4250 + }, + { + "epoch": 5.252774352651048, + "grad_norm": 1.0298776626586914, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4260 + }, + { + "epoch": 5.265104808877928, + "grad_norm": 1.1592111587524414, + "learning_rate": 0.0002, + "loss": 0.296, + "step": 4270 + }, + { + "epoch": 5.277435265104809, + "grad_norm": 1.2355743646621704, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 4280 + }, + { + "epoch": 5.2897657213316895, + "grad_norm": 0.8543112874031067, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 4290 + }, + { + "epoch": 5.302096177558569, + "grad_norm": 1.2953215837478638, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4300 + }, + { + "epoch": 5.31442663378545, + "grad_norm": 1.1001787185668945, + "learning_rate": 0.0002, + "loss": 0.2912, + "step": 4310 + }, + { + "epoch": 5.326757090012331, + "grad_norm": 0.7476816773414612, + "learning_rate": 0.0002, + "loss": 0.3003, + "step": 4320 + }, + { + "epoch": 5.3390875462392104, + "grad_norm": 0.8195574283599854, + "learning_rate": 0.0002, + "loss": 0.3247, + "step": 4330 + }, + { + "epoch": 5.351418002466091, + "grad_norm": 0.9490262866020203, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 4340 + }, + { + "epoch": 5.363748458692972, + "grad_norm": 1.2201412916183472, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 4350 + }, + { + "epoch": 5.376078914919852, + "grad_norm": 1.0311479568481445, + "learning_rate": 0.0002, + "loss": 0.2644, + "step": 4360 + }, + { + "epoch": 5.388409371146732, + "grad_norm": 1.2097488641738892, + "learning_rate": 0.0002, + "loss": 0.3104, + "step": 4370 + }, + { + "epoch": 5.400739827373613, + "grad_norm": 1.140942096710205, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 4380 + }, + { + "epoch": 5.413070283600494, + "grad_norm": 0.8091890811920166, + "learning_rate": 0.0002, + "loss": 0.2975, + "step": 4390 + }, + { + "epoch": 5.425400739827373, + "grad_norm": 1.4467964172363281, + "learning_rate": 0.0002, + "loss": 0.3727, + "step": 4400 + }, + { + "epoch": 5.437731196054254, + "grad_norm": 1.0836058855056763, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 4410 + }, + { + "epoch": 5.450061652281135, + "grad_norm": 1.0515433549880981, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 4420 + }, + { + "epoch": 5.4623921085080145, + "grad_norm": 0.9603073000907898, + "learning_rate": 0.0002, + "loss": 0.315, + "step": 4430 + }, + { + "epoch": 5.474722564734895, + "grad_norm": 1.234609842300415, + "learning_rate": 0.0002, + "loss": 0.3166, + "step": 4440 + }, + { + "epoch": 5.487053020961776, + "grad_norm": 0.8881428837776184, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 4450 + }, + { + "epoch": 5.499383477188656, + "grad_norm": 1.1817275285720825, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 4460 + }, + { + "epoch": 5.511713933415536, + "grad_norm": 1.213993787765503, + "learning_rate": 0.0002, + "loss": 0.2944, + "step": 4470 + }, + { + "epoch": 5.524044389642417, + "grad_norm": 1.0501725673675537, + "learning_rate": 0.0002, + "loss": 0.3136, + "step": 4480 + }, + { + "epoch": 5.536374845869297, + "grad_norm": 1.5061579942703247, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 4490 + }, + { + "epoch": 5.548705302096177, + "grad_norm": 1.1171475648880005, + "learning_rate": 0.0002, + "loss": 0.3226, + "step": 4500 + }, + { + "epoch": 5.561035758323058, + "grad_norm": 1.1147594451904297, + "learning_rate": 0.0002, + "loss": 0.3624, + "step": 4510 + }, + { + "epoch": 5.573366214549939, + "grad_norm": 1.0600544214248657, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4520 + }, + { + "epoch": 5.5856966707768185, + "grad_norm": 1.247870922088623, + "learning_rate": 0.0002, + "loss": 0.3268, + "step": 4530 + }, + { + "epoch": 5.598027127003699, + "grad_norm": 0.9425561428070068, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 4540 + }, + { + "epoch": 5.61035758323058, + "grad_norm": 1.1111550331115723, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 4550 + }, + { + "epoch": 5.62268803945746, + "grad_norm": 1.743268609046936, + "learning_rate": 0.0002, + "loss": 0.3389, + "step": 4560 + }, + { + "epoch": 5.63501849568434, + "grad_norm": 1.3522645235061646, + "learning_rate": 0.0002, + "loss": 0.31, + "step": 4570 + }, + { + "epoch": 5.647348951911221, + "grad_norm": 0.7354221343994141, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 4580 + }, + { + "epoch": 5.659679408138101, + "grad_norm": 1.050743818283081, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4590 + }, + { + "epoch": 5.6720098643649814, + "grad_norm": 1.1302396059036255, + "learning_rate": 0.0002, + "loss": 0.3449, + "step": 4600 + }, + { + "epoch": 5.684340320591862, + "grad_norm": 0.8774183392524719, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4610 + }, + { + "epoch": 5.696670776818742, + "grad_norm": 1.090781569480896, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4620 + }, + { + "epoch": 5.709001233045623, + "grad_norm": 0.9177733063697815, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4630 + }, + { + "epoch": 5.721331689272503, + "grad_norm": 0.9985341429710388, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4640 + }, + { + "epoch": 5.733662145499384, + "grad_norm": 1.0230613946914673, + "learning_rate": 0.0002, + "loss": 0.3323, + "step": 4650 + }, + { + "epoch": 5.745992601726264, + "grad_norm": 0.944656252861023, + "learning_rate": 0.0002, + "loss": 0.3525, + "step": 4660 + }, + { + "epoch": 5.758323057953144, + "grad_norm": 0.8162471652030945, + "learning_rate": 0.0002, + "loss": 0.3191, + "step": 4670 + }, + { + "epoch": 5.770653514180025, + "grad_norm": 1.0500398874282837, + "learning_rate": 0.0002, + "loss": 0.4011, + "step": 4680 + }, + { + "epoch": 5.782983970406905, + "grad_norm": 0.9487981796264648, + "learning_rate": 0.0002, + "loss": 0.3452, + "step": 4690 + }, + { + "epoch": 5.7953144266337855, + "grad_norm": 1.1856540441513062, + "learning_rate": 0.0002, + "loss": 0.2942, + "step": 4700 + }, + { + "epoch": 5.807644882860666, + "grad_norm": 1.2583396434783936, + "learning_rate": 0.0002, + "loss": 0.3107, + "step": 4710 + }, + { + "epoch": 5.819975339087546, + "grad_norm": 1.2532602548599243, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 4720 + }, + { + "epoch": 5.832305795314427, + "grad_norm": 1.115236520767212, + "learning_rate": 0.0002, + "loss": 0.3253, + "step": 4730 + }, + { + "epoch": 5.844636251541307, + "grad_norm": 1.2245537042617798, + "learning_rate": 0.0002, + "loss": 0.3539, + "step": 4740 + }, + { + "epoch": 5.856966707768187, + "grad_norm": 1.1964094638824463, + "learning_rate": 0.0002, + "loss": 0.3171, + "step": 4750 + }, + { + "epoch": 5.869297163995068, + "grad_norm": 1.0833805799484253, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4760 + }, + { + "epoch": 5.881627620221948, + "grad_norm": 1.0694046020507812, + "learning_rate": 0.0002, + "loss": 0.3511, + "step": 4770 + }, + { + "epoch": 5.893958076448829, + "grad_norm": 0.9947936534881592, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4780 + }, + { + "epoch": 5.906288532675709, + "grad_norm": 1.175716519355774, + "learning_rate": 0.0002, + "loss": 0.316, + "step": 4790 + }, + { + "epoch": 5.9186189889025895, + "grad_norm": 0.7717352509498596, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 4800 + }, + { + "epoch": 5.930949445129469, + "grad_norm": 1.2906442880630493, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 4810 + }, + { + "epoch": 5.94327990135635, + "grad_norm": 1.2416284084320068, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 4820 + }, + { + "epoch": 5.955610357583231, + "grad_norm": 1.3066956996917725, + "learning_rate": 0.0002, + "loss": 0.337, + "step": 4830 + }, + { + "epoch": 5.967940813810111, + "grad_norm": 1.0872026681900024, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 4840 + }, + { + "epoch": 5.980271270036991, + "grad_norm": 1.1941101551055908, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 4850 + }, + { + "epoch": 5.992601726263872, + "grad_norm": 1.1126095056533813, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 4860 + }, + { + "epoch": 6.0, + "eval_loss": 1.748323917388916, + "eval_runtime": 97.7488, + "eval_samples_per_second": 4.46, + "eval_steps_per_second": 0.563, + "step": 4866 + }, + { + "epoch": 6.0049321824907524, + "grad_norm": 1.3631165027618408, + "learning_rate": 0.0002, + "loss": 0.2774, + "step": 4870 + }, + { + "epoch": 6.017262638717632, + "grad_norm": 1.2631664276123047, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 4880 + }, + { + "epoch": 6.029593094944513, + "grad_norm": 0.7073080539703369, + "learning_rate": 0.0002, + "loss": 0.2177, + "step": 4890 + }, + { + "epoch": 6.041923551171394, + "grad_norm": 0.7856091856956482, + "learning_rate": 0.0002, + "loss": 0.215, + "step": 4900 + }, + { + "epoch": 6.054254007398273, + "grad_norm": 1.145540475845337, + "learning_rate": 0.0002, + "loss": 0.1999, + "step": 4910 + }, + { + "epoch": 6.066584463625154, + "grad_norm": 1.1742334365844727, + "learning_rate": 0.0002, + "loss": 0.2084, + "step": 4920 + }, + { + "epoch": 6.078914919852035, + "grad_norm": 0.8043994903564453, + "learning_rate": 0.0002, + "loss": 0.2342, + "step": 4930 + }, + { + "epoch": 6.0912453760789145, + "grad_norm": 1.1877652406692505, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4940 + }, + { + "epoch": 6.103575832305795, + "grad_norm": 0.7624953985214233, + "learning_rate": 0.0002, + "loss": 0.1908, + "step": 4950 + }, + { + "epoch": 6.115906288532676, + "grad_norm": 1.0403119325637817, + "learning_rate": 0.0002, + "loss": 0.2254, + "step": 4960 + }, + { + "epoch": 6.1282367447595565, + "grad_norm": 1.2040252685546875, + "learning_rate": 0.0002, + "loss": 0.2274, + "step": 4970 + }, + { + "epoch": 6.140567200986436, + "grad_norm": 0.6242546439170837, + "learning_rate": 0.0002, + "loss": 0.2199, + "step": 4980 + }, + { + "epoch": 6.152897657213317, + "grad_norm": 1.1394767761230469, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 4990 + }, + { + "epoch": 6.165228113440198, + "grad_norm": 1.3760257959365845, + "learning_rate": 0.0002, + "loss": 0.2377, + "step": 5000 + }, + { + "epoch": 6.177558569667077, + "grad_norm": 1.0707697868347168, + "learning_rate": 0.0002, + "loss": 0.2331, + "step": 5010 + }, + { + "epoch": 6.189889025893958, + "grad_norm": 1.288072109222412, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 5020 + }, + { + "epoch": 6.202219482120839, + "grad_norm": 1.1479463577270508, + "learning_rate": 0.0002, + "loss": 0.2276, + "step": 5030 + }, + { + "epoch": 6.2145499383477185, + "grad_norm": 0.905891478061676, + "learning_rate": 0.0002, + "loss": 0.2294, + "step": 5040 + }, + { + "epoch": 6.226880394574599, + "grad_norm": 1.0354516506195068, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 5050 + }, + { + "epoch": 6.23921085080148, + "grad_norm": 1.312671184539795, + "learning_rate": 0.0002, + "loss": 0.2259, + "step": 5060 + }, + { + "epoch": 6.25154130702836, + "grad_norm": 1.614709734916687, + "learning_rate": 0.0002, + "loss": 0.2281, + "step": 5070 + }, + { + "epoch": 6.26387176325524, + "grad_norm": 1.0864229202270508, + "learning_rate": 0.0002, + "loss": 0.2388, + "step": 5080 + }, + { + "epoch": 6.276202219482121, + "grad_norm": 1.0401391983032227, + "learning_rate": 0.0002, + "loss": 0.2014, + "step": 5090 + }, + { + "epoch": 6.288532675709002, + "grad_norm": 1.2187728881835938, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 5100 + }, + { + "epoch": 6.300863131935881, + "grad_norm": 0.9474364519119263, + "learning_rate": 0.0002, + "loss": 0.2144, + "step": 5110 + }, + { + "epoch": 6.313193588162762, + "grad_norm": 1.1228716373443604, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 5120 + }, + { + "epoch": 6.325524044389643, + "grad_norm": 0.9294499754905701, + "learning_rate": 0.0002, + "loss": 0.2556, + "step": 5130 + }, + { + "epoch": 6.337854500616523, + "grad_norm": 1.0521048307418823, + "learning_rate": 0.0002, + "loss": 0.2384, + "step": 5140 + }, + { + "epoch": 6.350184956843403, + "grad_norm": 1.2406890392303467, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 5150 + }, + { + "epoch": 6.362515413070284, + "grad_norm": 1.2972853183746338, + "learning_rate": 0.0002, + "loss": 0.2301, + "step": 5160 + }, + { + "epoch": 6.374845869297164, + "grad_norm": 0.8772842288017273, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 5170 + }, + { + "epoch": 6.387176325524044, + "grad_norm": 1.050349473953247, + "learning_rate": 0.0002, + "loss": 0.2337, + "step": 5180 + }, + { + "epoch": 6.399506781750925, + "grad_norm": 0.9432134032249451, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 5190 + }, + { + "epoch": 6.411837237977805, + "grad_norm": 1.11045241355896, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 5200 + }, + { + "epoch": 6.4241676942046855, + "grad_norm": 1.117530345916748, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 5210 + }, + { + "epoch": 6.436498150431566, + "grad_norm": 1.4194035530090332, + "learning_rate": 0.0002, + "loss": 0.25, + "step": 5220 + }, + { + "epoch": 6.448828606658447, + "grad_norm": 1.063950777053833, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 5230 + }, + { + "epoch": 6.461159062885327, + "grad_norm": 1.2946349382400513, + "learning_rate": 0.0002, + "loss": 0.2299, + "step": 5240 + }, + { + "epoch": 6.473489519112207, + "grad_norm": 1.5237880945205688, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 5250 + }, + { + "epoch": 6.485819975339088, + "grad_norm": 1.1915720701217651, + "learning_rate": 0.0002, + "loss": 0.255, + "step": 5260 + }, + { + "epoch": 6.498150431565968, + "grad_norm": 1.0779626369476318, + "learning_rate": 0.0002, + "loss": 0.2357, + "step": 5270 + }, + { + "epoch": 6.510480887792848, + "grad_norm": 0.8255738019943237, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 5280 + }, + { + "epoch": 6.522811344019729, + "grad_norm": 1.275174856185913, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 5290 + }, + { + "epoch": 6.535141800246609, + "grad_norm": 1.0878815650939941, + "learning_rate": 0.0002, + "loss": 0.2217, + "step": 5300 + }, + { + "epoch": 6.5474722564734895, + "grad_norm": 1.2594236135482788, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 5310 + }, + { + "epoch": 6.55980271270037, + "grad_norm": 0.9919610619544983, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 5320 + }, + { + "epoch": 6.57213316892725, + "grad_norm": 1.3703680038452148, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 5330 + }, + { + "epoch": 6.584463625154131, + "grad_norm": 1.403140902519226, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 5340 + }, + { + "epoch": 6.596794081381011, + "grad_norm": 1.3477165699005127, + "learning_rate": 0.0002, + "loss": 0.2584, + "step": 5350 + }, + { + "epoch": 6.609124537607892, + "grad_norm": 1.3145594596862793, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 5360 + }, + { + "epoch": 6.621454993834772, + "grad_norm": 0.9048973321914673, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 5370 + }, + { + "epoch": 6.633785450061652, + "grad_norm": 1.4123972654342651, + "learning_rate": 0.0002, + "loss": 0.2646, + "step": 5380 + }, + { + "epoch": 6.646115906288532, + "grad_norm": 1.3584848642349243, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 5390 + }, + { + "epoch": 6.658446362515413, + "grad_norm": 1.2085801362991333, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 5400 + }, + { + "epoch": 6.670776818742294, + "grad_norm": 1.9293283224105835, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 5410 + }, + { + "epoch": 6.683107274969174, + "grad_norm": 1.3658782243728638, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 5420 + }, + { + "epoch": 6.695437731196054, + "grad_norm": 1.2004997730255127, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 5430 + }, + { + "epoch": 6.707768187422935, + "grad_norm": 1.0671268701553345, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 5440 + }, + { + "epoch": 6.720098643649815, + "grad_norm": 0.8877466320991516, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 5450 + }, + { + "epoch": 6.732429099876695, + "grad_norm": 1.2843106985092163, + "learning_rate": 0.0002, + "loss": 0.2678, + "step": 5460 + }, + { + "epoch": 6.744759556103576, + "grad_norm": 1.0663448572158813, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 5470 + }, + { + "epoch": 6.7570900123304565, + "grad_norm": 1.3155773878097534, + "learning_rate": 0.0002, + "loss": 0.2402, + "step": 5480 + }, + { + "epoch": 6.769420468557336, + "grad_norm": 1.8862448930740356, + "learning_rate": 0.0002, + "loss": 0.2559, + "step": 5490 + }, + { + "epoch": 6.781750924784217, + "grad_norm": 1.165061116218567, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 5500 + }, + { + "epoch": 6.794081381011098, + "grad_norm": 1.0968598127365112, + "learning_rate": 0.0002, + "loss": 0.2342, + "step": 5510 + }, + { + "epoch": 6.806411837237977, + "grad_norm": 0.9448091983795166, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 5520 + }, + { + "epoch": 6.818742293464858, + "grad_norm": 1.400767207145691, + "learning_rate": 0.0002, + "loss": 0.2609, + "step": 5530 + }, + { + "epoch": 6.831072749691739, + "grad_norm": 1.1031112670898438, + "learning_rate": 0.0002, + "loss": 0.2642, + "step": 5540 + }, + { + "epoch": 6.843403205918619, + "grad_norm": 1.2436904907226562, + "learning_rate": 0.0002, + "loss": 0.2534, + "step": 5550 + }, + { + "epoch": 6.855733662145499, + "grad_norm": 1.0987974405288696, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 5560 + }, + { + "epoch": 6.86806411837238, + "grad_norm": 0.8656415939331055, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 5570 + }, + { + "epoch": 6.8803945745992605, + "grad_norm": 1.2153927087783813, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 5580 + }, + { + "epoch": 6.89272503082614, + "grad_norm": 1.111377477645874, + "learning_rate": 0.0002, + "loss": 0.2888, + "step": 5590 + }, + { + "epoch": 6.905055487053021, + "grad_norm": 1.0041896104812622, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 5600 + }, + { + "epoch": 6.917385943279902, + "grad_norm": 1.0638413429260254, + "learning_rate": 0.0002, + "loss": 0.2654, + "step": 5610 + }, + { + "epoch": 6.929716399506781, + "grad_norm": 0.9756764769554138, + "learning_rate": 0.0002, + "loss": 0.2364, + "step": 5620 + }, + { + "epoch": 6.942046855733662, + "grad_norm": 1.153550624847412, + "learning_rate": 0.0002, + "loss": 0.2756, + "step": 5630 + }, + { + "epoch": 6.954377311960543, + "grad_norm": 1.3393985033035278, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 5640 + }, + { + "epoch": 6.9667077681874225, + "grad_norm": 1.3233463764190674, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 5650 + }, + { + "epoch": 6.979038224414303, + "grad_norm": 1.1693105697631836, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 5660 + }, + { + "epoch": 6.991368680641184, + "grad_norm": 0.7186262607574463, + "learning_rate": 0.0002, + "loss": 0.278, + "step": 5670 + }, + { + "epoch": 7.0, + "eval_loss": 1.936746597290039, + "eval_runtime": 99.4259, + "eval_samples_per_second": 4.385, + "eval_steps_per_second": 0.553, + "step": 5677 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.915919839310643e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-5677/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b010916a0d0274a90be6d410a100d6c5b321b4b6 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13bdf9c4c059a9b32eb35f8009e5aaf6680dd1577a27da3fc53b22bfbd67df9a +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e416fe30658f67c4c28d14a269a390854af37b9b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d8d60430eec12206d1a0c3bd850e504e449bc44f770231a5abefe34f2637c3 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..097a3a63abf91df9d6e7c0ad6341796f06509028 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca8bf286917b91590b2f5eaaad41f8787561a71f6a72904069eaf56f09eaeba +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6dd2dee8fb0438e02e84ccd773f96e6538cfb5ec --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ed4f10a73a447786a0f9f6c0f576e554f45a4d7edc138e651cdd655c3c7450 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4268b2c2808636a45c3093c9e246e02e9055be97 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/trainer_state.json @@ -0,0 +1,4633 @@ +{ + "best_metric": 1.238026738166809, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 6488, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + }, + { + "epoch": 1.0110974106041923, + "grad_norm": 0.26760655641555786, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 820 + }, + { + "epoch": 1.0234278668310728, + "grad_norm": 0.4640820026397705, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 830 + }, + { + "epoch": 1.0357583230579532, + "grad_norm": 0.2699166238307953, + "learning_rate": 0.0002, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.0480887792848335, + "grad_norm": 0.3441709578037262, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 850 + }, + { + "epoch": 1.060419235511714, + "grad_norm": 0.299934983253479, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 860 + }, + { + "epoch": 1.0727496917385944, + "grad_norm": 0.2980666160583496, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 870 + }, + { + "epoch": 1.0850801479654748, + "grad_norm": 0.3131714463233948, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 880 + }, + { + "epoch": 1.097410604192355, + "grad_norm": 0.29881617426872253, + "learning_rate": 0.0002, + "loss": 0.9288, + "step": 890 + }, + { + "epoch": 1.1097410604192355, + "grad_norm": 0.29870888590812683, + "learning_rate": 0.0002, + "loss": 0.998, + "step": 900 + }, + { + "epoch": 1.122071516646116, + "grad_norm": 0.5735140442848206, + "learning_rate": 0.0002, + "loss": 0.9924, + "step": 910 + }, + { + "epoch": 1.1344019728729964, + "grad_norm": 0.33159002661705017, + "learning_rate": 0.0002, + "loss": 1.0694, + "step": 920 + }, + { + "epoch": 1.1467324290998766, + "grad_norm": 1.235399842262268, + "learning_rate": 0.0002, + "loss": 1.0069, + "step": 930 + }, + { + "epoch": 1.159062885326757, + "grad_norm": 0.27469736337661743, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 940 + }, + { + "epoch": 1.1713933415536375, + "grad_norm": 0.29130664467811584, + "learning_rate": 0.0002, + "loss": 0.9386, + "step": 950 + }, + { + "epoch": 1.183723797780518, + "grad_norm": 0.3730354607105255, + "learning_rate": 0.0002, + "loss": 0.8919, + "step": 960 + }, + { + "epoch": 1.1960542540073984, + "grad_norm": 0.5973590612411499, + "learning_rate": 0.0002, + "loss": 0.9988, + "step": 970 + }, + { + "epoch": 1.2083847102342786, + "grad_norm": 0.39631304144859314, + "learning_rate": 0.0002, + "loss": 0.9525, + "step": 980 + }, + { + "epoch": 1.220715166461159, + "grad_norm": 0.849051296710968, + "learning_rate": 0.0002, + "loss": 0.9217, + "step": 990 + }, + { + "epoch": 1.2330456226880395, + "grad_norm": 0.4390525817871094, + "learning_rate": 0.0002, + "loss": 1.0903, + "step": 1000 + }, + { + "epoch": 1.2453760789149197, + "grad_norm": 0.30423852801322937, + "learning_rate": 0.0002, + "loss": 0.9018, + "step": 1010 + }, + { + "epoch": 1.2577065351418002, + "grad_norm": 0.34736061096191406, + "learning_rate": 0.0002, + "loss": 1.0128, + "step": 1020 + }, + { + "epoch": 1.2700369913686806, + "grad_norm": 0.3421604037284851, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 1030 + }, + { + "epoch": 1.282367447595561, + "grad_norm": 0.544170081615448, + "learning_rate": 0.0002, + "loss": 0.8485, + "step": 1040 + }, + { + "epoch": 1.2946979038224415, + "grad_norm": 0.5128790736198425, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 1050 + }, + { + "epoch": 1.3070283600493218, + "grad_norm": 0.443344384431839, + "learning_rate": 0.0002, + "loss": 0.9214, + "step": 1060 + }, + { + "epoch": 1.3193588162762022, + "grad_norm": 0.6380868554115295, + "learning_rate": 0.0002, + "loss": 0.9367, + "step": 1070 + }, + { + "epoch": 1.3316892725030827, + "grad_norm": 0.4638073146343231, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 1080 + }, + { + "epoch": 1.344019728729963, + "grad_norm": 0.32406893372535706, + "learning_rate": 0.0002, + "loss": 0.8645, + "step": 1090 + }, + { + "epoch": 1.3563501849568433, + "grad_norm": 0.3955065608024597, + "learning_rate": 0.0002, + "loss": 0.8278, + "step": 1100 + }, + { + "epoch": 1.3686806411837238, + "grad_norm": 0.3489246666431427, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 1110 + }, + { + "epoch": 1.3810110974106042, + "grad_norm": 0.48451653122901917, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 1120 + }, + { + "epoch": 1.3933415536374847, + "grad_norm": 0.3652360439300537, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 1130 + }, + { + "epoch": 1.405672009864365, + "grad_norm": 1.3097436428070068, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 1140 + }, + { + "epoch": 1.4180024660912454, + "grad_norm": 0.3647715449333191, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 1150 + }, + { + "epoch": 1.4303329223181258, + "grad_norm": 0.37248560786247253, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 1160 + }, + { + "epoch": 1.442663378545006, + "grad_norm": 0.4639643430709839, + "learning_rate": 0.0002, + "loss": 0.936, + "step": 1170 + }, + { + "epoch": 1.4549938347718865, + "grad_norm": 0.5455219745635986, + "learning_rate": 0.0002, + "loss": 0.9511, + "step": 1180 + }, + { + "epoch": 1.467324290998767, + "grad_norm": 0.38862571120262146, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 1190 + }, + { + "epoch": 1.4796547472256474, + "grad_norm": 0.37586215138435364, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 1200 + }, + { + "epoch": 1.4919852034525278, + "grad_norm": 0.46244436502456665, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1210 + }, + { + "epoch": 1.504315659679408, + "grad_norm": 0.3570359945297241, + "learning_rate": 0.0002, + "loss": 0.9388, + "step": 1220 + }, + { + "epoch": 1.5166461159062885, + "grad_norm": 0.28393083810806274, + "learning_rate": 0.0002, + "loss": 0.971, + "step": 1230 + }, + { + "epoch": 1.528976572133169, + "grad_norm": 0.5672869682312012, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 1240 + }, + { + "epoch": 1.5413070283600492, + "grad_norm": 0.41605108976364136, + "learning_rate": 0.0002, + "loss": 0.8787, + "step": 1250 + }, + { + "epoch": 1.5536374845869299, + "grad_norm": 0.40657493472099304, + "learning_rate": 0.0002, + "loss": 0.8744, + "step": 1260 + }, + { + "epoch": 1.56596794081381, + "grad_norm": 0.43672341108322144, + "learning_rate": 0.0002, + "loss": 0.9046, + "step": 1270 + }, + { + "epoch": 1.5782983970406905, + "grad_norm": 0.3065410554409027, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 1280 + }, + { + "epoch": 1.590628853267571, + "grad_norm": 0.37826645374298096, + "learning_rate": 0.0002, + "loss": 0.9499, + "step": 1290 + }, + { + "epoch": 1.6029593094944512, + "grad_norm": 0.42307335138320923, + "learning_rate": 0.0002, + "loss": 0.901, + "step": 1300 + }, + { + "epoch": 1.6152897657213316, + "grad_norm": 0.3648843467235565, + "learning_rate": 0.0002, + "loss": 0.8673, + "step": 1310 + }, + { + "epoch": 1.627620221948212, + "grad_norm": 0.8921076059341431, + "learning_rate": 0.0002, + "loss": 0.9302, + "step": 1320 + }, + { + "epoch": 1.6399506781750923, + "grad_norm": 0.37522226572036743, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 1330 + }, + { + "epoch": 1.652281134401973, + "grad_norm": 0.7489957809448242, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 1340 + }, + { + "epoch": 1.6646115906288532, + "grad_norm": 0.31733131408691406, + "learning_rate": 0.0002, + "loss": 0.9297, + "step": 1350 + }, + { + "epoch": 1.6769420468557337, + "grad_norm": 0.3249478340148926, + "learning_rate": 0.0002, + "loss": 0.907, + "step": 1360 + }, + { + "epoch": 1.6892725030826141, + "grad_norm": 0.3178001344203949, + "learning_rate": 0.0002, + "loss": 1.0197, + "step": 1370 + }, + { + "epoch": 1.7016029593094943, + "grad_norm": 0.5674093961715698, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 1380 + }, + { + "epoch": 1.7139334155363748, + "grad_norm": 0.35272449254989624, + "learning_rate": 0.0002, + "loss": 0.8972, + "step": 1390 + }, + { + "epoch": 1.7262638717632552, + "grad_norm": 0.5778217911720276, + "learning_rate": 0.0002, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 1.7385943279901355, + "grad_norm": 0.33561450242996216, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 1410 + }, + { + "epoch": 1.7509247842170161, + "grad_norm": 0.31735464930534363, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 1420 + }, + { + "epoch": 1.7632552404438964, + "grad_norm": 1.0612670183181763, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 1430 + }, + { + "epoch": 1.7755856966707768, + "grad_norm": 0.5442509651184082, + "learning_rate": 0.0002, + "loss": 0.8224, + "step": 1440 + }, + { + "epoch": 1.7879161528976573, + "grad_norm": 0.7471332550048828, + "learning_rate": 0.0002, + "loss": 0.9275, + "step": 1450 + }, + { + "epoch": 1.8002466091245375, + "grad_norm": 0.4323609173297882, + "learning_rate": 0.0002, + "loss": 0.9389, + "step": 1460 + }, + { + "epoch": 1.8125770653514182, + "grad_norm": 0.47796759009361267, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1470 + }, + { + "epoch": 1.8249075215782984, + "grad_norm": 0.3348400592803955, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 1480 + }, + { + "epoch": 1.8372379778051788, + "grad_norm": 0.3354550898075104, + "learning_rate": 0.0002, + "loss": 0.9793, + "step": 1490 + }, + { + "epoch": 1.8495684340320593, + "grad_norm": 0.5988477468490601, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 1500 + }, + { + "epoch": 1.8618988902589395, + "grad_norm": 0.5222318172454834, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 1510 + }, + { + "epoch": 1.87422934648582, + "grad_norm": 0.5246642827987671, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 1520 + }, + { + "epoch": 1.8865598027127004, + "grad_norm": 0.3164594769477844, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 1530 + }, + { + "epoch": 1.8988902589395806, + "grad_norm": 0.3496174216270447, + "learning_rate": 0.0002, + "loss": 0.9961, + "step": 1540 + }, + { + "epoch": 1.9112207151664613, + "grad_norm": 0.8863359689712524, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 1550 + }, + { + "epoch": 1.9235511713933415, + "grad_norm": 0.3587026298046112, + "learning_rate": 0.0002, + "loss": 0.9405, + "step": 1560 + }, + { + "epoch": 1.935881627620222, + "grad_norm": 0.6052881479263306, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1570 + }, + { + "epoch": 1.9482120838471024, + "grad_norm": 0.567269504070282, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 1580 + }, + { + "epoch": 1.9605425400739827, + "grad_norm": 0.45184487104415894, + "learning_rate": 0.0002, + "loss": 0.9581, + "step": 1590 + }, + { + "epoch": 1.972872996300863, + "grad_norm": 0.5028569102287292, + "learning_rate": 0.0002, + "loss": 0.9147, + "step": 1600 + }, + { + "epoch": 1.9852034525277436, + "grad_norm": 0.4677547216415405, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 1610 + }, + { + "epoch": 1.9975339087546238, + "grad_norm": 0.35106056928634644, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 1620 + }, + { + "epoch": 2.0, + "eval_loss": 1.238026738166809, + "eval_runtime": 95.4287, + "eval_samples_per_second": 4.569, + "eval_steps_per_second": 0.576, + "step": 1622 + }, + { + "epoch": 2.0098643649815044, + "grad_norm": 0.444060355424881, + "learning_rate": 0.0002, + "loss": 0.7629, + "step": 1630 + }, + { + "epoch": 2.0221948212083847, + "grad_norm": 0.627570390701294, + "learning_rate": 0.0002, + "loss": 0.772, + "step": 1640 + }, + { + "epoch": 2.034525277435265, + "grad_norm": 0.38737839460372925, + "learning_rate": 0.0002, + "loss": 0.6186, + "step": 1650 + }, + { + "epoch": 2.0468557336621456, + "grad_norm": 0.4300459623336792, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 1660 + }, + { + "epoch": 2.059186189889026, + "grad_norm": 0.43037715554237366, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 1670 + }, + { + "epoch": 2.0715166461159065, + "grad_norm": 0.40772515535354614, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 1680 + }, + { + "epoch": 2.0838471023427867, + "grad_norm": 0.5295451879501343, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1690 + }, + { + "epoch": 2.096177558569667, + "grad_norm": 0.7452750205993652, + "learning_rate": 0.0002, + "loss": 0.7717, + "step": 1700 + }, + { + "epoch": 2.1085080147965476, + "grad_norm": 0.809183657169342, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 1710 + }, + { + "epoch": 2.120838471023428, + "grad_norm": 0.4597688913345337, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 1720 + }, + { + "epoch": 2.133168927250308, + "grad_norm": 0.806919276714325, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 1730 + }, + { + "epoch": 2.1454993834771887, + "grad_norm": 0.3755643665790558, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 1740 + }, + { + "epoch": 2.157829839704069, + "grad_norm": 0.5882734060287476, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1750 + }, + { + "epoch": 2.1701602959309496, + "grad_norm": 0.692960798740387, + "learning_rate": 0.0002, + "loss": 0.6782, + "step": 1760 + }, + { + "epoch": 2.18249075215783, + "grad_norm": 0.4737096428871155, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 1770 + }, + { + "epoch": 2.19482120838471, + "grad_norm": 0.6637021899223328, + "learning_rate": 0.0002, + "loss": 0.758, + "step": 1780 + }, + { + "epoch": 2.2071516646115907, + "grad_norm": 0.9109764099121094, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 1790 + }, + { + "epoch": 2.219482120838471, + "grad_norm": 0.4137539267539978, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 1800 + }, + { + "epoch": 2.2318125770653516, + "grad_norm": 0.44995415210723877, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 1810 + }, + { + "epoch": 2.244143033292232, + "grad_norm": 0.5985036492347717, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 1820 + }, + { + "epoch": 2.256473489519112, + "grad_norm": 0.7549490332603455, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 1830 + }, + { + "epoch": 2.2688039457459928, + "grad_norm": 0.4490937888622284, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 1840 + }, + { + "epoch": 2.281134401972873, + "grad_norm": 0.38859808444976807, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 1850 + }, + { + "epoch": 2.293464858199753, + "grad_norm": 1.0704916715621948, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 1860 + }, + { + "epoch": 2.305795314426634, + "grad_norm": 0.4647100865840912, + "learning_rate": 0.0002, + "loss": 0.7143, + "step": 1870 + }, + { + "epoch": 2.318125770653514, + "grad_norm": 0.6181163787841797, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 1880 + }, + { + "epoch": 2.3304562268803943, + "grad_norm": 0.9241904020309448, + "learning_rate": 0.0002, + "loss": 0.7689, + "step": 1890 + }, + { + "epoch": 2.342786683107275, + "grad_norm": 0.39101317524909973, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 1900 + }, + { + "epoch": 2.3551171393341552, + "grad_norm": 0.49442458152770996, + "learning_rate": 0.0002, + "loss": 0.7079, + "step": 1910 + }, + { + "epoch": 2.367447595561036, + "grad_norm": 0.4864824414253235, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 1920 + }, + { + "epoch": 2.379778051787916, + "grad_norm": 0.5427613854408264, + "learning_rate": 0.0002, + "loss": 0.7434, + "step": 1930 + }, + { + "epoch": 2.392108508014797, + "grad_norm": 0.7164974808692932, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1940 + }, + { + "epoch": 2.404438964241677, + "grad_norm": 0.562979519367218, + "learning_rate": 0.0002, + "loss": 0.6888, + "step": 1950 + }, + { + "epoch": 2.4167694204685573, + "grad_norm": 0.5631861090660095, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 1960 + }, + { + "epoch": 2.429099876695438, + "grad_norm": 0.4895121157169342, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 1970 + }, + { + "epoch": 2.441430332922318, + "grad_norm": 0.45674824714660645, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1980 + }, + { + "epoch": 2.4537607891491984, + "grad_norm": 1.1424206495285034, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 1990 + }, + { + "epoch": 2.466091245376079, + "grad_norm": 0.6314579844474792, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 2000 + }, + { + "epoch": 2.4784217016029593, + "grad_norm": 0.5481605529785156, + "learning_rate": 0.0002, + "loss": 0.7118, + "step": 2010 + }, + { + "epoch": 2.4907521578298395, + "grad_norm": 0.4671579599380493, + "learning_rate": 0.0002, + "loss": 0.6947, + "step": 2020 + }, + { + "epoch": 2.50308261405672, + "grad_norm": 0.7621194124221802, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 2030 + }, + { + "epoch": 2.5154130702836004, + "grad_norm": 0.38983288407325745, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 2.5277435265104806, + "grad_norm": 0.6341150999069214, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 2050 + }, + { + "epoch": 2.5400739827373613, + "grad_norm": 0.7151971459388733, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 2060 + }, + { + "epoch": 2.5524044389642415, + "grad_norm": 0.9665895104408264, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 2070 + }, + { + "epoch": 2.564734895191122, + "grad_norm": 0.9572727680206299, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 2080 + }, + { + "epoch": 2.5770653514180024, + "grad_norm": 1.1970765590667725, + "learning_rate": 0.0002, + "loss": 0.7205, + "step": 2090 + }, + { + "epoch": 2.589395807644883, + "grad_norm": 0.5505942702293396, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 2100 + }, + { + "epoch": 2.6017262638717633, + "grad_norm": 0.5903949737548828, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 2110 + }, + { + "epoch": 2.6140567200986435, + "grad_norm": 0.45640307664871216, + "learning_rate": 0.0002, + "loss": 0.678, + "step": 2120 + }, + { + "epoch": 2.626387176325524, + "grad_norm": 0.8763944506645203, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 2130 + }, + { + "epoch": 2.6387176325524044, + "grad_norm": 0.4472963213920593, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 2140 + }, + { + "epoch": 2.6510480887792847, + "grad_norm": 0.5335086584091187, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 2150 + }, + { + "epoch": 2.6633785450061653, + "grad_norm": 0.805263340473175, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 2160 + }, + { + "epoch": 2.6757090012330456, + "grad_norm": 0.6332727670669556, + "learning_rate": 0.0002, + "loss": 0.7026, + "step": 2170 + }, + { + "epoch": 2.688039457459926, + "grad_norm": 0.8667435646057129, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 2180 + }, + { + "epoch": 2.7003699136868065, + "grad_norm": 0.5638955235481262, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2190 + }, + { + "epoch": 2.7127003699136867, + "grad_norm": 0.4176250696182251, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 2200 + }, + { + "epoch": 2.7250308261405674, + "grad_norm": 0.6013461351394653, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 2210 + }, + { + "epoch": 2.7373612823674476, + "grad_norm": 0.553961992263794, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 2220 + }, + { + "epoch": 2.7496917385943282, + "grad_norm": 0.4710180461406708, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 2230 + }, + { + "epoch": 2.7620221948212085, + "grad_norm": 0.8141706585884094, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 2240 + }, + { + "epoch": 2.7743526510480887, + "grad_norm": 0.7449556589126587, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 2250 + }, + { + "epoch": 2.7866831072749694, + "grad_norm": 0.5366780757904053, + "learning_rate": 0.0002, + "loss": 0.6933, + "step": 2260 + }, + { + "epoch": 2.7990135635018496, + "grad_norm": 0.5316720604896545, + "learning_rate": 0.0002, + "loss": 0.7192, + "step": 2270 + }, + { + "epoch": 2.81134401972873, + "grad_norm": 0.4598459005355835, + "learning_rate": 0.0002, + "loss": 0.6212, + "step": 2280 + }, + { + "epoch": 2.8236744759556105, + "grad_norm": 0.6852091550827026, + "learning_rate": 0.0002, + "loss": 0.7024, + "step": 2290 + }, + { + "epoch": 2.8360049321824907, + "grad_norm": 0.8040902018547058, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 2300 + }, + { + "epoch": 2.848335388409371, + "grad_norm": 0.46976321935653687, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 2310 + }, + { + "epoch": 2.8606658446362516, + "grad_norm": 0.5214090347290039, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 2320 + }, + { + "epoch": 2.872996300863132, + "grad_norm": 0.5323054790496826, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 2330 + }, + { + "epoch": 2.885326757090012, + "grad_norm": 0.6842264533042908, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2340 + }, + { + "epoch": 2.8976572133168927, + "grad_norm": 0.9157055616378784, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2350 + }, + { + "epoch": 2.909987669543773, + "grad_norm": 0.5253258347511292, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 2360 + }, + { + "epoch": 2.9223181257706536, + "grad_norm": 0.4937705099582672, + "learning_rate": 0.0002, + "loss": 0.7162, + "step": 2370 + }, + { + "epoch": 2.934648581997534, + "grad_norm": 0.48762989044189453, + "learning_rate": 0.0002, + "loss": 0.7008, + "step": 2380 + }, + { + "epoch": 2.9469790382244145, + "grad_norm": 0.544335126876831, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 2390 + }, + { + "epoch": 2.9593094944512948, + "grad_norm": 0.4847845435142517, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 2400 + }, + { + "epoch": 2.971639950678175, + "grad_norm": 0.4787445366382599, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 2410 + }, + { + "epoch": 2.9839704069050557, + "grad_norm": 1.022318959236145, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 2420 + }, + { + "epoch": 2.996300863131936, + "grad_norm": 0.4987848103046417, + "learning_rate": 0.0002, + "loss": 0.6548, + "step": 2430 + }, + { + "epoch": 3.0, + "eval_loss": 1.2936296463012695, + "eval_runtime": 94.7897, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 0.58, + "step": 2433 + }, + { + "epoch": 3.008631319358816, + "grad_norm": 0.5562372803688049, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 2440 + }, + { + "epoch": 3.020961775585697, + "grad_norm": 1.133402705192566, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 2450 + }, + { + "epoch": 3.033292231812577, + "grad_norm": 0.6480470299720764, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 2460 + }, + { + "epoch": 3.0456226880394572, + "grad_norm": 0.8989138007164001, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 2470 + }, + { + "epoch": 3.057953144266338, + "grad_norm": 0.8257461786270142, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 2480 + }, + { + "epoch": 3.070283600493218, + "grad_norm": 0.6813381910324097, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 2490 + }, + { + "epoch": 3.082614056720099, + "grad_norm": 0.6989586353302002, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 2500 + }, + { + "epoch": 3.094944512946979, + "grad_norm": 0.7992092967033386, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 2510 + }, + { + "epoch": 3.1072749691738593, + "grad_norm": 0.698077917098999, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 2520 + }, + { + "epoch": 3.11960542540074, + "grad_norm": 0.5699033141136169, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 2530 + }, + { + "epoch": 3.13193588162762, + "grad_norm": 0.6142355799674988, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 2540 + }, + { + "epoch": 3.144266337854501, + "grad_norm": 0.7089933753013611, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 2550 + }, + { + "epoch": 3.156596794081381, + "grad_norm": 1.0107015371322632, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 2560 + }, + { + "epoch": 3.1689272503082613, + "grad_norm": 0.568138837814331, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 2570 + }, + { + "epoch": 3.181257706535142, + "grad_norm": 0.9960416555404663, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 2580 + }, + { + "epoch": 3.193588162762022, + "grad_norm": 0.6277595162391663, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 2590 + }, + { + "epoch": 3.2059186189889024, + "grad_norm": 0.681083619594574, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 2600 + }, + { + "epoch": 3.218249075215783, + "grad_norm": 0.5816057324409485, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 2610 + }, + { + "epoch": 3.2305795314426633, + "grad_norm": 0.7197734117507935, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 2620 + }, + { + "epoch": 3.242909987669544, + "grad_norm": 0.6524068117141724, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 2630 + }, + { + "epoch": 3.255240443896424, + "grad_norm": 1.273668646812439, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 2640 + }, + { + "epoch": 3.2675709001233044, + "grad_norm": 0.6950451731681824, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 2650 + }, + { + "epoch": 3.279901356350185, + "grad_norm": 0.8029071688652039, + "learning_rate": 0.0002, + "loss": 0.5194, + "step": 2660 + }, + { + "epoch": 3.2922318125770653, + "grad_norm": 0.7464073896408081, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 2670 + }, + { + "epoch": 3.304562268803946, + "grad_norm": 0.8342001438140869, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 2680 + }, + { + "epoch": 3.316892725030826, + "grad_norm": 0.5629868507385254, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 2690 + }, + { + "epoch": 3.3292231812577064, + "grad_norm": 0.753999650478363, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 2700 + }, + { + "epoch": 3.341553637484587, + "grad_norm": 1.0271371603012085, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 2710 + }, + { + "epoch": 3.3538840937114673, + "grad_norm": 0.9608535170555115, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 2720 + }, + { + "epoch": 3.3662145499383476, + "grad_norm": 0.7796488404273987, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 2730 + }, + { + "epoch": 3.3785450061652282, + "grad_norm": 0.5666437149047852, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 2740 + }, + { + "epoch": 3.3908754623921085, + "grad_norm": 0.5462956428527832, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 2750 + }, + { + "epoch": 3.4032059186189887, + "grad_norm": 1.289099097251892, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 2760 + }, + { + "epoch": 3.4155363748458694, + "grad_norm": 0.825566828250885, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 2770 + }, + { + "epoch": 3.4278668310727496, + "grad_norm": 0.8366670608520508, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 2780 + }, + { + "epoch": 3.4401972872996303, + "grad_norm": 1.0931549072265625, + "learning_rate": 0.0002, + "loss": 0.5732, + "step": 2790 + }, + { + "epoch": 3.4525277435265105, + "grad_norm": 0.9228858351707458, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 3.4648581997533907, + "grad_norm": 1.3182806968688965, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 2810 + }, + { + "epoch": 3.4771886559802714, + "grad_norm": 0.8366976380348206, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 2820 + }, + { + "epoch": 3.4895191122071516, + "grad_norm": 0.8067695498466492, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 2830 + }, + { + "epoch": 3.5018495684340323, + "grad_norm": 1.1163437366485596, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 2840 + }, + { + "epoch": 3.5141800246609125, + "grad_norm": 1.7196556329727173, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 2850 + }, + { + "epoch": 3.5265104808877927, + "grad_norm": 1.1267012357711792, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 2860 + }, + { + "epoch": 3.5388409371146734, + "grad_norm": 0.7220137119293213, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 2870 + }, + { + "epoch": 3.5511713933415536, + "grad_norm": 0.914114773273468, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 2880 + }, + { + "epoch": 3.563501849568434, + "grad_norm": 0.6193503141403198, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 2890 + }, + { + "epoch": 3.5758323057953145, + "grad_norm": 0.6060135960578918, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 2900 + }, + { + "epoch": 3.5881627620221948, + "grad_norm": 1.0177327394485474, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 2910 + }, + { + "epoch": 3.600493218249075, + "grad_norm": 0.5994468331336975, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 2920 + }, + { + "epoch": 3.6128236744759556, + "grad_norm": 0.7450457215309143, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 2930 + }, + { + "epoch": 3.625154130702836, + "grad_norm": 0.5825870037078857, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 2940 + }, + { + "epoch": 3.6374845869297165, + "grad_norm": 0.6289743781089783, + "learning_rate": 0.0002, + "loss": 0.6016, + "step": 2950 + }, + { + "epoch": 3.6498150431565968, + "grad_norm": 0.7801929116249084, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 2960 + }, + { + "epoch": 3.6621454993834774, + "grad_norm": 1.1206634044647217, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 2970 + }, + { + "epoch": 3.6744759556103577, + "grad_norm": 0.6738817691802979, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 2980 + }, + { + "epoch": 3.686806411837238, + "grad_norm": 1.1917344331741333, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 2990 + }, + { + "epoch": 3.6991368680641186, + "grad_norm": 1.3738657236099243, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 3000 + }, + { + "epoch": 3.711467324290999, + "grad_norm": 0.6642793416976929, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 3010 + }, + { + "epoch": 3.723797780517879, + "grad_norm": 0.9030995965003967, + "learning_rate": 0.0002, + "loss": 0.6243, + "step": 3020 + }, + { + "epoch": 3.7361282367447597, + "grad_norm": 1.0203914642333984, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 3030 + }, + { + "epoch": 3.74845869297164, + "grad_norm": 0.648394763469696, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 3040 + }, + { + "epoch": 3.76078914919852, + "grad_norm": 0.6304570436477661, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 3050 + }, + { + "epoch": 3.773119605425401, + "grad_norm": 0.8286601901054382, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 3060 + }, + { + "epoch": 3.785450061652281, + "grad_norm": 0.906444251537323, + "learning_rate": 0.0002, + "loss": 0.5302, + "step": 3070 + }, + { + "epoch": 3.7977805178791613, + "grad_norm": 1.4212149381637573, + "learning_rate": 0.0002, + "loss": 0.5345, + "step": 3080 + }, + { + "epoch": 3.810110974106042, + "grad_norm": 0.7574319839477539, + "learning_rate": 0.0002, + "loss": 0.6403, + "step": 3090 + }, + { + "epoch": 3.822441430332922, + "grad_norm": 0.6534451246261597, + "learning_rate": 0.0002, + "loss": 0.5756, + "step": 3100 + }, + { + "epoch": 3.834771886559803, + "grad_norm": 0.7525447010993958, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 3110 + }, + { + "epoch": 3.847102342786683, + "grad_norm": 0.6513990759849548, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 3120 + }, + { + "epoch": 3.8594327990135637, + "grad_norm": 0.7782694697380066, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 3130 + }, + { + "epoch": 3.871763255240444, + "grad_norm": 0.7998530268669128, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 3140 + }, + { + "epoch": 3.884093711467324, + "grad_norm": 0.8045353293418884, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 3150 + }, + { + "epoch": 3.896424167694205, + "grad_norm": 0.8242645263671875, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 3160 + }, + { + "epoch": 3.908754623921085, + "grad_norm": 0.8302360773086548, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 3170 + }, + { + "epoch": 3.9210850801479653, + "grad_norm": 0.8653109073638916, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 3180 + }, + { + "epoch": 3.933415536374846, + "grad_norm": 0.6461338996887207, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 3190 + }, + { + "epoch": 3.945745992601726, + "grad_norm": 0.8267415165901184, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 3200 + }, + { + "epoch": 3.9580764488286064, + "grad_norm": 1.1963194608688354, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 3210 + }, + { + "epoch": 3.970406905055487, + "grad_norm": 0.7101966142654419, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 3220 + }, + { + "epoch": 3.9827373612823673, + "grad_norm": 0.5931660532951355, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 3230 + }, + { + "epoch": 3.995067817509248, + "grad_norm": 0.7465988993644714, + "learning_rate": 0.0002, + "loss": 0.5619, + "step": 3240 + }, + { + "epoch": 4.0, + "eval_loss": 1.4066498279571533, + "eval_runtime": 95.7145, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 0.575, + "step": 3244 + }, + { + "epoch": 4.007398273736128, + "grad_norm": 0.9478800296783447, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 3250 + }, + { + "epoch": 4.019728729963009, + "grad_norm": 1.207059621810913, + "learning_rate": 0.0002, + "loss": 0.4129, + "step": 3260 + }, + { + "epoch": 4.032059186189889, + "grad_norm": 0.8984074592590332, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 3270 + }, + { + "epoch": 4.044389642416769, + "grad_norm": 0.8104140758514404, + "learning_rate": 0.0002, + "loss": 0.3798, + "step": 3280 + }, + { + "epoch": 4.05672009864365, + "grad_norm": 1.0875468254089355, + "learning_rate": 0.0002, + "loss": 0.3657, + "step": 3290 + }, + { + "epoch": 4.06905055487053, + "grad_norm": 0.8520309329032898, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 3300 + }, + { + "epoch": 4.0813810110974105, + "grad_norm": 1.076735496520996, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 3310 + }, + { + "epoch": 4.093711467324291, + "grad_norm": 0.7789369821548462, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 3320 + }, + { + "epoch": 4.106041923551172, + "grad_norm": 0.916862964630127, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 3330 + }, + { + "epoch": 4.118372379778052, + "grad_norm": 1.1251654624938965, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3340 + }, + { + "epoch": 4.130702836004932, + "grad_norm": 0.9373420476913452, + "learning_rate": 0.0002, + "loss": 0.3651, + "step": 3350 + }, + { + "epoch": 4.143033292231813, + "grad_norm": 1.03253972530365, + "learning_rate": 0.0002, + "loss": 0.384, + "step": 3360 + }, + { + "epoch": 4.155363748458693, + "grad_norm": 0.947023332118988, + "learning_rate": 0.0002, + "loss": 0.372, + "step": 3370 + }, + { + "epoch": 4.167694204685573, + "grad_norm": 0.8709157109260559, + "learning_rate": 0.0002, + "loss": 0.4018, + "step": 3380 + }, + { + "epoch": 4.180024660912454, + "grad_norm": 0.930983304977417, + "learning_rate": 0.0002, + "loss": 0.3754, + "step": 3390 + }, + { + "epoch": 4.192355117139334, + "grad_norm": 1.092809796333313, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 3400 + }, + { + "epoch": 4.2046855733662145, + "grad_norm": 0.8454303741455078, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 3410 + }, + { + "epoch": 4.217016029593095, + "grad_norm": 0.957210123538971, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3420 + }, + { + "epoch": 4.229346485819975, + "grad_norm": 0.854333758354187, + "learning_rate": 0.0002, + "loss": 0.3743, + "step": 3430 + }, + { + "epoch": 4.241676942046856, + "grad_norm": 1.0457639694213867, + "learning_rate": 0.0002, + "loss": 0.4041, + "step": 3440 + }, + { + "epoch": 4.254007398273736, + "grad_norm": 0.8972977995872498, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 3450 + }, + { + "epoch": 4.266337854500616, + "grad_norm": 1.0438238382339478, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 3460 + }, + { + "epoch": 4.278668310727497, + "grad_norm": 0.7000405192375183, + "learning_rate": 0.0002, + "loss": 0.4078, + "step": 3470 + }, + { + "epoch": 4.290998766954377, + "grad_norm": 1.0451240539550781, + "learning_rate": 0.0002, + "loss": 0.3718, + "step": 3480 + }, + { + "epoch": 4.303329223181258, + "grad_norm": 1.3339767456054688, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 3490 + }, + { + "epoch": 4.315659679408138, + "grad_norm": 0.7503946423530579, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 3500 + }, + { + "epoch": 4.3279901356350186, + "grad_norm": 0.8443584442138672, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 3510 + }, + { + "epoch": 4.340320591861899, + "grad_norm": 1.1681201457977295, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 3520 + }, + { + "epoch": 4.352651048088779, + "grad_norm": 1.078883171081543, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 3530 + }, + { + "epoch": 4.36498150431566, + "grad_norm": 0.6894834041595459, + "learning_rate": 0.0002, + "loss": 0.4216, + "step": 3540 + }, + { + "epoch": 4.37731196054254, + "grad_norm": 0.7059480547904968, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 3550 + }, + { + "epoch": 4.38964241676942, + "grad_norm": 1.1807256937026978, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 3560 + }, + { + "epoch": 4.401972872996301, + "grad_norm": 0.8341359496116638, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 3570 + }, + { + "epoch": 4.4143033292231815, + "grad_norm": 1.0273033380508423, + "learning_rate": 0.0002, + "loss": 0.4123, + "step": 3580 + }, + { + "epoch": 4.426633785450061, + "grad_norm": 0.6916454434394836, + "learning_rate": 0.0002, + "loss": 0.5018, + "step": 3590 + }, + { + "epoch": 4.438964241676942, + "grad_norm": 0.8210113644599915, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 3600 + }, + { + "epoch": 4.451294697903823, + "grad_norm": 1.0309500694274902, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 3610 + }, + { + "epoch": 4.463625154130703, + "grad_norm": 0.8847399353981018, + "learning_rate": 0.0002, + "loss": 0.3902, + "step": 3620 + }, + { + "epoch": 4.475955610357583, + "grad_norm": 1.668636679649353, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 3630 + }, + { + "epoch": 4.488286066584464, + "grad_norm": 1.3087958097457886, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 3640 + }, + { + "epoch": 4.500616522811344, + "grad_norm": 0.837852418422699, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 3650 + }, + { + "epoch": 4.512946979038224, + "grad_norm": 9.7662353515625, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 3660 + }, + { + "epoch": 4.525277435265105, + "grad_norm": 1.125719428062439, + "learning_rate": 0.0002, + "loss": 0.4033, + "step": 3670 + }, + { + "epoch": 4.5376078914919855, + "grad_norm": 0.7755377292633057, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 3680 + }, + { + "epoch": 4.549938347718865, + "grad_norm": 0.7185089588165283, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 3690 + }, + { + "epoch": 4.562268803945746, + "grad_norm": 1.182063102722168, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 3700 + }, + { + "epoch": 4.574599260172627, + "grad_norm": 1.001197338104248, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3710 + }, + { + "epoch": 4.586929716399506, + "grad_norm": 0.9705429077148438, + "learning_rate": 0.0002, + "loss": 0.4493, + "step": 3720 + }, + { + "epoch": 4.599260172626387, + "grad_norm": 0.7136746048927307, + "learning_rate": 0.0002, + "loss": 0.42, + "step": 3730 + }, + { + "epoch": 4.611590628853268, + "grad_norm": 1.0004864931106567, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 3740 + }, + { + "epoch": 4.623921085080148, + "grad_norm": 1.3193715810775757, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 3750 + }, + { + "epoch": 4.636251541307028, + "grad_norm": 0.6945042014122009, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 3760 + }, + { + "epoch": 4.648581997533909, + "grad_norm": 0.8903936743736267, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 3770 + }, + { + "epoch": 4.660912453760789, + "grad_norm": 0.7960889339447021, + "learning_rate": 0.0002, + "loss": 0.3582, + "step": 3780 + }, + { + "epoch": 4.673242909987669, + "grad_norm": 1.0439172983169556, + "learning_rate": 0.0002, + "loss": 0.3864, + "step": 3790 + }, + { + "epoch": 4.68557336621455, + "grad_norm": 1.4546219110488892, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 3800 + }, + { + "epoch": 4.697903822441431, + "grad_norm": 0.8194343447685242, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 3810 + }, + { + "epoch": 4.7102342786683105, + "grad_norm": 1.0727602243423462, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 3820 + }, + { + "epoch": 4.722564734895191, + "grad_norm": 0.7785195708274841, + "learning_rate": 0.0002, + "loss": 0.4021, + "step": 3830 + }, + { + "epoch": 4.734895191122072, + "grad_norm": 0.846783459186554, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 3840 + }, + { + "epoch": 4.747225647348952, + "grad_norm": 1.0481648445129395, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 3850 + }, + { + "epoch": 4.759556103575832, + "grad_norm": 0.7324008941650391, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 3860 + }, + { + "epoch": 4.771886559802713, + "grad_norm": 1.06382417678833, + "learning_rate": 0.0002, + "loss": 0.3831, + "step": 3870 + }, + { + "epoch": 4.784217016029594, + "grad_norm": 0.9851241111755371, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 3880 + }, + { + "epoch": 4.796547472256473, + "grad_norm": 0.8215277791023254, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 3890 + }, + { + "epoch": 4.808877928483354, + "grad_norm": 0.9901723861694336, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 3900 + }, + { + "epoch": 4.821208384710234, + "grad_norm": 0.9149112701416016, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 3910 + }, + { + "epoch": 4.8335388409371145, + "grad_norm": 0.9772973656654358, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3920 + }, + { + "epoch": 4.845869297163995, + "grad_norm": 0.8889636397361755, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 3930 + }, + { + "epoch": 4.858199753390876, + "grad_norm": 1.3032807111740112, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 3940 + }, + { + "epoch": 4.870530209617756, + "grad_norm": 0.8575899600982666, + "learning_rate": 0.0002, + "loss": 0.434, + "step": 3950 + }, + { + "epoch": 4.882860665844636, + "grad_norm": 1.04326331615448, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 3960 + }, + { + "epoch": 4.895191122071517, + "grad_norm": 1.041210651397705, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 3970 + }, + { + "epoch": 4.907521578298397, + "grad_norm": 0.9113056063652039, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 3980 + }, + { + "epoch": 4.919852034525277, + "grad_norm": 1.019347906112671, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 3990 + }, + { + "epoch": 4.932182490752158, + "grad_norm": 0.7709218859672546, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 4000 + }, + { + "epoch": 4.944512946979038, + "grad_norm": 0.8891775608062744, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 4010 + }, + { + "epoch": 4.9568434032059185, + "grad_norm": 1.0396920442581177, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 4020 + }, + { + "epoch": 4.969173859432799, + "grad_norm": 0.9239833354949951, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 4030 + }, + { + "epoch": 4.981504315659679, + "grad_norm": 1.801400065422058, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 4040 + }, + { + "epoch": 4.99383477188656, + "grad_norm": 0.6194164752960205, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 4050 + }, + { + "epoch": 5.0, + "eval_loss": 1.544758915901184, + "eval_runtime": 96.2573, + "eval_samples_per_second": 4.53, + "eval_steps_per_second": 0.571, + "step": 4055 + }, + { + "epoch": 5.00616522811344, + "grad_norm": 0.9918256998062134, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 4060 + }, + { + "epoch": 5.018495684340321, + "grad_norm": 1.4851351976394653, + "learning_rate": 0.0002, + "loss": 0.2887, + "step": 4070 + }, + { + "epoch": 5.030826140567201, + "grad_norm": 0.9237686395645142, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4080 + }, + { + "epoch": 5.0431565967940815, + "grad_norm": 1.2180852890014648, + "learning_rate": 0.0002, + "loss": 0.3072, + "step": 4090 + }, + { + "epoch": 5.055487053020962, + "grad_norm": 1.1247979402542114, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 4100 + }, + { + "epoch": 5.067817509247842, + "grad_norm": 1.2969884872436523, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4110 + }, + { + "epoch": 5.080147965474723, + "grad_norm": 1.0183063745498657, + "learning_rate": 0.0002, + "loss": 0.2858, + "step": 4120 + }, + { + "epoch": 5.092478421701603, + "grad_norm": 1.121330738067627, + "learning_rate": 0.0002, + "loss": 0.295, + "step": 4130 + }, + { + "epoch": 5.104808877928483, + "grad_norm": 1.0748186111450195, + "learning_rate": 0.0002, + "loss": 0.2697, + "step": 4140 + }, + { + "epoch": 5.117139334155364, + "grad_norm": 1.103474736213684, + "learning_rate": 0.0002, + "loss": 0.3414, + "step": 4150 + }, + { + "epoch": 5.129469790382244, + "grad_norm": 1.2251166105270386, + "learning_rate": 0.0002, + "loss": 0.305, + "step": 4160 + }, + { + "epoch": 5.141800246609124, + "grad_norm": 0.920898973941803, + "learning_rate": 0.0002, + "loss": 0.3131, + "step": 4170 + }, + { + "epoch": 5.154130702836005, + "grad_norm": 1.327542781829834, + "learning_rate": 0.0002, + "loss": 0.281, + "step": 4180 + }, + { + "epoch": 5.1664611590628855, + "grad_norm": 1.0677192211151123, + "learning_rate": 0.0002, + "loss": 0.3214, + "step": 4190 + }, + { + "epoch": 5.178791615289766, + "grad_norm": 0.897241473197937, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 4200 + }, + { + "epoch": 5.191122071516646, + "grad_norm": 0.977457582950592, + "learning_rate": 0.0002, + "loss": 0.2967, + "step": 4210 + }, + { + "epoch": 5.203452527743527, + "grad_norm": 1.4115267992019653, + "learning_rate": 0.0002, + "loss": 0.3032, + "step": 4220 + }, + { + "epoch": 5.215782983970407, + "grad_norm": 1.097743034362793, + "learning_rate": 0.0002, + "loss": 0.3279, + "step": 4230 + }, + { + "epoch": 5.228113440197287, + "grad_norm": 1.1095269918441772, + "learning_rate": 0.0002, + "loss": 0.293, + "step": 4240 + }, + { + "epoch": 5.240443896424168, + "grad_norm": 1.3785479068756104, + "learning_rate": 0.0002, + "loss": 0.3544, + "step": 4250 + }, + { + "epoch": 5.252774352651048, + "grad_norm": 1.0298776626586914, + "learning_rate": 0.0002, + "loss": 0.3118, + "step": 4260 + }, + { + "epoch": 5.265104808877928, + "grad_norm": 1.1592111587524414, + "learning_rate": 0.0002, + "loss": 0.296, + "step": 4270 + }, + { + "epoch": 5.277435265104809, + "grad_norm": 1.2355743646621704, + "learning_rate": 0.0002, + "loss": 0.2878, + "step": 4280 + }, + { + "epoch": 5.2897657213316895, + "grad_norm": 0.8543112874031067, + "learning_rate": 0.0002, + "loss": 0.3085, + "step": 4290 + }, + { + "epoch": 5.302096177558569, + "grad_norm": 1.2953215837478638, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 4300 + }, + { + "epoch": 5.31442663378545, + "grad_norm": 1.1001787185668945, + "learning_rate": 0.0002, + "loss": 0.2912, + "step": 4310 + }, + { + "epoch": 5.326757090012331, + "grad_norm": 0.7476816773414612, + "learning_rate": 0.0002, + "loss": 0.3003, + "step": 4320 + }, + { + "epoch": 5.3390875462392104, + "grad_norm": 0.8195574283599854, + "learning_rate": 0.0002, + "loss": 0.3247, + "step": 4330 + }, + { + "epoch": 5.351418002466091, + "grad_norm": 0.9490262866020203, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 4340 + }, + { + "epoch": 5.363748458692972, + "grad_norm": 1.2201412916183472, + "learning_rate": 0.0002, + "loss": 0.2846, + "step": 4350 + }, + { + "epoch": 5.376078914919852, + "grad_norm": 1.0311479568481445, + "learning_rate": 0.0002, + "loss": 0.2644, + "step": 4360 + }, + { + "epoch": 5.388409371146732, + "grad_norm": 1.2097488641738892, + "learning_rate": 0.0002, + "loss": 0.3104, + "step": 4370 + }, + { + "epoch": 5.400739827373613, + "grad_norm": 1.140942096710205, + "learning_rate": 0.0002, + "loss": 0.2977, + "step": 4380 + }, + { + "epoch": 5.413070283600494, + "grad_norm": 0.8091890811920166, + "learning_rate": 0.0002, + "loss": 0.2975, + "step": 4390 + }, + { + "epoch": 5.425400739827373, + "grad_norm": 1.4467964172363281, + "learning_rate": 0.0002, + "loss": 0.3727, + "step": 4400 + }, + { + "epoch": 5.437731196054254, + "grad_norm": 1.0836058855056763, + "learning_rate": 0.0002, + "loss": 0.2979, + "step": 4410 + }, + { + "epoch": 5.450061652281135, + "grad_norm": 1.0515433549880981, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 4420 + }, + { + "epoch": 5.4623921085080145, + "grad_norm": 0.9603073000907898, + "learning_rate": 0.0002, + "loss": 0.315, + "step": 4430 + }, + { + "epoch": 5.474722564734895, + "grad_norm": 1.234609842300415, + "learning_rate": 0.0002, + "loss": 0.3166, + "step": 4440 + }, + { + "epoch": 5.487053020961776, + "grad_norm": 0.8881428837776184, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 4450 + }, + { + "epoch": 5.499383477188656, + "grad_norm": 1.1817275285720825, + "learning_rate": 0.0002, + "loss": 0.3725, + "step": 4460 + }, + { + "epoch": 5.511713933415536, + "grad_norm": 1.213993787765503, + "learning_rate": 0.0002, + "loss": 0.2944, + "step": 4470 + }, + { + "epoch": 5.524044389642417, + "grad_norm": 1.0501725673675537, + "learning_rate": 0.0002, + "loss": 0.3136, + "step": 4480 + }, + { + "epoch": 5.536374845869297, + "grad_norm": 1.5061579942703247, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 4490 + }, + { + "epoch": 5.548705302096177, + "grad_norm": 1.1171475648880005, + "learning_rate": 0.0002, + "loss": 0.3226, + "step": 4500 + }, + { + "epoch": 5.561035758323058, + "grad_norm": 1.1147594451904297, + "learning_rate": 0.0002, + "loss": 0.3624, + "step": 4510 + }, + { + "epoch": 5.573366214549939, + "grad_norm": 1.0600544214248657, + "learning_rate": 0.0002, + "loss": 0.3435, + "step": 4520 + }, + { + "epoch": 5.5856966707768185, + "grad_norm": 1.247870922088623, + "learning_rate": 0.0002, + "loss": 0.3268, + "step": 4530 + }, + { + "epoch": 5.598027127003699, + "grad_norm": 0.9425561428070068, + "learning_rate": 0.0002, + "loss": 0.3168, + "step": 4540 + }, + { + "epoch": 5.61035758323058, + "grad_norm": 1.1111550331115723, + "learning_rate": 0.0002, + "loss": 0.3119, + "step": 4550 + }, + { + "epoch": 5.62268803945746, + "grad_norm": 1.743268609046936, + "learning_rate": 0.0002, + "loss": 0.3389, + "step": 4560 + }, + { + "epoch": 5.63501849568434, + "grad_norm": 1.3522645235061646, + "learning_rate": 0.0002, + "loss": 0.31, + "step": 4570 + }, + { + "epoch": 5.647348951911221, + "grad_norm": 0.7354221343994141, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 4580 + }, + { + "epoch": 5.659679408138101, + "grad_norm": 1.050743818283081, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 4590 + }, + { + "epoch": 5.6720098643649814, + "grad_norm": 1.1302396059036255, + "learning_rate": 0.0002, + "loss": 0.3449, + "step": 4600 + }, + { + "epoch": 5.684340320591862, + "grad_norm": 0.8774183392524719, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4610 + }, + { + "epoch": 5.696670776818742, + "grad_norm": 1.090781569480896, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 4620 + }, + { + "epoch": 5.709001233045623, + "grad_norm": 0.9177733063697815, + "learning_rate": 0.0002, + "loss": 0.3211, + "step": 4630 + }, + { + "epoch": 5.721331689272503, + "grad_norm": 0.9985341429710388, + "learning_rate": 0.0002, + "loss": 0.3439, + "step": 4640 + }, + { + "epoch": 5.733662145499384, + "grad_norm": 1.0230613946914673, + "learning_rate": 0.0002, + "loss": 0.3323, + "step": 4650 + }, + { + "epoch": 5.745992601726264, + "grad_norm": 0.944656252861023, + "learning_rate": 0.0002, + "loss": 0.3525, + "step": 4660 + }, + { + "epoch": 5.758323057953144, + "grad_norm": 0.8162471652030945, + "learning_rate": 0.0002, + "loss": 0.3191, + "step": 4670 + }, + { + "epoch": 5.770653514180025, + "grad_norm": 1.0500398874282837, + "learning_rate": 0.0002, + "loss": 0.4011, + "step": 4680 + }, + { + "epoch": 5.782983970406905, + "grad_norm": 0.9487981796264648, + "learning_rate": 0.0002, + "loss": 0.3452, + "step": 4690 + }, + { + "epoch": 5.7953144266337855, + "grad_norm": 1.1856540441513062, + "learning_rate": 0.0002, + "loss": 0.2942, + "step": 4700 + }, + { + "epoch": 5.807644882860666, + "grad_norm": 1.2583396434783936, + "learning_rate": 0.0002, + "loss": 0.3107, + "step": 4710 + }, + { + "epoch": 5.819975339087546, + "grad_norm": 1.2532602548599243, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 4720 + }, + { + "epoch": 5.832305795314427, + "grad_norm": 1.115236520767212, + "learning_rate": 0.0002, + "loss": 0.3253, + "step": 4730 + }, + { + "epoch": 5.844636251541307, + "grad_norm": 1.2245537042617798, + "learning_rate": 0.0002, + "loss": 0.3539, + "step": 4740 + }, + { + "epoch": 5.856966707768187, + "grad_norm": 1.1964094638824463, + "learning_rate": 0.0002, + "loss": 0.3171, + "step": 4750 + }, + { + "epoch": 5.869297163995068, + "grad_norm": 1.0833805799484253, + "learning_rate": 0.0002, + "loss": 0.3623, + "step": 4760 + }, + { + "epoch": 5.881627620221948, + "grad_norm": 1.0694046020507812, + "learning_rate": 0.0002, + "loss": 0.3511, + "step": 4770 + }, + { + "epoch": 5.893958076448829, + "grad_norm": 0.9947936534881592, + "learning_rate": 0.0002, + "loss": 0.3266, + "step": 4780 + }, + { + "epoch": 5.906288532675709, + "grad_norm": 1.175716519355774, + "learning_rate": 0.0002, + "loss": 0.316, + "step": 4790 + }, + { + "epoch": 5.9186189889025895, + "grad_norm": 0.7717352509498596, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 4800 + }, + { + "epoch": 5.930949445129469, + "grad_norm": 1.2906442880630493, + "learning_rate": 0.0002, + "loss": 0.3058, + "step": 4810 + }, + { + "epoch": 5.94327990135635, + "grad_norm": 1.2416284084320068, + "learning_rate": 0.0002, + "loss": 0.3187, + "step": 4820 + }, + { + "epoch": 5.955610357583231, + "grad_norm": 1.3066956996917725, + "learning_rate": 0.0002, + "loss": 0.337, + "step": 4830 + }, + { + "epoch": 5.967940813810111, + "grad_norm": 1.0872026681900024, + "learning_rate": 0.0002, + "loss": 0.3167, + "step": 4840 + }, + { + "epoch": 5.980271270036991, + "grad_norm": 1.1941101551055908, + "learning_rate": 0.0002, + "loss": 0.3262, + "step": 4850 + }, + { + "epoch": 5.992601726263872, + "grad_norm": 1.1126095056533813, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 4860 + }, + { + "epoch": 6.0, + "eval_loss": 1.748323917388916, + "eval_runtime": 97.7488, + "eval_samples_per_second": 4.46, + "eval_steps_per_second": 0.563, + "step": 4866 + }, + { + "epoch": 6.0049321824907524, + "grad_norm": 1.3631165027618408, + "learning_rate": 0.0002, + "loss": 0.2774, + "step": 4870 + }, + { + "epoch": 6.017262638717632, + "grad_norm": 1.2631664276123047, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 4880 + }, + { + "epoch": 6.029593094944513, + "grad_norm": 0.7073080539703369, + "learning_rate": 0.0002, + "loss": 0.2177, + "step": 4890 + }, + { + "epoch": 6.041923551171394, + "grad_norm": 0.7856091856956482, + "learning_rate": 0.0002, + "loss": 0.215, + "step": 4900 + }, + { + "epoch": 6.054254007398273, + "grad_norm": 1.145540475845337, + "learning_rate": 0.0002, + "loss": 0.1999, + "step": 4910 + }, + { + "epoch": 6.066584463625154, + "grad_norm": 1.1742334365844727, + "learning_rate": 0.0002, + "loss": 0.2084, + "step": 4920 + }, + { + "epoch": 6.078914919852035, + "grad_norm": 0.8043994903564453, + "learning_rate": 0.0002, + "loss": 0.2342, + "step": 4930 + }, + { + "epoch": 6.0912453760789145, + "grad_norm": 1.1877652406692505, + "learning_rate": 0.0002, + "loss": 0.2454, + "step": 4940 + }, + { + "epoch": 6.103575832305795, + "grad_norm": 0.7624953985214233, + "learning_rate": 0.0002, + "loss": 0.1908, + "step": 4950 + }, + { + "epoch": 6.115906288532676, + "grad_norm": 1.0403119325637817, + "learning_rate": 0.0002, + "loss": 0.2254, + "step": 4960 + }, + { + "epoch": 6.1282367447595565, + "grad_norm": 1.2040252685546875, + "learning_rate": 0.0002, + "loss": 0.2274, + "step": 4970 + }, + { + "epoch": 6.140567200986436, + "grad_norm": 0.6242546439170837, + "learning_rate": 0.0002, + "loss": 0.2199, + "step": 4980 + }, + { + "epoch": 6.152897657213317, + "grad_norm": 1.1394767761230469, + "learning_rate": 0.0002, + "loss": 0.27, + "step": 4990 + }, + { + "epoch": 6.165228113440198, + "grad_norm": 1.3760257959365845, + "learning_rate": 0.0002, + "loss": 0.2377, + "step": 5000 + }, + { + "epoch": 6.177558569667077, + "grad_norm": 1.0707697868347168, + "learning_rate": 0.0002, + "loss": 0.2331, + "step": 5010 + }, + { + "epoch": 6.189889025893958, + "grad_norm": 1.288072109222412, + "learning_rate": 0.0002, + "loss": 0.2311, + "step": 5020 + }, + { + "epoch": 6.202219482120839, + "grad_norm": 1.1479463577270508, + "learning_rate": 0.0002, + "loss": 0.2276, + "step": 5030 + }, + { + "epoch": 6.2145499383477185, + "grad_norm": 0.905891478061676, + "learning_rate": 0.0002, + "loss": 0.2294, + "step": 5040 + }, + { + "epoch": 6.226880394574599, + "grad_norm": 1.0354516506195068, + "learning_rate": 0.0002, + "loss": 0.2575, + "step": 5050 + }, + { + "epoch": 6.23921085080148, + "grad_norm": 1.312671184539795, + "learning_rate": 0.0002, + "loss": 0.2259, + "step": 5060 + }, + { + "epoch": 6.25154130702836, + "grad_norm": 1.614709734916687, + "learning_rate": 0.0002, + "loss": 0.2281, + "step": 5070 + }, + { + "epoch": 6.26387176325524, + "grad_norm": 1.0864229202270508, + "learning_rate": 0.0002, + "loss": 0.2388, + "step": 5080 + }, + { + "epoch": 6.276202219482121, + "grad_norm": 1.0401391983032227, + "learning_rate": 0.0002, + "loss": 0.2014, + "step": 5090 + }, + { + "epoch": 6.288532675709002, + "grad_norm": 1.2187728881835938, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 5100 + }, + { + "epoch": 6.300863131935881, + "grad_norm": 0.9474364519119263, + "learning_rate": 0.0002, + "loss": 0.2144, + "step": 5110 + }, + { + "epoch": 6.313193588162762, + "grad_norm": 1.1228716373443604, + "learning_rate": 0.0002, + "loss": 0.238, + "step": 5120 + }, + { + "epoch": 6.325524044389643, + "grad_norm": 0.9294499754905701, + "learning_rate": 0.0002, + "loss": 0.2556, + "step": 5130 + }, + { + "epoch": 6.337854500616523, + "grad_norm": 1.0521048307418823, + "learning_rate": 0.0002, + "loss": 0.2384, + "step": 5140 + }, + { + "epoch": 6.350184956843403, + "grad_norm": 1.2406890392303467, + "learning_rate": 0.0002, + "loss": 0.2444, + "step": 5150 + }, + { + "epoch": 6.362515413070284, + "grad_norm": 1.2972853183746338, + "learning_rate": 0.0002, + "loss": 0.2301, + "step": 5160 + }, + { + "epoch": 6.374845869297164, + "grad_norm": 0.8772842288017273, + "learning_rate": 0.0002, + "loss": 0.2574, + "step": 5170 + }, + { + "epoch": 6.387176325524044, + "grad_norm": 1.050349473953247, + "learning_rate": 0.0002, + "loss": 0.2337, + "step": 5180 + }, + { + "epoch": 6.399506781750925, + "grad_norm": 0.9432134032249451, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 5190 + }, + { + "epoch": 6.411837237977805, + "grad_norm": 1.11045241355896, + "learning_rate": 0.0002, + "loss": 0.2546, + "step": 5200 + }, + { + "epoch": 6.4241676942046855, + "grad_norm": 1.117530345916748, + "learning_rate": 0.0002, + "loss": 0.268, + "step": 5210 + }, + { + "epoch": 6.436498150431566, + "grad_norm": 1.4194035530090332, + "learning_rate": 0.0002, + "loss": 0.25, + "step": 5220 + }, + { + "epoch": 6.448828606658447, + "grad_norm": 1.063950777053833, + "learning_rate": 0.0002, + "loss": 0.2335, + "step": 5230 + }, + { + "epoch": 6.461159062885327, + "grad_norm": 1.2946349382400513, + "learning_rate": 0.0002, + "loss": 0.2299, + "step": 5240 + }, + { + "epoch": 6.473489519112207, + "grad_norm": 1.5237880945205688, + "learning_rate": 0.0002, + "loss": 0.242, + "step": 5250 + }, + { + "epoch": 6.485819975339088, + "grad_norm": 1.1915720701217651, + "learning_rate": 0.0002, + "loss": 0.255, + "step": 5260 + }, + { + "epoch": 6.498150431565968, + "grad_norm": 1.0779626369476318, + "learning_rate": 0.0002, + "loss": 0.2357, + "step": 5270 + }, + { + "epoch": 6.510480887792848, + "grad_norm": 0.8255738019943237, + "learning_rate": 0.0002, + "loss": 0.2476, + "step": 5280 + }, + { + "epoch": 6.522811344019729, + "grad_norm": 1.275174856185913, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 5290 + }, + { + "epoch": 6.535141800246609, + "grad_norm": 1.0878815650939941, + "learning_rate": 0.0002, + "loss": 0.2217, + "step": 5300 + }, + { + "epoch": 6.5474722564734895, + "grad_norm": 1.2594236135482788, + "learning_rate": 0.0002, + "loss": 0.2462, + "step": 5310 + }, + { + "epoch": 6.55980271270037, + "grad_norm": 0.9919610619544983, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 5320 + }, + { + "epoch": 6.57213316892725, + "grad_norm": 1.3703680038452148, + "learning_rate": 0.0002, + "loss": 0.2933, + "step": 5330 + }, + { + "epoch": 6.584463625154131, + "grad_norm": 1.403140902519226, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 5340 + }, + { + "epoch": 6.596794081381011, + "grad_norm": 1.3477165699005127, + "learning_rate": 0.0002, + "loss": 0.2584, + "step": 5350 + }, + { + "epoch": 6.609124537607892, + "grad_norm": 1.3145594596862793, + "learning_rate": 0.0002, + "loss": 0.2853, + "step": 5360 + }, + { + "epoch": 6.621454993834772, + "grad_norm": 0.9048973321914673, + "learning_rate": 0.0002, + "loss": 0.246, + "step": 5370 + }, + { + "epoch": 6.633785450061652, + "grad_norm": 1.4123972654342651, + "learning_rate": 0.0002, + "loss": 0.2646, + "step": 5380 + }, + { + "epoch": 6.646115906288532, + "grad_norm": 1.3584848642349243, + "learning_rate": 0.0002, + "loss": 0.272, + "step": 5390 + }, + { + "epoch": 6.658446362515413, + "grad_norm": 1.2085801362991333, + "learning_rate": 0.0002, + "loss": 0.2663, + "step": 5400 + }, + { + "epoch": 6.670776818742294, + "grad_norm": 1.9293283224105835, + "learning_rate": 0.0002, + "loss": 0.2796, + "step": 5410 + }, + { + "epoch": 6.683107274969174, + "grad_norm": 1.3658782243728638, + "learning_rate": 0.0002, + "loss": 0.2412, + "step": 5420 + }, + { + "epoch": 6.695437731196054, + "grad_norm": 1.2004997730255127, + "learning_rate": 0.0002, + "loss": 0.2442, + "step": 5430 + }, + { + "epoch": 6.707768187422935, + "grad_norm": 1.0671268701553345, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 5440 + }, + { + "epoch": 6.720098643649815, + "grad_norm": 0.8877466320991516, + "learning_rate": 0.0002, + "loss": 0.2216, + "step": 5450 + }, + { + "epoch": 6.732429099876695, + "grad_norm": 1.2843106985092163, + "learning_rate": 0.0002, + "loss": 0.2678, + "step": 5460 + }, + { + "epoch": 6.744759556103576, + "grad_norm": 1.0663448572158813, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 5470 + }, + { + "epoch": 6.7570900123304565, + "grad_norm": 1.3155773878097534, + "learning_rate": 0.0002, + "loss": 0.2402, + "step": 5480 + }, + { + "epoch": 6.769420468557336, + "grad_norm": 1.8862448930740356, + "learning_rate": 0.0002, + "loss": 0.2559, + "step": 5490 + }, + { + "epoch": 6.781750924784217, + "grad_norm": 1.165061116218567, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 5500 + }, + { + "epoch": 6.794081381011098, + "grad_norm": 1.0968598127365112, + "learning_rate": 0.0002, + "loss": 0.2342, + "step": 5510 + }, + { + "epoch": 6.806411837237977, + "grad_norm": 0.9448091983795166, + "learning_rate": 0.0002, + "loss": 0.2453, + "step": 5520 + }, + { + "epoch": 6.818742293464858, + "grad_norm": 1.400767207145691, + "learning_rate": 0.0002, + "loss": 0.2609, + "step": 5530 + }, + { + "epoch": 6.831072749691739, + "grad_norm": 1.1031112670898438, + "learning_rate": 0.0002, + "loss": 0.2642, + "step": 5540 + }, + { + "epoch": 6.843403205918619, + "grad_norm": 1.2436904907226562, + "learning_rate": 0.0002, + "loss": 0.2534, + "step": 5550 + }, + { + "epoch": 6.855733662145499, + "grad_norm": 1.0987974405288696, + "learning_rate": 0.0002, + "loss": 0.2601, + "step": 5560 + }, + { + "epoch": 6.86806411837238, + "grad_norm": 0.8656415939331055, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 5570 + }, + { + "epoch": 6.8803945745992605, + "grad_norm": 1.2153927087783813, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 5580 + }, + { + "epoch": 6.89272503082614, + "grad_norm": 1.111377477645874, + "learning_rate": 0.0002, + "loss": 0.2888, + "step": 5590 + }, + { + "epoch": 6.905055487053021, + "grad_norm": 1.0041896104812622, + "learning_rate": 0.0002, + "loss": 0.2569, + "step": 5600 + }, + { + "epoch": 6.917385943279902, + "grad_norm": 1.0638413429260254, + "learning_rate": 0.0002, + "loss": 0.2654, + "step": 5610 + }, + { + "epoch": 6.929716399506781, + "grad_norm": 0.9756764769554138, + "learning_rate": 0.0002, + "loss": 0.2364, + "step": 5620 + }, + { + "epoch": 6.942046855733662, + "grad_norm": 1.153550624847412, + "learning_rate": 0.0002, + "loss": 0.2756, + "step": 5630 + }, + { + "epoch": 6.954377311960543, + "grad_norm": 1.3393985033035278, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 5640 + }, + { + "epoch": 6.9667077681874225, + "grad_norm": 1.3233463764190674, + "learning_rate": 0.0002, + "loss": 0.2793, + "step": 5650 + }, + { + "epoch": 6.979038224414303, + "grad_norm": 1.1693105697631836, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 5660 + }, + { + "epoch": 6.991368680641184, + "grad_norm": 0.7186262607574463, + "learning_rate": 0.0002, + "loss": 0.278, + "step": 5670 + }, + { + "epoch": 7.0, + "eval_loss": 1.936746597290039, + "eval_runtime": 99.4259, + "eval_samples_per_second": 4.385, + "eval_steps_per_second": 0.553, + "step": 5677 + }, + { + "epoch": 7.003699136868065, + "grad_norm": 0.9832284450531006, + "learning_rate": 0.0002, + "loss": 0.2573, + "step": 5680 + }, + { + "epoch": 7.016029593094944, + "grad_norm": 1.6794530153274536, + "learning_rate": 0.0002, + "loss": 0.1879, + "step": 5690 + }, + { + "epoch": 7.028360049321825, + "grad_norm": 1.0405313968658447, + "learning_rate": 0.0002, + "loss": 0.1612, + "step": 5700 + }, + { + "epoch": 7.040690505548706, + "grad_norm": 0.8833287954330444, + "learning_rate": 0.0002, + "loss": 0.1623, + "step": 5710 + }, + { + "epoch": 7.0530209617755855, + "grad_norm": 1.081743597984314, + "learning_rate": 0.0002, + "loss": 0.1666, + "step": 5720 + }, + { + "epoch": 7.065351418002466, + "grad_norm": 1.1786993741989136, + "learning_rate": 0.0002, + "loss": 0.1952, + "step": 5730 + }, + { + "epoch": 7.077681874229347, + "grad_norm": 1.219215989112854, + "learning_rate": 0.0002, + "loss": 0.1618, + "step": 5740 + }, + { + "epoch": 7.090012330456227, + "grad_norm": 0.882033109664917, + "learning_rate": 0.0002, + "loss": 0.165, + "step": 5750 + }, + { + "epoch": 7.102342786683107, + "grad_norm": 1.0919346809387207, + "learning_rate": 0.0002, + "loss": 0.1801, + "step": 5760 + }, + { + "epoch": 7.114673242909988, + "grad_norm": 1.2448198795318604, + "learning_rate": 0.0002, + "loss": 0.1914, + "step": 5770 + }, + { + "epoch": 7.127003699136868, + "grad_norm": 0.8977628350257874, + "learning_rate": 0.0002, + "loss": 0.1826, + "step": 5780 + }, + { + "epoch": 7.139334155363748, + "grad_norm": 1.1030590534210205, + "learning_rate": 0.0002, + "loss": 0.2119, + "step": 5790 + }, + { + "epoch": 7.151664611590629, + "grad_norm": 0.9050454497337341, + "learning_rate": 0.0002, + "loss": 0.1841, + "step": 5800 + }, + { + "epoch": 7.163995067817509, + "grad_norm": 1.2709665298461914, + "learning_rate": 0.0002, + "loss": 0.1765, + "step": 5810 + }, + { + "epoch": 7.1763255240443895, + "grad_norm": 1.7741143703460693, + "learning_rate": 0.0002, + "loss": 0.2069, + "step": 5820 + }, + { + "epoch": 7.18865598027127, + "grad_norm": 1.040995478630066, + "learning_rate": 0.0002, + "loss": 0.2189, + "step": 5830 + }, + { + "epoch": 7.200986436498151, + "grad_norm": 0.995246171951294, + "learning_rate": 0.0002, + "loss": 0.1869, + "step": 5840 + }, + { + "epoch": 7.213316892725031, + "grad_norm": 0.962523341178894, + "learning_rate": 0.0002, + "loss": 0.1727, + "step": 5850 + }, + { + "epoch": 7.225647348951911, + "grad_norm": 1.2010393142700195, + "learning_rate": 0.0002, + "loss": 0.2051, + "step": 5860 + }, + { + "epoch": 7.237977805178792, + "grad_norm": 1.1749597787857056, + "learning_rate": 0.0002, + "loss": 0.19, + "step": 5870 + }, + { + "epoch": 7.250308261405672, + "grad_norm": 1.0654889345169067, + "learning_rate": 0.0002, + "loss": 0.1799, + "step": 5880 + }, + { + "epoch": 7.262638717632552, + "grad_norm": 0.761138379573822, + "learning_rate": 0.0002, + "loss": 0.2184, + "step": 5890 + }, + { + "epoch": 7.274969173859433, + "grad_norm": 0.9512502551078796, + "learning_rate": 0.0002, + "loss": 0.1828, + "step": 5900 + }, + { + "epoch": 7.287299630086313, + "grad_norm": 0.7542949318885803, + "learning_rate": 0.0002, + "loss": 0.1655, + "step": 5910 + }, + { + "epoch": 7.2996300863131935, + "grad_norm": 0.7638646364212036, + "learning_rate": 0.0002, + "loss": 0.1985, + "step": 5920 + }, + { + "epoch": 7.311960542540074, + "grad_norm": 1.162330985069275, + "learning_rate": 0.0002, + "loss": 0.1953, + "step": 5930 + }, + { + "epoch": 7.324290998766954, + "grad_norm": 1.5835925340652466, + "learning_rate": 0.0002, + "loss": 0.204, + "step": 5940 + }, + { + "epoch": 7.336621454993835, + "grad_norm": 1.0043281316757202, + "learning_rate": 0.0002, + "loss": 0.178, + "step": 5950 + }, + { + "epoch": 7.348951911220715, + "grad_norm": 1.2750244140625, + "learning_rate": 0.0002, + "loss": 0.1819, + "step": 5960 + }, + { + "epoch": 7.361282367447595, + "grad_norm": 0.8582083582878113, + "learning_rate": 0.0002, + "loss": 0.1917, + "step": 5970 + }, + { + "epoch": 7.373612823674476, + "grad_norm": 1.0025495290756226, + "learning_rate": 0.0002, + "loss": 0.2022, + "step": 5980 + }, + { + "epoch": 7.3859432799013565, + "grad_norm": 1.030452847480774, + "learning_rate": 0.0002, + "loss": 0.1971, + "step": 5990 + }, + { + "epoch": 7.398273736128237, + "grad_norm": 0.9436936378479004, + "learning_rate": 0.0002, + "loss": 0.2026, + "step": 6000 + }, + { + "epoch": 7.410604192355117, + "grad_norm": 1.3259925842285156, + "learning_rate": 0.0002, + "loss": 0.1847, + "step": 6010 + }, + { + "epoch": 7.422934648581998, + "grad_norm": 0.884767472743988, + "learning_rate": 0.0002, + "loss": 0.1794, + "step": 6020 + }, + { + "epoch": 7.435265104808878, + "grad_norm": 0.8467209339141846, + "learning_rate": 0.0002, + "loss": 0.1712, + "step": 6030 + }, + { + "epoch": 7.447595561035758, + "grad_norm": 0.9294904470443726, + "learning_rate": 0.0002, + "loss": 0.2025, + "step": 6040 + }, + { + "epoch": 7.459926017262639, + "grad_norm": 1.2054014205932617, + "learning_rate": 0.0002, + "loss": 0.1808, + "step": 6050 + }, + { + "epoch": 7.472256473489519, + "grad_norm": 0.9458960294723511, + "learning_rate": 0.0002, + "loss": 0.193, + "step": 6060 + }, + { + "epoch": 7.484586929716399, + "grad_norm": 1.0876508951187134, + "learning_rate": 0.0002, + "loss": 0.1762, + "step": 6070 + }, + { + "epoch": 7.49691738594328, + "grad_norm": 1.110326886177063, + "learning_rate": 0.0002, + "loss": 0.1999, + "step": 6080 + }, + { + "epoch": 7.5092478421701605, + "grad_norm": 1.1584968566894531, + "learning_rate": 0.0002, + "loss": 0.1854, + "step": 6090 + }, + { + "epoch": 7.52157829839704, + "grad_norm": 1.0806410312652588, + "learning_rate": 0.0002, + "loss": 0.2059, + "step": 6100 + }, + { + "epoch": 7.533908754623921, + "grad_norm": 0.9162251353263855, + "learning_rate": 0.0002, + "loss": 0.2042, + "step": 6110 + }, + { + "epoch": 7.546239210850802, + "grad_norm": 1.044049620628357, + "learning_rate": 0.0002, + "loss": 0.1981, + "step": 6120 + }, + { + "epoch": 7.558569667077682, + "grad_norm": 0.9524619579315186, + "learning_rate": 0.0002, + "loss": 0.1857, + "step": 6130 + }, + { + "epoch": 7.570900123304562, + "grad_norm": 1.0031976699829102, + "learning_rate": 0.0002, + "loss": 0.2113, + "step": 6140 + }, + { + "epoch": 7.583230579531443, + "grad_norm": 1.342751383781433, + "learning_rate": 0.0002, + "loss": 0.2008, + "step": 6150 + }, + { + "epoch": 7.595561035758323, + "grad_norm": 1.4278815984725952, + "learning_rate": 0.0002, + "loss": 0.1895, + "step": 6160 + }, + { + "epoch": 7.607891491985203, + "grad_norm": 1.6231565475463867, + "learning_rate": 0.0002, + "loss": 0.2473, + "step": 6170 + }, + { + "epoch": 7.620221948212084, + "grad_norm": 1.0082448720932007, + "learning_rate": 0.0002, + "loss": 0.1861, + "step": 6180 + }, + { + "epoch": 7.6325524044389645, + "grad_norm": 1.1605384349822998, + "learning_rate": 0.0002, + "loss": 0.1945, + "step": 6190 + }, + { + "epoch": 7.644882860665844, + "grad_norm": 1.3302881717681885, + "learning_rate": 0.0002, + "loss": 0.196, + "step": 6200 + }, + { + "epoch": 7.657213316892725, + "grad_norm": 1.3318504095077515, + "learning_rate": 0.0002, + "loss": 0.2169, + "step": 6210 + }, + { + "epoch": 7.669543773119606, + "grad_norm": 1.265977144241333, + "learning_rate": 0.0002, + "loss": 0.1872, + "step": 6220 + }, + { + "epoch": 7.6818742293464854, + "grad_norm": 1.3964512348175049, + "learning_rate": 0.0002, + "loss": 0.1917, + "step": 6230 + }, + { + "epoch": 7.694204685573366, + "grad_norm": 1.148972988128662, + "learning_rate": 0.0002, + "loss": 0.2244, + "step": 6240 + }, + { + "epoch": 7.706535141800247, + "grad_norm": 0.8778917193412781, + "learning_rate": 0.0002, + "loss": 0.2038, + "step": 6250 + }, + { + "epoch": 7.7188655980271275, + "grad_norm": 1.3537850379943848, + "learning_rate": 0.0002, + "loss": 0.2399, + "step": 6260 + }, + { + "epoch": 7.731196054254007, + "grad_norm": 0.8741335868835449, + "learning_rate": 0.0002, + "loss": 0.194, + "step": 6270 + }, + { + "epoch": 7.743526510480888, + "grad_norm": 0.9642979502677917, + "learning_rate": 0.0002, + "loss": 0.2178, + "step": 6280 + }, + { + "epoch": 7.755856966707768, + "grad_norm": 1.4556978940963745, + "learning_rate": 0.0002, + "loss": 0.2328, + "step": 6290 + }, + { + "epoch": 7.768187422934648, + "grad_norm": 1.1485596895217896, + "learning_rate": 0.0002, + "loss": 0.2309, + "step": 6300 + }, + { + "epoch": 7.780517879161529, + "grad_norm": 1.2361459732055664, + "learning_rate": 0.0002, + "loss": 0.2178, + "step": 6310 + }, + { + "epoch": 7.79284833538841, + "grad_norm": 1.0271167755126953, + "learning_rate": 0.0002, + "loss": 0.2291, + "step": 6320 + }, + { + "epoch": 7.8051787916152895, + "grad_norm": 1.2584497928619385, + "learning_rate": 0.0002, + "loss": 0.2112, + "step": 6330 + }, + { + "epoch": 7.81750924784217, + "grad_norm": 0.9013339877128601, + "learning_rate": 0.0002, + "loss": 0.2098, + "step": 6340 + }, + { + "epoch": 7.829839704069051, + "grad_norm": 1.1033759117126465, + "learning_rate": 0.0002, + "loss": 0.1927, + "step": 6350 + }, + { + "epoch": 7.842170160295931, + "grad_norm": 1.4669054746627808, + "learning_rate": 0.0002, + "loss": 0.2294, + "step": 6360 + }, + { + "epoch": 7.854500616522811, + "grad_norm": 1.3915599584579468, + "learning_rate": 0.0002, + "loss": 0.2176, + "step": 6370 + }, + { + "epoch": 7.866831072749692, + "grad_norm": 1.6034538745880127, + "learning_rate": 0.0002, + "loss": 0.2294, + "step": 6380 + }, + { + "epoch": 7.879161528976573, + "grad_norm": 1.3022582530975342, + "learning_rate": 0.0002, + "loss": 0.2244, + "step": 6390 + }, + { + "epoch": 7.891491985203452, + "grad_norm": 1.0695449113845825, + "learning_rate": 0.0002, + "loss": 0.2316, + "step": 6400 + }, + { + "epoch": 7.903822441430333, + "grad_norm": 1.1082428693771362, + "learning_rate": 0.0002, + "loss": 0.2203, + "step": 6410 + }, + { + "epoch": 7.916152897657213, + "grad_norm": 0.9848728775978088, + "learning_rate": 0.0002, + "loss": 0.2102, + "step": 6420 + }, + { + "epoch": 7.9284833538840935, + "grad_norm": 0.8668254017829895, + "learning_rate": 0.0002, + "loss": 0.2049, + "step": 6430 + }, + { + "epoch": 7.940813810110974, + "grad_norm": 0.9431440234184265, + "learning_rate": 0.0002, + "loss": 0.2054, + "step": 6440 + }, + { + "epoch": 7.953144266337855, + "grad_norm": 1.3903534412384033, + "learning_rate": 0.0002, + "loss": 0.2364, + "step": 6450 + }, + { + "epoch": 7.965474722564735, + "grad_norm": 1.111591100692749, + "learning_rate": 0.0002, + "loss": 0.2193, + "step": 6460 + }, + { + "epoch": 7.977805178791615, + "grad_norm": 0.9858004450798035, + "learning_rate": 0.0002, + "loss": 0.2178, + "step": 6470 + }, + { + "epoch": 7.990135635018496, + "grad_norm": 0.9721771478652954, + "learning_rate": 0.0002, + "loss": 0.208, + "step": 6480 + }, + { + "epoch": 8.0, + "eval_loss": 2.145089626312256, + "eval_runtime": 99.6883, + "eval_samples_per_second": 4.374, + "eval_steps_per_second": 0.552, + "step": 6488 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.332479816355021e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-6488/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fd97101b620c486ae09c1735cab9e0cb876bdc77 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84c76a07108de4d52d6d1ca3b590035cdfa1622b60c85d1ee5ddba7d62c3e28 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1d11f1649a351743b24536f6e96451d3f17bab5 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58806af7af359649b5ade476b1a76eca6a37e287f030753631c1b49fd4aec36c +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4208fa1bcd526248f36011b20dd074140a212b21 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd9be0ee08bddb5f8e1c772ef94e9744bd61e8f63e50db22874917c2bfb606e6 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8308ca551eb626f5b968d9e93641d1bd18e6b06c --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6112a460bf71ffe48986a0a5d8e7d7f2fd904b4345f25e28a23a413c344c70 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4ffd629a71352759af56f5c650811f25735d7f4b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/trainer_state.json @@ -0,0 +1,608 @@ +{ + "best_metric": 1.246457576751709, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 811, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012330456226880395, + "grad_norm": 0.8258164525032043, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 10 + }, + { + "epoch": 0.02466091245376079, + "grad_norm": 0.4577729105949402, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 20 + }, + { + "epoch": 0.036991368680641186, + "grad_norm": 0.639807939529419, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 30 + }, + { + "epoch": 0.04932182490752158, + "grad_norm": 0.5311757922172546, + "learning_rate": 0.0002, + "loss": 1.2088, + "step": 40 + }, + { + "epoch": 0.06165228113440197, + "grad_norm": 0.386595219373703, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 50 + }, + { + "epoch": 0.07398273736128237, + "grad_norm": 0.4401357173919678, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 60 + }, + { + "epoch": 0.08631319358816276, + "grad_norm": 0.3234352171421051, + "learning_rate": 0.0002, + "loss": 1.1022, + "step": 70 + }, + { + "epoch": 0.09864364981504316, + "grad_norm": 0.29643672704696655, + "learning_rate": 0.0002, + "loss": 1.3738, + "step": 80 + }, + { + "epoch": 0.11097410604192355, + "grad_norm": 0.2941012382507324, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 90 + }, + { + "epoch": 0.12330456226880394, + "grad_norm": 0.5498173832893372, + "learning_rate": 0.0002, + "loss": 1.2067, + "step": 100 + }, + { + "epoch": 0.13563501849568435, + "grad_norm": 0.2545783519744873, + "learning_rate": 0.0002, + "loss": 1.142, + "step": 110 + }, + { + "epoch": 0.14796547472256474, + "grad_norm": 0.2984241247177124, + "learning_rate": 0.0002, + "loss": 1.0297, + "step": 120 + }, + { + "epoch": 0.16029593094944514, + "grad_norm": 0.2710968852043152, + "learning_rate": 0.0002, + "loss": 1.1591, + "step": 130 + }, + { + "epoch": 0.17262638717632553, + "grad_norm": 0.2817152142524719, + "learning_rate": 0.0002, + "loss": 1.0427, + "step": 140 + }, + { + "epoch": 0.18495684340320592, + "grad_norm": 0.41083765029907227, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 150 + }, + { + "epoch": 0.19728729963008632, + "grad_norm": 0.36536213755607605, + "learning_rate": 0.0002, + "loss": 1.3537, + "step": 160 + }, + { + "epoch": 0.2096177558569667, + "grad_norm": 0.2738671600818634, + "learning_rate": 0.0002, + "loss": 1.0132, + "step": 170 + }, + { + "epoch": 0.2219482120838471, + "grad_norm": 0.27403146028518677, + "learning_rate": 0.0002, + "loss": 0.9555, + "step": 180 + }, + { + "epoch": 0.2342786683107275, + "grad_norm": 0.4446810483932495, + "learning_rate": 0.0002, + "loss": 1.0769, + "step": 190 + }, + { + "epoch": 0.2466091245376079, + "grad_norm": 0.5295385718345642, + "learning_rate": 0.0002, + "loss": 1.0588, + "step": 200 + }, + { + "epoch": 0.2589395807644883, + "grad_norm": 0.311404824256897, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 210 + }, + { + "epoch": 0.2712700369913687, + "grad_norm": 0.2448509782552719, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 220 + }, + { + "epoch": 0.2836004932182491, + "grad_norm": 0.6507014036178589, + "learning_rate": 0.0002, + "loss": 1.0929, + "step": 230 + }, + { + "epoch": 0.2959309494451295, + "grad_norm": 0.2339320331811905, + "learning_rate": 0.0002, + "loss": 0.9875, + "step": 240 + }, + { + "epoch": 0.3082614056720099, + "grad_norm": 0.8210226893424988, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 250 + }, + { + "epoch": 0.3205918618988903, + "grad_norm": 0.27473965287208557, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 260 + }, + { + "epoch": 0.33292231812577067, + "grad_norm": 0.3051395118236542, + "learning_rate": 0.0002, + "loss": 1.0218, + "step": 270 + }, + { + "epoch": 0.34525277435265106, + "grad_norm": 0.3037777245044708, + "learning_rate": 0.0002, + "loss": 1.0286, + "step": 280 + }, + { + "epoch": 0.35758323057953145, + "grad_norm": 0.2748974859714508, + "learning_rate": 0.0002, + "loss": 1.144, + "step": 290 + }, + { + "epoch": 0.36991368680641185, + "grad_norm": 0.23656068742275238, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 300 + }, + { + "epoch": 0.38224414303329224, + "grad_norm": 0.2523384094238281, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 310 + }, + { + "epoch": 0.39457459926017263, + "grad_norm": 0.27848055958747864, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 320 + }, + { + "epoch": 0.406905055487053, + "grad_norm": 0.3204525411128998, + "learning_rate": 0.0002, + "loss": 1.0252, + "step": 330 + }, + { + "epoch": 0.4192355117139334, + "grad_norm": 0.3459707498550415, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 340 + }, + { + "epoch": 0.4315659679408138, + "grad_norm": 0.2458430379629135, + "learning_rate": 0.0002, + "loss": 1.1111, + "step": 350 + }, + { + "epoch": 0.4438964241676942, + "grad_norm": 0.5022910237312317, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 360 + }, + { + "epoch": 0.4562268803945746, + "grad_norm": 0.27076372504234314, + "learning_rate": 0.0002, + "loss": 1.1272, + "step": 370 + }, + { + "epoch": 0.468557336621455, + "grad_norm": 0.6489047408103943, + "learning_rate": 0.0002, + "loss": 1.1927, + "step": 380 + }, + { + "epoch": 0.4808877928483354, + "grad_norm": 0.3324144184589386, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 390 + }, + { + "epoch": 0.4932182490752158, + "grad_norm": 0.32813116908073425, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 400 + }, + { + "epoch": 0.5055487053020962, + "grad_norm": 0.25295355916023254, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 410 + }, + { + "epoch": 0.5178791615289766, + "grad_norm": 0.2912578880786896, + "learning_rate": 0.0002, + "loss": 0.9477, + "step": 420 + }, + { + "epoch": 0.530209617755857, + "grad_norm": 0.34780189394950867, + "learning_rate": 0.0002, + "loss": 1.0121, + "step": 430 + }, + { + "epoch": 0.5425400739827374, + "grad_norm": 0.24604526162147522, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 440 + }, + { + "epoch": 0.5548705302096177, + "grad_norm": 0.32759982347488403, + "learning_rate": 0.0002, + "loss": 1.253, + "step": 450 + }, + { + "epoch": 0.5672009864364982, + "grad_norm": 0.40810221433639526, + "learning_rate": 0.0002, + "loss": 1.1925, + "step": 460 + }, + { + "epoch": 0.5795314426633785, + "grad_norm": 0.3590679466724396, + "learning_rate": 0.0002, + "loss": 1.174, + "step": 470 + }, + { + "epoch": 0.591861898890259, + "grad_norm": 0.5656213760375977, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 480 + }, + { + "epoch": 0.6041923551171393, + "grad_norm": 0.30830657482147217, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 490 + }, + { + "epoch": 0.6165228113440198, + "grad_norm": 0.317905455827713, + "learning_rate": 0.0002, + "loss": 1.1873, + "step": 500 + }, + { + "epoch": 0.6288532675709001, + "grad_norm": 0.3254566490650177, + "learning_rate": 0.0002, + "loss": 0.9805, + "step": 510 + }, + { + "epoch": 0.6411837237977805, + "grad_norm": 0.29187721014022827, + "learning_rate": 0.0002, + "loss": 1.0384, + "step": 520 + }, + { + "epoch": 0.6535141800246609, + "grad_norm": 0.3439238965511322, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 530 + }, + { + "epoch": 0.6658446362515413, + "grad_norm": 0.20970556139945984, + "learning_rate": 0.0002, + "loss": 1.0698, + "step": 540 + }, + { + "epoch": 0.6781750924784217, + "grad_norm": 0.4022853374481201, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 550 + }, + { + "epoch": 0.6905055487053021, + "grad_norm": 0.2235759049654007, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 560 + }, + { + "epoch": 0.7028360049321825, + "grad_norm": 0.33849895000457764, + "learning_rate": 0.0002, + "loss": 1.2339, + "step": 570 + }, + { + "epoch": 0.7151664611590629, + "grad_norm": 0.34745967388153076, + "learning_rate": 0.0002, + "loss": 1.1929, + "step": 580 + }, + { + "epoch": 0.7274969173859432, + "grad_norm": 0.26041269302368164, + "learning_rate": 0.0002, + "loss": 1.1158, + "step": 590 + }, + { + "epoch": 0.7398273736128237, + "grad_norm": 0.3804777264595032, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 600 + }, + { + "epoch": 0.752157829839704, + "grad_norm": 0.2456253319978714, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 610 + }, + { + "epoch": 0.7644882860665845, + "grad_norm": 0.37838423252105713, + "learning_rate": 0.0002, + "loss": 1.0638, + "step": 620 + }, + { + "epoch": 0.7768187422934648, + "grad_norm": 0.28105494379997253, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 630 + }, + { + "epoch": 0.7891491985203453, + "grad_norm": 0.2774018943309784, + "learning_rate": 0.0002, + "loss": 1.0672, + "step": 640 + }, + { + "epoch": 0.8014796547472256, + "grad_norm": 1.8184229135513306, + "learning_rate": 0.0002, + "loss": 0.9978, + "step": 650 + }, + { + "epoch": 0.813810110974106, + "grad_norm": 0.3325096070766449, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 660 + }, + { + "epoch": 0.8261405672009864, + "grad_norm": 0.2686693072319031, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 670 + }, + { + "epoch": 0.8384710234278668, + "grad_norm": 0.3271431624889374, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 680 + }, + { + "epoch": 0.8508014796547472, + "grad_norm": 2.359999656677246, + "learning_rate": 0.0002, + "loss": 1.1116, + "step": 690 + }, + { + "epoch": 0.8631319358816276, + "grad_norm": 0.46242964267730713, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 700 + }, + { + "epoch": 0.8754623921085081, + "grad_norm": 0.34731170535087585, + "learning_rate": 0.0002, + "loss": 0.95, + "step": 710 + }, + { + "epoch": 0.8877928483353884, + "grad_norm": 0.39381715655326843, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 720 + }, + { + "epoch": 0.9001233045622689, + "grad_norm": 0.43496373295783997, + "learning_rate": 0.0002, + "loss": 1.1319, + "step": 730 + }, + { + "epoch": 0.9124537607891492, + "grad_norm": 0.32243210077285767, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 740 + }, + { + "epoch": 0.9247842170160296, + "grad_norm": 0.30396756529808044, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 750 + }, + { + "epoch": 0.93711467324291, + "grad_norm": 0.4461122751235962, + "learning_rate": 0.0002, + "loss": 1.1141, + "step": 760 + }, + { + "epoch": 0.9494451294697904, + "grad_norm": 0.24081681668758392, + "learning_rate": 0.0002, + "loss": 1.0049, + "step": 770 + }, + { + "epoch": 0.9617755856966708, + "grad_norm": 0.27461910247802734, + "learning_rate": 0.0002, + "loss": 1.0966, + "step": 780 + }, + { + "epoch": 0.9741060419235512, + "grad_norm": 0.3325668275356293, + "learning_rate": 0.0002, + "loss": 0.9942, + "step": 790 + }, + { + "epoch": 0.9864364981504316, + "grad_norm": 0.24046339094638824, + "learning_rate": 0.0002, + "loss": 1.0506, + "step": 800 + }, + { + "epoch": 0.998766954377312, + "grad_norm": 0.42950066924095154, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 810 + }, + { + "epoch": 1.0, + "eval_loss": 1.246457576751709, + "eval_runtime": 98.7974, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.557, + "step": 811 + } + ], + "logging_steps": 10, + "max_steps": 6488, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.165599770443776e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a4a305d1de4d8f47c0252b4d7fe65a10dd8e2c22 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060 +size 34362873 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..459ba5959a39ea126110113c9faed75cda19ffff --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05287f447dfde244c1a85dc0f576593b4a9dd61961eb0797da4844688fc48447 +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_log.jsonl b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bae6098617f67718b80fe759fa31675595f31ded --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 811, "epoch_duration": 2476.3049235343933, "total_accumulated_duration": 2476.3049235343933, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12786.0}, "peak_memory_reserved": {"GPU_0": 12786.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}]} +{"epoch": 2.0, "step": 1622, "epoch_duration": 2439.8832573890686, "total_accumulated_duration": 4916.188180923462, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-811", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}]} +{"epoch": 3.0, "step": 2433, "epoch_duration": 2425.9265751838684, "total_accumulated_duration": 7342.11475610733, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}]} +{"epoch": 4.0, "step": 3244, "epoch_duration": 2421.7602109909058, "total_accumulated_duration": 9763.874967098236, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}, {"eval_loss": 1.2936296463012695, "eval_runtime": 94.7897, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.58, "epoch": 3.0, "step": 2433}, {"loss": 0.6073, "grad_norm": 0.5562372803688049, "learning_rate": 0.0002, "epoch": 3.008631319358816, "step": 2440}, {"loss": 0.5181, "grad_norm": 1.133402705192566, "learning_rate": 0.0002, "epoch": 3.020961775585697, "step": 2450}, {"loss": 0.5333, "grad_norm": 0.6480470299720764, "learning_rate": 0.0002, "epoch": 3.033292231812577, "step": 2460}, {"loss": 0.4828, "grad_norm": 0.8989138007164001, "learning_rate": 0.0002, "epoch": 3.0456226880394572, "step": 2470}, {"loss": 0.5097, "grad_norm": 0.8257461786270142, "learning_rate": 0.0002, "epoch": 3.057953144266338, "step": 2480}, {"loss": 0.6229, "grad_norm": 0.6813381910324097, "learning_rate": 0.0002, "epoch": 3.070283600493218, "step": 2490}, {"loss": 0.531, "grad_norm": 0.6989586353302002, "learning_rate": 0.0002, "epoch": 3.082614056720099, "step": 2500}, {"loss": 0.54, "grad_norm": 0.7992092967033386, "learning_rate": 0.0002, "epoch": 3.094944512946979, "step": 2510}, {"loss": 0.5054, "grad_norm": 0.698077917098999, "learning_rate": 0.0002, "epoch": 3.1072749691738593, "step": 2520}, {"loss": 0.5064, "grad_norm": 0.5699033141136169, "learning_rate": 0.0002, "epoch": 3.11960542540074, "step": 2530}, {"loss": 0.6088, "grad_norm": 0.6142355799674988, "learning_rate": 0.0002, "epoch": 3.13193588162762, "step": 2540}, {"loss": 0.585, "grad_norm": 0.7089933753013611, "learning_rate": 0.0002, "epoch": 3.144266337854501, "step": 2550}, {"loss": 0.5373, "grad_norm": 1.0107015371322632, "learning_rate": 0.0002, "epoch": 3.156596794081381, "step": 2560}, {"loss": 0.5429, "grad_norm": 0.568138837814331, "learning_rate": 0.0002, "epoch": 3.1689272503082613, "step": 2570}, {"loss": 0.5897, "grad_norm": 0.9960416555404663, "learning_rate": 0.0002, "epoch": 3.181257706535142, "step": 2580}, {"loss": 0.5211, "grad_norm": 0.6277595162391663, "learning_rate": 0.0002, "epoch": 3.193588162762022, "step": 2590}, {"loss": 0.5787, "grad_norm": 0.681083619594574, "learning_rate": 0.0002, "epoch": 3.2059186189889024, "step": 2600}, {"loss": 0.5166, "grad_norm": 0.5816057324409485, "learning_rate": 0.0002, "epoch": 3.218249075215783, "step": 2610}, {"loss": 0.545, "grad_norm": 0.7197734117507935, "learning_rate": 0.0002, "epoch": 3.2305795314426633, "step": 2620}, {"loss": 0.614, "grad_norm": 0.6524068117141724, "learning_rate": 0.0002, "epoch": 3.242909987669544, "step": 2630}, {"loss": 0.5456, "grad_norm": 1.273668646812439, "learning_rate": 0.0002, "epoch": 3.255240443896424, "step": 2640}, {"loss": 0.5266, "grad_norm": 0.6950451731681824, "learning_rate": 0.0002, "epoch": 3.2675709001233044, "step": 2650}, {"loss": 0.5194, "grad_norm": 0.8029071688652039, "learning_rate": 0.0002, "epoch": 3.279901356350185, "step": 2660}, {"loss": 0.5729, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 3.2922318125770653, "step": 2670}, {"loss": 0.5366, "grad_norm": 0.8342001438140869, "learning_rate": 0.0002, "epoch": 3.304562268803946, "step": 2680}, {"loss": 0.5413, "grad_norm": 0.5629868507385254, "learning_rate": 0.0002, "epoch": 3.316892725030826, "step": 2690}, {"loss": 0.633, "grad_norm": 0.753999650478363, "learning_rate": 0.0002, "epoch": 3.3292231812577064, "step": 2700}, {"loss": 0.5048, "grad_norm": 1.0271371603012085, "learning_rate": 0.0002, "epoch": 3.341553637484587, "step": 2710}, {"loss": 0.5233, "grad_norm": 0.9608535170555115, "learning_rate": 0.0002, "epoch": 3.3538840937114673, "step": 2720}, {"loss": 0.5102, "grad_norm": 0.7796488404273987, "learning_rate": 0.0002, "epoch": 3.3662145499383476, "step": 2730}, {"loss": 0.5172, "grad_norm": 0.5666437149047852, "learning_rate": 0.0002, "epoch": 3.3785450061652282, "step": 2740}, {"loss": 0.491, "grad_norm": 0.5462956428527832, "learning_rate": 0.0002, "epoch": 3.3908754623921085, "step": 2750}, {"loss": 0.5855, "grad_norm": 1.289099097251892, "learning_rate": 0.0002, "epoch": 3.4032059186189887, "step": 2760}, {"loss": 0.635, "grad_norm": 0.825566828250885, "learning_rate": 0.0002, "epoch": 3.4155363748458694, "step": 2770}, {"loss": 0.4998, "grad_norm": 0.8366670608520508, "learning_rate": 0.0002, "epoch": 3.4278668310727496, "step": 2780}, {"loss": 0.5732, "grad_norm": 1.0931549072265625, "learning_rate": 0.0002, "epoch": 3.4401972872996303, "step": 2790}, {"loss": 0.6093, "grad_norm": 0.9228858351707458, "learning_rate": 0.0002, "epoch": 3.4525277435265105, "step": 2800}, {"loss": 0.6089, "grad_norm": 1.3182806968688965, "learning_rate": 0.0002, "epoch": 3.4648581997533907, "step": 2810}, {"loss": 0.5665, "grad_norm": 0.8366976380348206, "learning_rate": 0.0002, "epoch": 3.4771886559802714, "step": 2820}, {"loss": 0.5666, "grad_norm": 0.8067695498466492, "learning_rate": 0.0002, "epoch": 3.4895191122071516, "step": 2830}, {"loss": 0.579, "grad_norm": 1.1163437366485596, "learning_rate": 0.0002, "epoch": 3.5018495684340323, "step": 2840}, {"loss": 0.5785, "grad_norm": 1.7196556329727173, "learning_rate": 0.0002, "epoch": 3.5141800246609125, "step": 2850}, {"loss": 0.5346, "grad_norm": 1.1267012357711792, "learning_rate": 0.0002, "epoch": 3.5265104808877927, "step": 2860}, {"loss": 0.447, "grad_norm": 0.7220137119293213, "learning_rate": 0.0002, "epoch": 3.5388409371146734, "step": 2870}, {"loss": 0.6099, "grad_norm": 0.914114773273468, "learning_rate": 0.0002, "epoch": 3.5511713933415536, "step": 2880}, {"loss": 0.6143, "grad_norm": 0.6193503141403198, "learning_rate": 0.0002, "epoch": 3.563501849568434, "step": 2890}, {"loss": 0.5171, "grad_norm": 0.6060135960578918, "learning_rate": 0.0002, "epoch": 3.5758323057953145, "step": 2900}, {"loss": 0.5659, "grad_norm": 1.0177327394485474, "learning_rate": 0.0002, "epoch": 3.5881627620221948, "step": 2910}, {"loss": 0.5711, "grad_norm": 0.5994468331336975, "learning_rate": 0.0002, "epoch": 3.600493218249075, "step": 2920}, {"loss": 0.6373, "grad_norm": 0.7450457215309143, "learning_rate": 0.0002, "epoch": 3.6128236744759556, "step": 2930}, {"loss": 0.4933, "grad_norm": 0.5825870037078857, "learning_rate": 0.0002, "epoch": 3.625154130702836, "step": 2940}, {"loss": 0.6016, "grad_norm": 0.6289743781089783, "learning_rate": 0.0002, "epoch": 3.6374845869297165, "step": 2950}, {"loss": 0.5507, "grad_norm": 0.7801929116249084, "learning_rate": 0.0002, "epoch": 3.6498150431565968, "step": 2960}, {"loss": 0.5695, "grad_norm": 1.1206634044647217, "learning_rate": 0.0002, "epoch": 3.6621454993834774, "step": 2970}, {"loss": 0.4985, "grad_norm": 0.6738817691802979, "learning_rate": 0.0002, "epoch": 3.6744759556103577, "step": 2980}, {"loss": 0.6209, "grad_norm": 1.1917344331741333, "learning_rate": 0.0002, "epoch": 3.686806411837238, "step": 2990}, {"loss": 0.5373, "grad_norm": 1.3738657236099243, "learning_rate": 0.0002, "epoch": 3.6991368680641186, "step": 3000}, {"loss": 0.5467, "grad_norm": 0.6642793416976929, "learning_rate": 0.0002, "epoch": 3.711467324290999, "step": 3010}, {"loss": 0.6243, "grad_norm": 0.9030995965003967, "learning_rate": 0.0002, "epoch": 3.723797780517879, "step": 3020}, {"loss": 0.592, "grad_norm": 1.0203914642333984, "learning_rate": 0.0002, "epoch": 3.7361282367447597, "step": 3030}, {"loss": 0.5453, "grad_norm": 0.648394763469696, "learning_rate": 0.0002, "epoch": 3.74845869297164, "step": 3040}, {"loss": 0.498, "grad_norm": 0.6304570436477661, "learning_rate": 0.0002, "epoch": 3.76078914919852, "step": 3050}, {"loss": 0.683, "grad_norm": 0.8286601901054382, "learning_rate": 0.0002, "epoch": 3.773119605425401, "step": 3060}, {"loss": 0.5302, "grad_norm": 0.906444251537323, "learning_rate": 0.0002, "epoch": 3.785450061652281, "step": 3070}, {"loss": 0.5345, "grad_norm": 1.4212149381637573, "learning_rate": 0.0002, "epoch": 3.7977805178791613, "step": 3080}, {"loss": 0.6403, "grad_norm": 0.7574319839477539, "learning_rate": 0.0002, "epoch": 3.810110974106042, "step": 3090}, {"loss": 0.5756, "grad_norm": 0.6534451246261597, "learning_rate": 0.0002, "epoch": 3.822441430332922, "step": 3100}, {"loss": 0.5306, "grad_norm": 0.7525447010993958, "learning_rate": 0.0002, "epoch": 3.834771886559803, "step": 3110}, {"loss": 0.5368, "grad_norm": 0.6513990759849548, "learning_rate": 0.0002, "epoch": 3.847102342786683, "step": 3120}, {"loss": 0.5492, "grad_norm": 0.7782694697380066, "learning_rate": 0.0002, "epoch": 3.8594327990135637, "step": 3130}, {"loss": 0.5727, "grad_norm": 0.7998530268669128, "learning_rate": 0.0002, "epoch": 3.871763255240444, "step": 3140}, {"loss": 0.5156, "grad_norm": 0.8045353293418884, "learning_rate": 0.0002, "epoch": 3.884093711467324, "step": 3150}, {"loss": 0.5341, "grad_norm": 0.8242645263671875, "learning_rate": 0.0002, "epoch": 3.896424167694205, "step": 3160}, {"loss": 0.5563, "grad_norm": 0.8302360773086548, "learning_rate": 0.0002, "epoch": 3.908754623921085, "step": 3170}, {"loss": 0.5793, "grad_norm": 0.8653109073638916, "learning_rate": 0.0002, "epoch": 3.9210850801479653, "step": 3180}, {"loss": 0.5219, "grad_norm": 0.6461338996887207, "learning_rate": 0.0002, "epoch": 3.933415536374846, "step": 3190}, {"loss": 0.6009, "grad_norm": 0.8267415165901184, "learning_rate": 0.0002, "epoch": 3.945745992601726, "step": 3200}, {"loss": 0.5956, "grad_norm": 1.1963194608688354, "learning_rate": 0.0002, "epoch": 3.9580764488286064, "step": 3210}, {"loss": 0.5692, "grad_norm": 0.7101966142654419, "learning_rate": 0.0002, "epoch": 3.970406905055487, "step": 3220}, {"loss": 0.5471, "grad_norm": 0.5931660532951355, "learning_rate": 0.0002, "epoch": 3.9827373612823673, "step": 3230}, {"loss": 0.5619, "grad_norm": 0.7465988993644714, "learning_rate": 0.0002, "epoch": 3.995067817509248, "step": 3240}]} +{"epoch": 5.0, "step": 4055, "epoch_duration": 2417.7227470874786, "total_accumulated_duration": 12181.597714185715, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}, {"eval_loss": 1.2936296463012695, "eval_runtime": 94.7897, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.58, "epoch": 3.0, "step": 2433}, {"loss": 0.6073, "grad_norm": 0.5562372803688049, "learning_rate": 0.0002, "epoch": 3.008631319358816, "step": 2440}, {"loss": 0.5181, "grad_norm": 1.133402705192566, "learning_rate": 0.0002, "epoch": 3.020961775585697, "step": 2450}, {"loss": 0.5333, "grad_norm": 0.6480470299720764, "learning_rate": 0.0002, "epoch": 3.033292231812577, "step": 2460}, {"loss": 0.4828, "grad_norm": 0.8989138007164001, "learning_rate": 0.0002, "epoch": 3.0456226880394572, "step": 2470}, {"loss": 0.5097, "grad_norm": 0.8257461786270142, "learning_rate": 0.0002, "epoch": 3.057953144266338, "step": 2480}, {"loss": 0.6229, "grad_norm": 0.6813381910324097, "learning_rate": 0.0002, "epoch": 3.070283600493218, "step": 2490}, {"loss": 0.531, "grad_norm": 0.6989586353302002, "learning_rate": 0.0002, "epoch": 3.082614056720099, "step": 2500}, {"loss": 0.54, "grad_norm": 0.7992092967033386, "learning_rate": 0.0002, "epoch": 3.094944512946979, "step": 2510}, {"loss": 0.5054, "grad_norm": 0.698077917098999, "learning_rate": 0.0002, "epoch": 3.1072749691738593, "step": 2520}, {"loss": 0.5064, "grad_norm": 0.5699033141136169, "learning_rate": 0.0002, "epoch": 3.11960542540074, "step": 2530}, {"loss": 0.6088, "grad_norm": 0.6142355799674988, "learning_rate": 0.0002, "epoch": 3.13193588162762, "step": 2540}, {"loss": 0.585, "grad_norm": 0.7089933753013611, "learning_rate": 0.0002, "epoch": 3.144266337854501, "step": 2550}, {"loss": 0.5373, "grad_norm": 1.0107015371322632, "learning_rate": 0.0002, "epoch": 3.156596794081381, "step": 2560}, {"loss": 0.5429, "grad_norm": 0.568138837814331, "learning_rate": 0.0002, "epoch": 3.1689272503082613, "step": 2570}, {"loss": 0.5897, "grad_norm": 0.9960416555404663, "learning_rate": 0.0002, "epoch": 3.181257706535142, "step": 2580}, {"loss": 0.5211, "grad_norm": 0.6277595162391663, "learning_rate": 0.0002, "epoch": 3.193588162762022, "step": 2590}, {"loss": 0.5787, "grad_norm": 0.681083619594574, "learning_rate": 0.0002, "epoch": 3.2059186189889024, "step": 2600}, {"loss": 0.5166, "grad_norm": 0.5816057324409485, "learning_rate": 0.0002, "epoch": 3.218249075215783, "step": 2610}, {"loss": 0.545, "grad_norm": 0.7197734117507935, "learning_rate": 0.0002, "epoch": 3.2305795314426633, "step": 2620}, {"loss": 0.614, "grad_norm": 0.6524068117141724, "learning_rate": 0.0002, "epoch": 3.242909987669544, "step": 2630}, {"loss": 0.5456, "grad_norm": 1.273668646812439, "learning_rate": 0.0002, "epoch": 3.255240443896424, "step": 2640}, {"loss": 0.5266, "grad_norm": 0.6950451731681824, "learning_rate": 0.0002, "epoch": 3.2675709001233044, "step": 2650}, {"loss": 0.5194, "grad_norm": 0.8029071688652039, "learning_rate": 0.0002, "epoch": 3.279901356350185, "step": 2660}, {"loss": 0.5729, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 3.2922318125770653, "step": 2670}, {"loss": 0.5366, "grad_norm": 0.8342001438140869, "learning_rate": 0.0002, "epoch": 3.304562268803946, "step": 2680}, {"loss": 0.5413, "grad_norm": 0.5629868507385254, "learning_rate": 0.0002, "epoch": 3.316892725030826, "step": 2690}, {"loss": 0.633, "grad_norm": 0.753999650478363, "learning_rate": 0.0002, "epoch": 3.3292231812577064, "step": 2700}, {"loss": 0.5048, "grad_norm": 1.0271371603012085, "learning_rate": 0.0002, "epoch": 3.341553637484587, "step": 2710}, {"loss": 0.5233, "grad_norm": 0.9608535170555115, "learning_rate": 0.0002, "epoch": 3.3538840937114673, "step": 2720}, {"loss": 0.5102, "grad_norm": 0.7796488404273987, "learning_rate": 0.0002, "epoch": 3.3662145499383476, "step": 2730}, {"loss": 0.5172, "grad_norm": 0.5666437149047852, "learning_rate": 0.0002, "epoch": 3.3785450061652282, "step": 2740}, {"loss": 0.491, "grad_norm": 0.5462956428527832, "learning_rate": 0.0002, "epoch": 3.3908754623921085, "step": 2750}, {"loss": 0.5855, "grad_norm": 1.289099097251892, "learning_rate": 0.0002, "epoch": 3.4032059186189887, "step": 2760}, {"loss": 0.635, "grad_norm": 0.825566828250885, "learning_rate": 0.0002, "epoch": 3.4155363748458694, "step": 2770}, {"loss": 0.4998, "grad_norm": 0.8366670608520508, "learning_rate": 0.0002, "epoch": 3.4278668310727496, "step": 2780}, {"loss": 0.5732, "grad_norm": 1.0931549072265625, "learning_rate": 0.0002, "epoch": 3.4401972872996303, "step": 2790}, {"loss": 0.6093, "grad_norm": 0.9228858351707458, "learning_rate": 0.0002, "epoch": 3.4525277435265105, "step": 2800}, {"loss": 0.6089, "grad_norm": 1.3182806968688965, "learning_rate": 0.0002, "epoch": 3.4648581997533907, "step": 2810}, {"loss": 0.5665, "grad_norm": 0.8366976380348206, "learning_rate": 0.0002, "epoch": 3.4771886559802714, "step": 2820}, {"loss": 0.5666, "grad_norm": 0.8067695498466492, "learning_rate": 0.0002, "epoch": 3.4895191122071516, "step": 2830}, {"loss": 0.579, "grad_norm": 1.1163437366485596, "learning_rate": 0.0002, "epoch": 3.5018495684340323, "step": 2840}, {"loss": 0.5785, "grad_norm": 1.7196556329727173, "learning_rate": 0.0002, "epoch": 3.5141800246609125, "step": 2850}, {"loss": 0.5346, "grad_norm": 1.1267012357711792, "learning_rate": 0.0002, "epoch": 3.5265104808877927, "step": 2860}, {"loss": 0.447, "grad_norm": 0.7220137119293213, "learning_rate": 0.0002, "epoch": 3.5388409371146734, "step": 2870}, {"loss": 0.6099, "grad_norm": 0.914114773273468, "learning_rate": 0.0002, "epoch": 3.5511713933415536, "step": 2880}, {"loss": 0.6143, "grad_norm": 0.6193503141403198, "learning_rate": 0.0002, "epoch": 3.563501849568434, "step": 2890}, {"loss": 0.5171, "grad_norm": 0.6060135960578918, "learning_rate": 0.0002, "epoch": 3.5758323057953145, "step": 2900}, {"loss": 0.5659, "grad_norm": 1.0177327394485474, "learning_rate": 0.0002, "epoch": 3.5881627620221948, "step": 2910}, {"loss": 0.5711, "grad_norm": 0.5994468331336975, "learning_rate": 0.0002, "epoch": 3.600493218249075, "step": 2920}, {"loss": 0.6373, "grad_norm": 0.7450457215309143, "learning_rate": 0.0002, "epoch": 3.6128236744759556, "step": 2930}, {"loss": 0.4933, "grad_norm": 0.5825870037078857, "learning_rate": 0.0002, "epoch": 3.625154130702836, "step": 2940}, {"loss": 0.6016, "grad_norm": 0.6289743781089783, "learning_rate": 0.0002, "epoch": 3.6374845869297165, "step": 2950}, {"loss": 0.5507, "grad_norm": 0.7801929116249084, "learning_rate": 0.0002, "epoch": 3.6498150431565968, "step": 2960}, {"loss": 0.5695, "grad_norm": 1.1206634044647217, "learning_rate": 0.0002, "epoch": 3.6621454993834774, "step": 2970}, {"loss": 0.4985, "grad_norm": 0.6738817691802979, "learning_rate": 0.0002, "epoch": 3.6744759556103577, "step": 2980}, {"loss": 0.6209, "grad_norm": 1.1917344331741333, "learning_rate": 0.0002, "epoch": 3.686806411837238, "step": 2990}, {"loss": 0.5373, "grad_norm": 1.3738657236099243, "learning_rate": 0.0002, "epoch": 3.6991368680641186, "step": 3000}, {"loss": 0.5467, "grad_norm": 0.6642793416976929, "learning_rate": 0.0002, "epoch": 3.711467324290999, "step": 3010}, {"loss": 0.6243, "grad_norm": 0.9030995965003967, "learning_rate": 0.0002, "epoch": 3.723797780517879, "step": 3020}, {"loss": 0.592, "grad_norm": 1.0203914642333984, "learning_rate": 0.0002, "epoch": 3.7361282367447597, "step": 3030}, {"loss": 0.5453, "grad_norm": 0.648394763469696, "learning_rate": 0.0002, "epoch": 3.74845869297164, "step": 3040}, {"loss": 0.498, "grad_norm": 0.6304570436477661, "learning_rate": 0.0002, "epoch": 3.76078914919852, "step": 3050}, {"loss": 0.683, "grad_norm": 0.8286601901054382, "learning_rate": 0.0002, "epoch": 3.773119605425401, "step": 3060}, {"loss": 0.5302, "grad_norm": 0.906444251537323, "learning_rate": 0.0002, "epoch": 3.785450061652281, "step": 3070}, {"loss": 0.5345, "grad_norm": 1.4212149381637573, "learning_rate": 0.0002, "epoch": 3.7977805178791613, "step": 3080}, {"loss": 0.6403, "grad_norm": 0.7574319839477539, "learning_rate": 0.0002, "epoch": 3.810110974106042, "step": 3090}, {"loss": 0.5756, "grad_norm": 0.6534451246261597, "learning_rate": 0.0002, "epoch": 3.822441430332922, "step": 3100}, {"loss": 0.5306, "grad_norm": 0.7525447010993958, "learning_rate": 0.0002, "epoch": 3.834771886559803, "step": 3110}, {"loss": 0.5368, "grad_norm": 0.6513990759849548, "learning_rate": 0.0002, "epoch": 3.847102342786683, "step": 3120}, {"loss": 0.5492, "grad_norm": 0.7782694697380066, "learning_rate": 0.0002, "epoch": 3.8594327990135637, "step": 3130}, {"loss": 0.5727, "grad_norm": 0.7998530268669128, "learning_rate": 0.0002, "epoch": 3.871763255240444, "step": 3140}, {"loss": 0.5156, "grad_norm": 0.8045353293418884, "learning_rate": 0.0002, "epoch": 3.884093711467324, "step": 3150}, {"loss": 0.5341, "grad_norm": 0.8242645263671875, "learning_rate": 0.0002, "epoch": 3.896424167694205, "step": 3160}, {"loss": 0.5563, "grad_norm": 0.8302360773086548, "learning_rate": 0.0002, "epoch": 3.908754623921085, "step": 3170}, {"loss": 0.5793, "grad_norm": 0.8653109073638916, "learning_rate": 0.0002, "epoch": 3.9210850801479653, "step": 3180}, {"loss": 0.5219, "grad_norm": 0.6461338996887207, "learning_rate": 0.0002, "epoch": 3.933415536374846, "step": 3190}, {"loss": 0.6009, "grad_norm": 0.8267415165901184, "learning_rate": 0.0002, "epoch": 3.945745992601726, "step": 3200}, {"loss": 0.5956, "grad_norm": 1.1963194608688354, "learning_rate": 0.0002, "epoch": 3.9580764488286064, "step": 3210}, {"loss": 0.5692, "grad_norm": 0.7101966142654419, "learning_rate": 0.0002, "epoch": 3.970406905055487, "step": 3220}, {"loss": 0.5471, "grad_norm": 0.5931660532951355, "learning_rate": 0.0002, "epoch": 3.9827373612823673, "step": 3230}, {"loss": 0.5619, "grad_norm": 0.7465988993644714, "learning_rate": 0.0002, "epoch": 3.995067817509248, "step": 3240}, {"eval_loss": 1.4066498279571533, "eval_runtime": 95.7145, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 3244}, {"loss": 0.4948, "grad_norm": 0.9478800296783447, "learning_rate": 0.0002, "epoch": 4.007398273736128, "step": 3250}, {"loss": 0.4129, "grad_norm": 1.207059621810913, "learning_rate": 0.0002, "epoch": 4.019728729963009, "step": 3260}, {"loss": 0.3577, "grad_norm": 0.8984074592590332, "learning_rate": 0.0002, "epoch": 4.032059186189889, "step": 3270}, {"loss": 0.3798, "grad_norm": 0.8104140758514404, "learning_rate": 0.0002, "epoch": 4.044389642416769, "step": 3280}, {"loss": 0.3657, "grad_norm": 1.0875468254089355, "learning_rate": 0.0002, "epoch": 4.05672009864365, "step": 3290}, {"loss": 0.3703, "grad_norm": 0.8520309329032898, "learning_rate": 0.0002, "epoch": 4.06905055487053, "step": 3300}, {"loss": 0.3933, "grad_norm": 1.076735496520996, "learning_rate": 0.0002, "epoch": 4.0813810110974105, "step": 3310}, {"loss": 0.4422, "grad_norm": 0.7789369821548462, "learning_rate": 0.0002, "epoch": 4.093711467324291, "step": 3320}, {"loss": 0.4009, "grad_norm": 0.916862964630127, "learning_rate": 0.0002, "epoch": 4.106041923551172, "step": 3330}, {"loss": 0.3934, "grad_norm": 1.1251654624938965, "learning_rate": 0.0002, "epoch": 4.118372379778052, "step": 3340}, {"loss": 0.3651, "grad_norm": 0.9373420476913452, "learning_rate": 0.0002, "epoch": 4.130702836004932, "step": 3350}, {"loss": 0.384, "grad_norm": 1.03253972530365, "learning_rate": 0.0002, "epoch": 4.143033292231813, "step": 3360}, {"loss": 0.372, "grad_norm": 0.947023332118988, "learning_rate": 0.0002, "epoch": 4.155363748458693, "step": 3370}, {"loss": 0.4018, "grad_norm": 0.8709157109260559, "learning_rate": 0.0002, "epoch": 4.167694204685573, "step": 3380}, {"loss": 0.3754, "grad_norm": 0.930983304977417, "learning_rate": 0.0002, "epoch": 4.180024660912454, "step": 3390}, {"loss": 0.4248, "grad_norm": 1.092809796333313, "learning_rate": 0.0002, "epoch": 4.192355117139334, "step": 3400}, {"loss": 0.4453, "grad_norm": 0.8454303741455078, "learning_rate": 0.0002, "epoch": 4.2046855733662145, "step": 3410}, {"loss": 0.4198, "grad_norm": 0.957210123538971, "learning_rate": 0.0002, "epoch": 4.217016029593095, "step": 3420}, {"loss": 0.3743, "grad_norm": 0.854333758354187, "learning_rate": 0.0002, "epoch": 4.229346485819975, "step": 3430}, {"loss": 0.4041, "grad_norm": 1.0457639694213867, "learning_rate": 0.0002, "epoch": 4.241676942046856, "step": 3440}, {"loss": 0.3817, "grad_norm": 0.8972977995872498, "learning_rate": 0.0002, "epoch": 4.254007398273736, "step": 3450}, {"loss": 0.4445, "grad_norm": 1.0438238382339478, "learning_rate": 0.0002, "epoch": 4.266337854500616, "step": 3460}, {"loss": 0.4078, "grad_norm": 0.7000405192375183, "learning_rate": 0.0002, "epoch": 4.278668310727497, "step": 3470}, {"loss": 0.3718, "grad_norm": 1.0451240539550781, "learning_rate": 0.0002, "epoch": 4.290998766954377, "step": 3480}, {"loss": 0.4506, "grad_norm": 1.3339767456054688, "learning_rate": 0.0002, "epoch": 4.303329223181258, "step": 3490}, {"loss": 0.3999, "grad_norm": 0.7503946423530579, "learning_rate": 0.0002, "epoch": 4.315659679408138, "step": 3500}, {"loss": 0.4503, "grad_norm": 0.8443584442138672, "learning_rate": 0.0002, "epoch": 4.3279901356350186, "step": 3510}, {"loss": 0.3793, "grad_norm": 1.1681201457977295, "learning_rate": 0.0002, "epoch": 4.340320591861899, "step": 3520}, {"loss": 0.4462, "grad_norm": 1.078883171081543, "learning_rate": 0.0002, "epoch": 4.352651048088779, "step": 3530}, {"loss": 0.4216, "grad_norm": 0.6894834041595459, "learning_rate": 0.0002, "epoch": 4.36498150431566, "step": 3540}, {"loss": 0.4315, "grad_norm": 0.7059480547904968, "learning_rate": 0.0002, "epoch": 4.37731196054254, "step": 3550}, {"loss": 0.3821, "grad_norm": 1.1807256937026978, "learning_rate": 0.0002, "epoch": 4.38964241676942, "step": 3560}, {"loss": 0.4192, "grad_norm": 0.8341359496116638, "learning_rate": 0.0002, "epoch": 4.401972872996301, "step": 3570}, {"loss": 0.4123, "grad_norm": 1.0273033380508423, "learning_rate": 0.0002, "epoch": 4.4143033292231815, "step": 3580}, {"loss": 0.5018, "grad_norm": 0.6916454434394836, "learning_rate": 0.0002, "epoch": 4.426633785450061, "step": 3590}, {"loss": 0.3909, "grad_norm": 0.8210113644599915, "learning_rate": 0.0002, "epoch": 4.438964241676942, "step": 3600}, {"loss": 0.3893, "grad_norm": 1.0309500694274902, "learning_rate": 0.0002, "epoch": 4.451294697903823, "step": 3610}, {"loss": 0.3902, "grad_norm": 0.8847399353981018, "learning_rate": 0.0002, "epoch": 4.463625154130703, "step": 3620}, {"loss": 0.4198, "grad_norm": 1.668636679649353, "learning_rate": 0.0002, "epoch": 4.475955610357583, "step": 3630}, {"loss": 0.4075, "grad_norm": 1.3087958097457886, "learning_rate": 0.0002, "epoch": 4.488286066584464, "step": 3640}, {"loss": 0.4294, "grad_norm": 0.837852418422699, "learning_rate": 0.0002, "epoch": 4.500616522811344, "step": 3650}, {"loss": 0.4053, "grad_norm": 9.7662353515625, "learning_rate": 0.0002, "epoch": 4.512946979038224, "step": 3660}, {"loss": 0.4033, "grad_norm": 1.125719428062439, "learning_rate": 0.0002, "epoch": 4.525277435265105, "step": 3670}, {"loss": 0.4566, "grad_norm": 0.7755377292633057, "learning_rate": 0.0002, "epoch": 4.5376078914919855, "step": 3680}, {"loss": 0.4415, "grad_norm": 0.7185089588165283, "learning_rate": 0.0002, "epoch": 4.549938347718865, "step": 3690}, {"loss": 0.4616, "grad_norm": 1.182063102722168, "learning_rate": 0.0002, "epoch": 4.562268803945746, "step": 3700}, {"loss": 0.4572, "grad_norm": 1.001197338104248, "learning_rate": 0.0002, "epoch": 4.574599260172627, "step": 3710}, {"loss": 0.4493, "grad_norm": 0.9705429077148438, "learning_rate": 0.0002, "epoch": 4.586929716399506, "step": 3720}, {"loss": 0.42, "grad_norm": 0.7136746048927307, "learning_rate": 0.0002, "epoch": 4.599260172626387, "step": 3730}, {"loss": 0.3757, "grad_norm": 1.0004864931106567, "learning_rate": 0.0002, "epoch": 4.611590628853268, "step": 3740}, {"loss": 0.4418, "grad_norm": 1.3193715810775757, "learning_rate": 0.0002, "epoch": 4.623921085080148, "step": 3750}, {"loss": 0.4572, "grad_norm": 0.6945042014122009, "learning_rate": 0.0002, "epoch": 4.636251541307028, "step": 3760}, {"loss": 0.4255, "grad_norm": 0.8903936743736267, "learning_rate": 0.0002, "epoch": 4.648581997533909, "step": 3770}, {"loss": 0.3582, "grad_norm": 0.7960889339447021, "learning_rate": 0.0002, "epoch": 4.660912453760789, "step": 3780}, {"loss": 0.3864, "grad_norm": 1.0439172983169556, "learning_rate": 0.0002, "epoch": 4.673242909987669, "step": 3790}, {"loss": 0.4378, "grad_norm": 1.4546219110488892, "learning_rate": 0.0002, "epoch": 4.68557336621455, "step": 3800}, {"loss": 0.4191, "grad_norm": 0.8194343447685242, "learning_rate": 0.0002, "epoch": 4.697903822441431, "step": 3810}, {"loss": 0.4473, "grad_norm": 1.0727602243423462, "learning_rate": 0.0002, "epoch": 4.7102342786683105, "step": 3820}, {"loss": 0.4021, "grad_norm": 0.7785195708274841, "learning_rate": 0.0002, "epoch": 4.722564734895191, "step": 3830}, {"loss": 0.4252, "grad_norm": 0.846783459186554, "learning_rate": 0.0002, "epoch": 4.734895191122072, "step": 3840}, {"loss": 0.4647, "grad_norm": 1.0481648445129395, "learning_rate": 0.0002, "epoch": 4.747225647348952, "step": 3850}, {"loss": 0.4944, "grad_norm": 0.7324008941650391, "learning_rate": 0.0002, "epoch": 4.759556103575832, "step": 3860}, {"loss": 0.3831, "grad_norm": 1.06382417678833, "learning_rate": 0.0002, "epoch": 4.771886559802713, "step": 3870}, {"loss": 0.3934, "grad_norm": 0.9851241111755371, "learning_rate": 0.0002, "epoch": 4.784217016029594, "step": 3880}, {"loss": 0.5172, "grad_norm": 0.8215277791023254, "learning_rate": 0.0002, "epoch": 4.796547472256473, "step": 3890}, {"loss": 0.4437, "grad_norm": 0.9901723861694336, "learning_rate": 0.0002, "epoch": 4.808877928483354, "step": 3900}, {"loss": 0.4673, "grad_norm": 0.9149112701416016, "learning_rate": 0.0002, "epoch": 4.821208384710234, "step": 3910}, {"loss": 0.4295, "grad_norm": 0.9772973656654358, "learning_rate": 0.0002, "epoch": 4.8335388409371145, "step": 3920}, {"loss": 0.4346, "grad_norm": 0.8889636397361755, "learning_rate": 0.0002, "epoch": 4.845869297163995, "step": 3930}, {"loss": 0.421, "grad_norm": 1.3032807111740112, "learning_rate": 0.0002, "epoch": 4.858199753390876, "step": 3940}, {"loss": 0.434, "grad_norm": 0.8575899600982666, "learning_rate": 0.0002, "epoch": 4.870530209617756, "step": 3950}, {"loss": 0.4295, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 4.882860665844636, "step": 3960}, {"loss": 0.3633, "grad_norm": 1.041210651397705, "learning_rate": 0.0002, "epoch": 4.895191122071517, "step": 3970}, {"loss": 0.4104, "grad_norm": 0.9113056063652039, "learning_rate": 0.0002, "epoch": 4.907521578298397, "step": 3980}, {"loss": 0.4496, "grad_norm": 1.019347906112671, "learning_rate": 0.0002, "epoch": 4.919852034525277, "step": 3990}, {"loss": 0.457, "grad_norm": 0.7709218859672546, "learning_rate": 0.0002, "epoch": 4.932182490752158, "step": 4000}, {"loss": 0.4697, "grad_norm": 0.8891775608062744, "learning_rate": 0.0002, "epoch": 4.944512946979038, "step": 4010}, {"loss": 0.4436, "grad_norm": 1.0396920442581177, "learning_rate": 0.0002, "epoch": 4.9568434032059185, "step": 4020}, {"loss": 0.4251, "grad_norm": 0.9239833354949951, "learning_rate": 0.0002, "epoch": 4.969173859432799, "step": 4030}, {"loss": 0.5049, "grad_norm": 1.801400065422058, "learning_rate": 0.0002, "epoch": 4.981504315659679, "step": 4040}, {"loss": 0.4481, "grad_norm": 0.6194164752960205, "learning_rate": 0.0002, "epoch": 4.99383477188656, "step": 4050}]} +{"epoch": 6.0, "step": 4866, "epoch_duration": 2412.182984352112, "total_accumulated_duration": 14593.780698537827, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}, {"eval_loss": 1.2936296463012695, "eval_runtime": 94.7897, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.58, "epoch": 3.0, "step": 2433}, {"loss": 0.6073, "grad_norm": 0.5562372803688049, "learning_rate": 0.0002, "epoch": 3.008631319358816, "step": 2440}, {"loss": 0.5181, "grad_norm": 1.133402705192566, "learning_rate": 0.0002, "epoch": 3.020961775585697, "step": 2450}, {"loss": 0.5333, "grad_norm": 0.6480470299720764, "learning_rate": 0.0002, "epoch": 3.033292231812577, "step": 2460}, {"loss": 0.4828, "grad_norm": 0.8989138007164001, "learning_rate": 0.0002, "epoch": 3.0456226880394572, "step": 2470}, {"loss": 0.5097, "grad_norm": 0.8257461786270142, "learning_rate": 0.0002, "epoch": 3.057953144266338, "step": 2480}, {"loss": 0.6229, "grad_norm": 0.6813381910324097, "learning_rate": 0.0002, "epoch": 3.070283600493218, "step": 2490}, {"loss": 0.531, "grad_norm": 0.6989586353302002, "learning_rate": 0.0002, "epoch": 3.082614056720099, "step": 2500}, {"loss": 0.54, "grad_norm": 0.7992092967033386, "learning_rate": 0.0002, "epoch": 3.094944512946979, "step": 2510}, {"loss": 0.5054, "grad_norm": 0.698077917098999, "learning_rate": 0.0002, "epoch": 3.1072749691738593, "step": 2520}, {"loss": 0.5064, "grad_norm": 0.5699033141136169, "learning_rate": 0.0002, "epoch": 3.11960542540074, "step": 2530}, {"loss": 0.6088, "grad_norm": 0.6142355799674988, "learning_rate": 0.0002, "epoch": 3.13193588162762, "step": 2540}, {"loss": 0.585, "grad_norm": 0.7089933753013611, "learning_rate": 0.0002, "epoch": 3.144266337854501, "step": 2550}, {"loss": 0.5373, "grad_norm": 1.0107015371322632, "learning_rate": 0.0002, "epoch": 3.156596794081381, "step": 2560}, {"loss": 0.5429, "grad_norm": 0.568138837814331, "learning_rate": 0.0002, "epoch": 3.1689272503082613, "step": 2570}, {"loss": 0.5897, "grad_norm": 0.9960416555404663, "learning_rate": 0.0002, "epoch": 3.181257706535142, "step": 2580}, {"loss": 0.5211, "grad_norm": 0.6277595162391663, "learning_rate": 0.0002, "epoch": 3.193588162762022, "step": 2590}, {"loss": 0.5787, "grad_norm": 0.681083619594574, "learning_rate": 0.0002, "epoch": 3.2059186189889024, "step": 2600}, {"loss": 0.5166, "grad_norm": 0.5816057324409485, "learning_rate": 0.0002, "epoch": 3.218249075215783, "step": 2610}, {"loss": 0.545, "grad_norm": 0.7197734117507935, "learning_rate": 0.0002, "epoch": 3.2305795314426633, "step": 2620}, {"loss": 0.614, "grad_norm": 0.6524068117141724, "learning_rate": 0.0002, "epoch": 3.242909987669544, "step": 2630}, {"loss": 0.5456, "grad_norm": 1.273668646812439, "learning_rate": 0.0002, "epoch": 3.255240443896424, "step": 2640}, {"loss": 0.5266, "grad_norm": 0.6950451731681824, "learning_rate": 0.0002, "epoch": 3.2675709001233044, "step": 2650}, {"loss": 0.5194, "grad_norm": 0.8029071688652039, "learning_rate": 0.0002, "epoch": 3.279901356350185, "step": 2660}, {"loss": 0.5729, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 3.2922318125770653, "step": 2670}, {"loss": 0.5366, "grad_norm": 0.8342001438140869, "learning_rate": 0.0002, "epoch": 3.304562268803946, "step": 2680}, {"loss": 0.5413, "grad_norm": 0.5629868507385254, "learning_rate": 0.0002, "epoch": 3.316892725030826, "step": 2690}, {"loss": 0.633, "grad_norm": 0.753999650478363, "learning_rate": 0.0002, "epoch": 3.3292231812577064, "step": 2700}, {"loss": 0.5048, "grad_norm": 1.0271371603012085, "learning_rate": 0.0002, "epoch": 3.341553637484587, "step": 2710}, {"loss": 0.5233, "grad_norm": 0.9608535170555115, "learning_rate": 0.0002, "epoch": 3.3538840937114673, "step": 2720}, {"loss": 0.5102, "grad_norm": 0.7796488404273987, "learning_rate": 0.0002, "epoch": 3.3662145499383476, "step": 2730}, {"loss": 0.5172, "grad_norm": 0.5666437149047852, "learning_rate": 0.0002, "epoch": 3.3785450061652282, "step": 2740}, {"loss": 0.491, "grad_norm": 0.5462956428527832, "learning_rate": 0.0002, "epoch": 3.3908754623921085, "step": 2750}, {"loss": 0.5855, "grad_norm": 1.289099097251892, "learning_rate": 0.0002, "epoch": 3.4032059186189887, "step": 2760}, {"loss": 0.635, "grad_norm": 0.825566828250885, "learning_rate": 0.0002, "epoch": 3.4155363748458694, "step": 2770}, {"loss": 0.4998, "grad_norm": 0.8366670608520508, "learning_rate": 0.0002, "epoch": 3.4278668310727496, "step": 2780}, {"loss": 0.5732, "grad_norm": 1.0931549072265625, "learning_rate": 0.0002, "epoch": 3.4401972872996303, "step": 2790}, {"loss": 0.6093, "grad_norm": 0.9228858351707458, "learning_rate": 0.0002, "epoch": 3.4525277435265105, "step": 2800}, {"loss": 0.6089, "grad_norm": 1.3182806968688965, "learning_rate": 0.0002, "epoch": 3.4648581997533907, "step": 2810}, {"loss": 0.5665, "grad_norm": 0.8366976380348206, "learning_rate": 0.0002, "epoch": 3.4771886559802714, "step": 2820}, {"loss": 0.5666, "grad_norm": 0.8067695498466492, "learning_rate": 0.0002, "epoch": 3.4895191122071516, "step": 2830}, {"loss": 0.579, "grad_norm": 1.1163437366485596, "learning_rate": 0.0002, "epoch": 3.5018495684340323, "step": 2840}, {"loss": 0.5785, "grad_norm": 1.7196556329727173, "learning_rate": 0.0002, "epoch": 3.5141800246609125, "step": 2850}, {"loss": 0.5346, "grad_norm": 1.1267012357711792, "learning_rate": 0.0002, "epoch": 3.5265104808877927, "step": 2860}, {"loss": 0.447, "grad_norm": 0.7220137119293213, "learning_rate": 0.0002, "epoch": 3.5388409371146734, "step": 2870}, {"loss": 0.6099, "grad_norm": 0.914114773273468, "learning_rate": 0.0002, "epoch": 3.5511713933415536, "step": 2880}, {"loss": 0.6143, "grad_norm": 0.6193503141403198, "learning_rate": 0.0002, "epoch": 3.563501849568434, "step": 2890}, {"loss": 0.5171, "grad_norm": 0.6060135960578918, "learning_rate": 0.0002, "epoch": 3.5758323057953145, "step": 2900}, {"loss": 0.5659, "grad_norm": 1.0177327394485474, "learning_rate": 0.0002, "epoch": 3.5881627620221948, "step": 2910}, {"loss": 0.5711, "grad_norm": 0.5994468331336975, "learning_rate": 0.0002, "epoch": 3.600493218249075, "step": 2920}, {"loss": 0.6373, "grad_norm": 0.7450457215309143, "learning_rate": 0.0002, "epoch": 3.6128236744759556, "step": 2930}, {"loss": 0.4933, "grad_norm": 0.5825870037078857, "learning_rate": 0.0002, "epoch": 3.625154130702836, "step": 2940}, {"loss": 0.6016, "grad_norm": 0.6289743781089783, "learning_rate": 0.0002, "epoch": 3.6374845869297165, "step": 2950}, {"loss": 0.5507, "grad_norm": 0.7801929116249084, "learning_rate": 0.0002, "epoch": 3.6498150431565968, "step": 2960}, {"loss": 0.5695, "grad_norm": 1.1206634044647217, "learning_rate": 0.0002, "epoch": 3.6621454993834774, "step": 2970}, {"loss": 0.4985, "grad_norm": 0.6738817691802979, "learning_rate": 0.0002, "epoch": 3.6744759556103577, "step": 2980}, {"loss": 0.6209, "grad_norm": 1.1917344331741333, "learning_rate": 0.0002, "epoch": 3.686806411837238, "step": 2990}, {"loss": 0.5373, "grad_norm": 1.3738657236099243, "learning_rate": 0.0002, "epoch": 3.6991368680641186, "step": 3000}, {"loss": 0.5467, "grad_norm": 0.6642793416976929, "learning_rate": 0.0002, "epoch": 3.711467324290999, "step": 3010}, {"loss": 0.6243, "grad_norm": 0.9030995965003967, "learning_rate": 0.0002, "epoch": 3.723797780517879, "step": 3020}, {"loss": 0.592, "grad_norm": 1.0203914642333984, "learning_rate": 0.0002, "epoch": 3.7361282367447597, "step": 3030}, {"loss": 0.5453, "grad_norm": 0.648394763469696, "learning_rate": 0.0002, "epoch": 3.74845869297164, "step": 3040}, {"loss": 0.498, "grad_norm": 0.6304570436477661, "learning_rate": 0.0002, "epoch": 3.76078914919852, "step": 3050}, {"loss": 0.683, "grad_norm": 0.8286601901054382, "learning_rate": 0.0002, "epoch": 3.773119605425401, "step": 3060}, {"loss": 0.5302, "grad_norm": 0.906444251537323, "learning_rate": 0.0002, "epoch": 3.785450061652281, "step": 3070}, {"loss": 0.5345, "grad_norm": 1.4212149381637573, "learning_rate": 0.0002, "epoch": 3.7977805178791613, "step": 3080}, {"loss": 0.6403, "grad_norm": 0.7574319839477539, "learning_rate": 0.0002, "epoch": 3.810110974106042, "step": 3090}, {"loss": 0.5756, "grad_norm": 0.6534451246261597, "learning_rate": 0.0002, "epoch": 3.822441430332922, "step": 3100}, {"loss": 0.5306, "grad_norm": 0.7525447010993958, "learning_rate": 0.0002, "epoch": 3.834771886559803, "step": 3110}, {"loss": 0.5368, "grad_norm": 0.6513990759849548, "learning_rate": 0.0002, "epoch": 3.847102342786683, "step": 3120}, {"loss": 0.5492, "grad_norm": 0.7782694697380066, "learning_rate": 0.0002, "epoch": 3.8594327990135637, "step": 3130}, {"loss": 0.5727, "grad_norm": 0.7998530268669128, "learning_rate": 0.0002, "epoch": 3.871763255240444, "step": 3140}, {"loss": 0.5156, "grad_norm": 0.8045353293418884, "learning_rate": 0.0002, "epoch": 3.884093711467324, "step": 3150}, {"loss": 0.5341, "grad_norm": 0.8242645263671875, "learning_rate": 0.0002, "epoch": 3.896424167694205, "step": 3160}, {"loss": 0.5563, "grad_norm": 0.8302360773086548, "learning_rate": 0.0002, "epoch": 3.908754623921085, "step": 3170}, {"loss": 0.5793, "grad_norm": 0.8653109073638916, "learning_rate": 0.0002, "epoch": 3.9210850801479653, "step": 3180}, {"loss": 0.5219, "grad_norm": 0.6461338996887207, "learning_rate": 0.0002, "epoch": 3.933415536374846, "step": 3190}, {"loss": 0.6009, "grad_norm": 0.8267415165901184, "learning_rate": 0.0002, "epoch": 3.945745992601726, "step": 3200}, {"loss": 0.5956, "grad_norm": 1.1963194608688354, "learning_rate": 0.0002, "epoch": 3.9580764488286064, "step": 3210}, {"loss": 0.5692, "grad_norm": 0.7101966142654419, "learning_rate": 0.0002, "epoch": 3.970406905055487, "step": 3220}, {"loss": 0.5471, "grad_norm": 0.5931660532951355, "learning_rate": 0.0002, "epoch": 3.9827373612823673, "step": 3230}, {"loss": 0.5619, "grad_norm": 0.7465988993644714, "learning_rate": 0.0002, "epoch": 3.995067817509248, "step": 3240}, {"eval_loss": 1.4066498279571533, "eval_runtime": 95.7145, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 3244}, {"loss": 0.4948, "grad_norm": 0.9478800296783447, "learning_rate": 0.0002, "epoch": 4.007398273736128, "step": 3250}, {"loss": 0.4129, "grad_norm": 1.207059621810913, "learning_rate": 0.0002, "epoch": 4.019728729963009, "step": 3260}, {"loss": 0.3577, "grad_norm": 0.8984074592590332, "learning_rate": 0.0002, "epoch": 4.032059186189889, "step": 3270}, {"loss": 0.3798, "grad_norm": 0.8104140758514404, "learning_rate": 0.0002, "epoch": 4.044389642416769, "step": 3280}, {"loss": 0.3657, "grad_norm": 1.0875468254089355, "learning_rate": 0.0002, "epoch": 4.05672009864365, "step": 3290}, {"loss": 0.3703, "grad_norm": 0.8520309329032898, "learning_rate": 0.0002, "epoch": 4.06905055487053, "step": 3300}, {"loss": 0.3933, "grad_norm": 1.076735496520996, "learning_rate": 0.0002, "epoch": 4.0813810110974105, "step": 3310}, {"loss": 0.4422, "grad_norm": 0.7789369821548462, "learning_rate": 0.0002, "epoch": 4.093711467324291, "step": 3320}, {"loss": 0.4009, "grad_norm": 0.916862964630127, "learning_rate": 0.0002, "epoch": 4.106041923551172, "step": 3330}, {"loss": 0.3934, "grad_norm": 1.1251654624938965, "learning_rate": 0.0002, "epoch": 4.118372379778052, "step": 3340}, {"loss": 0.3651, "grad_norm": 0.9373420476913452, "learning_rate": 0.0002, "epoch": 4.130702836004932, "step": 3350}, {"loss": 0.384, "grad_norm": 1.03253972530365, "learning_rate": 0.0002, "epoch": 4.143033292231813, "step": 3360}, {"loss": 0.372, "grad_norm": 0.947023332118988, "learning_rate": 0.0002, "epoch": 4.155363748458693, "step": 3370}, {"loss": 0.4018, "grad_norm": 0.8709157109260559, "learning_rate": 0.0002, "epoch": 4.167694204685573, "step": 3380}, {"loss": 0.3754, "grad_norm": 0.930983304977417, "learning_rate": 0.0002, "epoch": 4.180024660912454, "step": 3390}, {"loss": 0.4248, "grad_norm": 1.092809796333313, "learning_rate": 0.0002, "epoch": 4.192355117139334, "step": 3400}, {"loss": 0.4453, "grad_norm": 0.8454303741455078, "learning_rate": 0.0002, "epoch": 4.2046855733662145, "step": 3410}, {"loss": 0.4198, "grad_norm": 0.957210123538971, "learning_rate": 0.0002, "epoch": 4.217016029593095, "step": 3420}, {"loss": 0.3743, "grad_norm": 0.854333758354187, "learning_rate": 0.0002, "epoch": 4.229346485819975, "step": 3430}, {"loss": 0.4041, "grad_norm": 1.0457639694213867, "learning_rate": 0.0002, "epoch": 4.241676942046856, "step": 3440}, {"loss": 0.3817, "grad_norm": 0.8972977995872498, "learning_rate": 0.0002, "epoch": 4.254007398273736, "step": 3450}, {"loss": 0.4445, "grad_norm": 1.0438238382339478, "learning_rate": 0.0002, "epoch": 4.266337854500616, "step": 3460}, {"loss": 0.4078, "grad_norm": 0.7000405192375183, "learning_rate": 0.0002, "epoch": 4.278668310727497, "step": 3470}, {"loss": 0.3718, "grad_norm": 1.0451240539550781, "learning_rate": 0.0002, "epoch": 4.290998766954377, "step": 3480}, {"loss": 0.4506, "grad_norm": 1.3339767456054688, "learning_rate": 0.0002, "epoch": 4.303329223181258, "step": 3490}, {"loss": 0.3999, "grad_norm": 0.7503946423530579, "learning_rate": 0.0002, "epoch": 4.315659679408138, "step": 3500}, {"loss": 0.4503, "grad_norm": 0.8443584442138672, "learning_rate": 0.0002, "epoch": 4.3279901356350186, "step": 3510}, {"loss": 0.3793, "grad_norm": 1.1681201457977295, "learning_rate": 0.0002, "epoch": 4.340320591861899, "step": 3520}, {"loss": 0.4462, "grad_norm": 1.078883171081543, "learning_rate": 0.0002, "epoch": 4.352651048088779, "step": 3530}, {"loss": 0.4216, "grad_norm": 0.6894834041595459, "learning_rate": 0.0002, "epoch": 4.36498150431566, "step": 3540}, {"loss": 0.4315, "grad_norm": 0.7059480547904968, "learning_rate": 0.0002, "epoch": 4.37731196054254, "step": 3550}, {"loss": 0.3821, "grad_norm": 1.1807256937026978, "learning_rate": 0.0002, "epoch": 4.38964241676942, "step": 3560}, {"loss": 0.4192, "grad_norm": 0.8341359496116638, "learning_rate": 0.0002, "epoch": 4.401972872996301, "step": 3570}, {"loss": 0.4123, "grad_norm": 1.0273033380508423, "learning_rate": 0.0002, "epoch": 4.4143033292231815, "step": 3580}, {"loss": 0.5018, "grad_norm": 0.6916454434394836, "learning_rate": 0.0002, "epoch": 4.426633785450061, "step": 3590}, {"loss": 0.3909, "grad_norm": 0.8210113644599915, "learning_rate": 0.0002, "epoch": 4.438964241676942, "step": 3600}, {"loss": 0.3893, "grad_norm": 1.0309500694274902, "learning_rate": 0.0002, "epoch": 4.451294697903823, "step": 3610}, {"loss": 0.3902, "grad_norm": 0.8847399353981018, "learning_rate": 0.0002, "epoch": 4.463625154130703, "step": 3620}, {"loss": 0.4198, "grad_norm": 1.668636679649353, "learning_rate": 0.0002, "epoch": 4.475955610357583, "step": 3630}, {"loss": 0.4075, "grad_norm": 1.3087958097457886, "learning_rate": 0.0002, "epoch": 4.488286066584464, "step": 3640}, {"loss": 0.4294, "grad_norm": 0.837852418422699, "learning_rate": 0.0002, "epoch": 4.500616522811344, "step": 3650}, {"loss": 0.4053, "grad_norm": 9.7662353515625, "learning_rate": 0.0002, "epoch": 4.512946979038224, "step": 3660}, {"loss": 0.4033, "grad_norm": 1.125719428062439, "learning_rate": 0.0002, "epoch": 4.525277435265105, "step": 3670}, {"loss": 0.4566, "grad_norm": 0.7755377292633057, "learning_rate": 0.0002, "epoch": 4.5376078914919855, "step": 3680}, {"loss": 0.4415, "grad_norm": 0.7185089588165283, "learning_rate": 0.0002, "epoch": 4.549938347718865, "step": 3690}, {"loss": 0.4616, "grad_norm": 1.182063102722168, "learning_rate": 0.0002, "epoch": 4.562268803945746, "step": 3700}, {"loss": 0.4572, "grad_norm": 1.001197338104248, "learning_rate": 0.0002, "epoch": 4.574599260172627, "step": 3710}, {"loss": 0.4493, "grad_norm": 0.9705429077148438, "learning_rate": 0.0002, "epoch": 4.586929716399506, "step": 3720}, {"loss": 0.42, "grad_norm": 0.7136746048927307, "learning_rate": 0.0002, "epoch": 4.599260172626387, "step": 3730}, {"loss": 0.3757, "grad_norm": 1.0004864931106567, "learning_rate": 0.0002, "epoch": 4.611590628853268, "step": 3740}, {"loss": 0.4418, "grad_norm": 1.3193715810775757, "learning_rate": 0.0002, "epoch": 4.623921085080148, "step": 3750}, {"loss": 0.4572, "grad_norm": 0.6945042014122009, "learning_rate": 0.0002, "epoch": 4.636251541307028, "step": 3760}, {"loss": 0.4255, "grad_norm": 0.8903936743736267, "learning_rate": 0.0002, "epoch": 4.648581997533909, "step": 3770}, {"loss": 0.3582, "grad_norm": 0.7960889339447021, "learning_rate": 0.0002, "epoch": 4.660912453760789, "step": 3780}, {"loss": 0.3864, "grad_norm": 1.0439172983169556, "learning_rate": 0.0002, "epoch": 4.673242909987669, "step": 3790}, {"loss": 0.4378, "grad_norm": 1.4546219110488892, "learning_rate": 0.0002, "epoch": 4.68557336621455, "step": 3800}, {"loss": 0.4191, "grad_norm": 0.8194343447685242, "learning_rate": 0.0002, "epoch": 4.697903822441431, "step": 3810}, {"loss": 0.4473, "grad_norm": 1.0727602243423462, "learning_rate": 0.0002, "epoch": 4.7102342786683105, "step": 3820}, {"loss": 0.4021, "grad_norm": 0.7785195708274841, "learning_rate": 0.0002, "epoch": 4.722564734895191, "step": 3830}, {"loss": 0.4252, "grad_norm": 0.846783459186554, "learning_rate": 0.0002, "epoch": 4.734895191122072, "step": 3840}, {"loss": 0.4647, "grad_norm": 1.0481648445129395, "learning_rate": 0.0002, "epoch": 4.747225647348952, "step": 3850}, {"loss": 0.4944, "grad_norm": 0.7324008941650391, "learning_rate": 0.0002, "epoch": 4.759556103575832, "step": 3860}, {"loss": 0.3831, "grad_norm": 1.06382417678833, "learning_rate": 0.0002, "epoch": 4.771886559802713, "step": 3870}, {"loss": 0.3934, "grad_norm": 0.9851241111755371, "learning_rate": 0.0002, "epoch": 4.784217016029594, "step": 3880}, {"loss": 0.5172, "grad_norm": 0.8215277791023254, "learning_rate": 0.0002, "epoch": 4.796547472256473, "step": 3890}, {"loss": 0.4437, "grad_norm": 0.9901723861694336, "learning_rate": 0.0002, "epoch": 4.808877928483354, "step": 3900}, {"loss": 0.4673, "grad_norm": 0.9149112701416016, "learning_rate": 0.0002, "epoch": 4.821208384710234, "step": 3910}, {"loss": 0.4295, "grad_norm": 0.9772973656654358, "learning_rate": 0.0002, "epoch": 4.8335388409371145, "step": 3920}, {"loss": 0.4346, "grad_norm": 0.8889636397361755, "learning_rate": 0.0002, "epoch": 4.845869297163995, "step": 3930}, {"loss": 0.421, "grad_norm": 1.3032807111740112, "learning_rate": 0.0002, "epoch": 4.858199753390876, "step": 3940}, {"loss": 0.434, "grad_norm": 0.8575899600982666, "learning_rate": 0.0002, "epoch": 4.870530209617756, "step": 3950}, {"loss": 0.4295, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 4.882860665844636, "step": 3960}, {"loss": 0.3633, "grad_norm": 1.041210651397705, "learning_rate": 0.0002, "epoch": 4.895191122071517, "step": 3970}, {"loss": 0.4104, "grad_norm": 0.9113056063652039, "learning_rate": 0.0002, "epoch": 4.907521578298397, "step": 3980}, {"loss": 0.4496, "grad_norm": 1.019347906112671, "learning_rate": 0.0002, "epoch": 4.919852034525277, "step": 3990}, {"loss": 0.457, "grad_norm": 0.7709218859672546, "learning_rate": 0.0002, "epoch": 4.932182490752158, "step": 4000}, {"loss": 0.4697, "grad_norm": 0.8891775608062744, "learning_rate": 0.0002, "epoch": 4.944512946979038, "step": 4010}, {"loss": 0.4436, "grad_norm": 1.0396920442581177, "learning_rate": 0.0002, "epoch": 4.9568434032059185, "step": 4020}, {"loss": 0.4251, "grad_norm": 0.9239833354949951, "learning_rate": 0.0002, "epoch": 4.969173859432799, "step": 4030}, {"loss": 0.5049, "grad_norm": 1.801400065422058, "learning_rate": 0.0002, "epoch": 4.981504315659679, "step": 4040}, {"loss": 0.4481, "grad_norm": 0.6194164752960205, "learning_rate": 0.0002, "epoch": 4.99383477188656, "step": 4050}, {"eval_loss": 1.544758915901184, "eval_runtime": 96.2573, "eval_samples_per_second": 4.53, "eval_steps_per_second": 0.571, "epoch": 5.0, "step": 4055}, {"loss": 0.3774, "grad_norm": 0.9918256998062134, "learning_rate": 0.0002, "epoch": 5.00616522811344, "step": 4060}, {"loss": 0.2887, "grad_norm": 1.4851351976394653, "learning_rate": 0.0002, "epoch": 5.018495684340321, "step": 4070}, {"loss": 0.2454, "grad_norm": 0.9237686395645142, "learning_rate": 0.0002, "epoch": 5.030826140567201, "step": 4080}, {"loss": 0.3072, "grad_norm": 1.2180852890014648, "learning_rate": 0.0002, "epoch": 5.0431565967940815, "step": 4090}, {"loss": 0.282, "grad_norm": 1.1247979402542114, "learning_rate": 0.0002, "epoch": 5.055487053020962, "step": 4100}, {"loss": 0.3108, "grad_norm": 1.2969884872436523, "learning_rate": 0.0002, "epoch": 5.067817509247842, "step": 4110}, {"loss": 0.2858, "grad_norm": 1.0183063745498657, "learning_rate": 0.0002, "epoch": 5.080147965474723, "step": 4120}, {"loss": 0.295, "grad_norm": 1.121330738067627, "learning_rate": 0.0002, "epoch": 5.092478421701603, "step": 4130}, {"loss": 0.2697, "grad_norm": 1.0748186111450195, "learning_rate": 0.0002, "epoch": 5.104808877928483, "step": 4140}, {"loss": 0.3414, "grad_norm": 1.103474736213684, "learning_rate": 0.0002, "epoch": 5.117139334155364, "step": 4150}, {"loss": 0.305, "grad_norm": 1.2251166105270386, "learning_rate": 0.0002, "epoch": 5.129469790382244, "step": 4160}, {"loss": 0.3131, "grad_norm": 0.920898973941803, "learning_rate": 0.0002, "epoch": 5.141800246609124, "step": 4170}, {"loss": 0.281, "grad_norm": 1.327542781829834, "learning_rate": 0.0002, "epoch": 5.154130702836005, "step": 4180}, {"loss": 0.3214, "grad_norm": 1.0677192211151123, "learning_rate": 0.0002, "epoch": 5.1664611590628855, "step": 4190}, {"loss": 0.2863, "grad_norm": 0.897241473197937, "learning_rate": 0.0002, "epoch": 5.178791615289766, "step": 4200}, {"loss": 0.2967, "grad_norm": 0.977457582950592, "learning_rate": 0.0002, "epoch": 5.191122071516646, "step": 4210}, {"loss": 0.3032, "grad_norm": 1.4115267992019653, "learning_rate": 0.0002, "epoch": 5.203452527743527, "step": 4220}, {"loss": 0.3279, "grad_norm": 1.097743034362793, "learning_rate": 0.0002, "epoch": 5.215782983970407, "step": 4230}, {"loss": 0.293, "grad_norm": 1.1095269918441772, "learning_rate": 0.0002, "epoch": 5.228113440197287, "step": 4240}, {"loss": 0.3544, "grad_norm": 1.3785479068756104, "learning_rate": 0.0002, "epoch": 5.240443896424168, "step": 4250}, {"loss": 0.3118, "grad_norm": 1.0298776626586914, "learning_rate": 0.0002, "epoch": 5.252774352651048, "step": 4260}, {"loss": 0.296, "grad_norm": 1.1592111587524414, "learning_rate": 0.0002, "epoch": 5.265104808877928, "step": 4270}, {"loss": 0.2878, "grad_norm": 1.2355743646621704, "learning_rate": 0.0002, "epoch": 5.277435265104809, "step": 4280}, {"loss": 0.3085, "grad_norm": 0.8543112874031067, "learning_rate": 0.0002, "epoch": 5.2897657213316895, "step": 4290}, {"loss": 0.3108, "grad_norm": 1.2953215837478638, "learning_rate": 0.0002, "epoch": 5.302096177558569, "step": 4300}, {"loss": 0.2912, "grad_norm": 1.1001787185668945, "learning_rate": 0.0002, "epoch": 5.31442663378545, "step": 4310}, {"loss": 0.3003, "grad_norm": 0.7476816773414612, "learning_rate": 0.0002, "epoch": 5.326757090012331, "step": 4320}, {"loss": 0.3247, "grad_norm": 0.8195574283599854, "learning_rate": 0.0002, "epoch": 5.3390875462392104, "step": 4330}, {"loss": 0.3035, "grad_norm": 0.9490262866020203, "learning_rate": 0.0002, "epoch": 5.351418002466091, "step": 4340}, {"loss": 0.2846, "grad_norm": 1.2201412916183472, "learning_rate": 0.0002, "epoch": 5.363748458692972, "step": 4350}, {"loss": 0.2644, "grad_norm": 1.0311479568481445, "learning_rate": 0.0002, "epoch": 5.376078914919852, "step": 4360}, {"loss": 0.3104, "grad_norm": 1.2097488641738892, "learning_rate": 0.0002, "epoch": 5.388409371146732, "step": 4370}, {"loss": 0.2977, "grad_norm": 1.140942096710205, "learning_rate": 0.0002, "epoch": 5.400739827373613, "step": 4380}, {"loss": 0.2975, "grad_norm": 0.8091890811920166, "learning_rate": 0.0002, "epoch": 5.413070283600494, "step": 4390}, {"loss": 0.3727, "grad_norm": 1.4467964172363281, "learning_rate": 0.0002, "epoch": 5.425400739827373, "step": 4400}, {"loss": 0.2979, "grad_norm": 1.0836058855056763, "learning_rate": 0.0002, "epoch": 5.437731196054254, "step": 4410}, {"loss": 0.2601, "grad_norm": 1.0515433549880981, "learning_rate": 0.0002, "epoch": 5.450061652281135, "step": 4420}, {"loss": 0.315, "grad_norm": 0.9603073000907898, "learning_rate": 0.0002, "epoch": 5.4623921085080145, "step": 4430}, {"loss": 0.3166, "grad_norm": 1.234609842300415, "learning_rate": 0.0002, "epoch": 5.474722564734895, "step": 4440}, {"loss": 0.3142, "grad_norm": 0.8881428837776184, "learning_rate": 0.0002, "epoch": 5.487053020961776, "step": 4450}, {"loss": 0.3725, "grad_norm": 1.1817275285720825, "learning_rate": 0.0002, "epoch": 5.499383477188656, "step": 4460}, {"loss": 0.2944, "grad_norm": 1.213993787765503, "learning_rate": 0.0002, "epoch": 5.511713933415536, "step": 4470}, {"loss": 0.3136, "grad_norm": 1.0501725673675537, "learning_rate": 0.0002, "epoch": 5.524044389642417, "step": 4480}, {"loss": 0.306, "grad_norm": 1.5061579942703247, "learning_rate": 0.0002, "epoch": 5.536374845869297, "step": 4490}, {"loss": 0.3226, "grad_norm": 1.1171475648880005, "learning_rate": 0.0002, "epoch": 5.548705302096177, "step": 4500}, {"loss": 0.3624, "grad_norm": 1.1147594451904297, "learning_rate": 0.0002, "epoch": 5.561035758323058, "step": 4510}, {"loss": 0.3435, "grad_norm": 1.0600544214248657, "learning_rate": 0.0002, "epoch": 5.573366214549939, "step": 4520}, {"loss": 0.3268, "grad_norm": 1.247870922088623, "learning_rate": 0.0002, "epoch": 5.5856966707768185, "step": 4530}, {"loss": 0.3168, "grad_norm": 0.9425561428070068, "learning_rate": 0.0002, "epoch": 5.598027127003699, "step": 4540}, {"loss": 0.3119, "grad_norm": 1.1111550331115723, "learning_rate": 0.0002, "epoch": 5.61035758323058, "step": 4550}, {"loss": 0.3389, "grad_norm": 1.743268609046936, "learning_rate": 0.0002, "epoch": 5.62268803945746, "step": 4560}, {"loss": 0.31, "grad_norm": 1.3522645235061646, "learning_rate": 0.0002, "epoch": 5.63501849568434, "step": 4570}, {"loss": 0.3121, "grad_norm": 0.7354221343994141, "learning_rate": 0.0002, "epoch": 5.647348951911221, "step": 4580}, {"loss": 0.3693, "grad_norm": 1.050743818283081, "learning_rate": 0.0002, "epoch": 5.659679408138101, "step": 4590}, {"loss": 0.3449, "grad_norm": 1.1302396059036255, "learning_rate": 0.0002, "epoch": 5.6720098643649814, "step": 4600}, {"loss": 0.3211, "grad_norm": 0.8774183392524719, "learning_rate": 0.0002, "epoch": 5.684340320591862, "step": 4610}, {"loss": 0.3501, "grad_norm": 1.090781569480896, "learning_rate": 0.0002, "epoch": 5.696670776818742, "step": 4620}, {"loss": 0.3211, "grad_norm": 0.9177733063697815, "learning_rate": 0.0002, "epoch": 5.709001233045623, "step": 4630}, {"loss": 0.3439, "grad_norm": 0.9985341429710388, "learning_rate": 0.0002, "epoch": 5.721331689272503, "step": 4640}, {"loss": 0.3323, "grad_norm": 1.0230613946914673, "learning_rate": 0.0002, "epoch": 5.733662145499384, "step": 4650}, {"loss": 0.3525, "grad_norm": 0.944656252861023, "learning_rate": 0.0002, "epoch": 5.745992601726264, "step": 4660}, {"loss": 0.3191, "grad_norm": 0.8162471652030945, "learning_rate": 0.0002, "epoch": 5.758323057953144, "step": 4670}, {"loss": 0.4011, "grad_norm": 1.0500398874282837, "learning_rate": 0.0002, "epoch": 5.770653514180025, "step": 4680}, {"loss": 0.3452, "grad_norm": 0.9487981796264648, "learning_rate": 0.0002, "epoch": 5.782983970406905, "step": 4690}, {"loss": 0.2942, "grad_norm": 1.1856540441513062, "learning_rate": 0.0002, "epoch": 5.7953144266337855, "step": 4700}, {"loss": 0.3107, "grad_norm": 1.2583396434783936, "learning_rate": 0.0002, "epoch": 5.807644882860666, "step": 4710}, {"loss": 0.3223, "grad_norm": 1.2532602548599243, "learning_rate": 0.0002, "epoch": 5.819975339087546, "step": 4720}, {"loss": 0.3253, "grad_norm": 1.115236520767212, "learning_rate": 0.0002, "epoch": 5.832305795314427, "step": 4730}, {"loss": 0.3539, "grad_norm": 1.2245537042617798, "learning_rate": 0.0002, "epoch": 5.844636251541307, "step": 4740}, {"loss": 0.3171, "grad_norm": 1.1964094638824463, "learning_rate": 0.0002, "epoch": 5.856966707768187, "step": 4750}, {"loss": 0.3623, "grad_norm": 1.0833805799484253, "learning_rate": 0.0002, "epoch": 5.869297163995068, "step": 4760}, {"loss": 0.3511, "grad_norm": 1.0694046020507812, "learning_rate": 0.0002, "epoch": 5.881627620221948, "step": 4770}, {"loss": 0.3266, "grad_norm": 0.9947936534881592, "learning_rate": 0.0002, "epoch": 5.893958076448829, "step": 4780}, {"loss": 0.316, "grad_norm": 1.175716519355774, "learning_rate": 0.0002, "epoch": 5.906288532675709, "step": 4790}, {"loss": 0.3609, "grad_norm": 0.7717352509498596, "learning_rate": 0.0002, "epoch": 5.9186189889025895, "step": 4800}, {"loss": 0.3058, "grad_norm": 1.2906442880630493, "learning_rate": 0.0002, "epoch": 5.930949445129469, "step": 4810}, {"loss": 0.3187, "grad_norm": 1.2416284084320068, "learning_rate": 0.0002, "epoch": 5.94327990135635, "step": 4820}, {"loss": 0.337, "grad_norm": 1.3066956996917725, "learning_rate": 0.0002, "epoch": 5.955610357583231, "step": 4830}, {"loss": 0.3167, "grad_norm": 1.0872026681900024, "learning_rate": 0.0002, "epoch": 5.967940813810111, "step": 4840}, {"loss": 0.3262, "grad_norm": 1.1941101551055908, "learning_rate": 0.0002, "epoch": 5.980271270036991, "step": 4850}, {"loss": 0.3234, "grad_norm": 1.1126095056533813, "learning_rate": 0.0002, "epoch": 5.992601726263872, "step": 4860}]} +{"epoch": 7.0, "step": 5677, "epoch_duration": 2505.4287343025208, "total_accumulated_duration": 17099.209432840347, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}, {"eval_loss": 1.2936296463012695, "eval_runtime": 94.7897, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.58, "epoch": 3.0, "step": 2433}, {"loss": 0.6073, "grad_norm": 0.5562372803688049, "learning_rate": 0.0002, "epoch": 3.008631319358816, "step": 2440}, {"loss": 0.5181, "grad_norm": 1.133402705192566, "learning_rate": 0.0002, "epoch": 3.020961775585697, "step": 2450}, {"loss": 0.5333, "grad_norm": 0.6480470299720764, "learning_rate": 0.0002, "epoch": 3.033292231812577, "step": 2460}, {"loss": 0.4828, "grad_norm": 0.8989138007164001, "learning_rate": 0.0002, "epoch": 3.0456226880394572, "step": 2470}, {"loss": 0.5097, "grad_norm": 0.8257461786270142, "learning_rate": 0.0002, "epoch": 3.057953144266338, "step": 2480}, {"loss": 0.6229, "grad_norm": 0.6813381910324097, "learning_rate": 0.0002, "epoch": 3.070283600493218, "step": 2490}, {"loss": 0.531, "grad_norm": 0.6989586353302002, "learning_rate": 0.0002, "epoch": 3.082614056720099, "step": 2500}, {"loss": 0.54, "grad_norm": 0.7992092967033386, "learning_rate": 0.0002, "epoch": 3.094944512946979, "step": 2510}, {"loss": 0.5054, "grad_norm": 0.698077917098999, "learning_rate": 0.0002, "epoch": 3.1072749691738593, "step": 2520}, {"loss": 0.5064, "grad_norm": 0.5699033141136169, "learning_rate": 0.0002, "epoch": 3.11960542540074, "step": 2530}, {"loss": 0.6088, "grad_norm": 0.6142355799674988, "learning_rate": 0.0002, "epoch": 3.13193588162762, "step": 2540}, {"loss": 0.585, "grad_norm": 0.7089933753013611, "learning_rate": 0.0002, "epoch": 3.144266337854501, "step": 2550}, {"loss": 0.5373, "grad_norm": 1.0107015371322632, "learning_rate": 0.0002, "epoch": 3.156596794081381, "step": 2560}, {"loss": 0.5429, "grad_norm": 0.568138837814331, "learning_rate": 0.0002, "epoch": 3.1689272503082613, "step": 2570}, {"loss": 0.5897, "grad_norm": 0.9960416555404663, "learning_rate": 0.0002, "epoch": 3.181257706535142, "step": 2580}, {"loss": 0.5211, "grad_norm": 0.6277595162391663, "learning_rate": 0.0002, "epoch": 3.193588162762022, "step": 2590}, {"loss": 0.5787, "grad_norm": 0.681083619594574, "learning_rate": 0.0002, "epoch": 3.2059186189889024, "step": 2600}, {"loss": 0.5166, "grad_norm": 0.5816057324409485, "learning_rate": 0.0002, "epoch": 3.218249075215783, "step": 2610}, {"loss": 0.545, "grad_norm": 0.7197734117507935, "learning_rate": 0.0002, "epoch": 3.2305795314426633, "step": 2620}, {"loss": 0.614, "grad_norm": 0.6524068117141724, "learning_rate": 0.0002, "epoch": 3.242909987669544, "step": 2630}, {"loss": 0.5456, "grad_norm": 1.273668646812439, "learning_rate": 0.0002, "epoch": 3.255240443896424, "step": 2640}, {"loss": 0.5266, "grad_norm": 0.6950451731681824, "learning_rate": 0.0002, "epoch": 3.2675709001233044, "step": 2650}, {"loss": 0.5194, "grad_norm": 0.8029071688652039, "learning_rate": 0.0002, "epoch": 3.279901356350185, "step": 2660}, {"loss": 0.5729, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 3.2922318125770653, "step": 2670}, {"loss": 0.5366, "grad_norm": 0.8342001438140869, "learning_rate": 0.0002, "epoch": 3.304562268803946, "step": 2680}, {"loss": 0.5413, "grad_norm": 0.5629868507385254, "learning_rate": 0.0002, "epoch": 3.316892725030826, "step": 2690}, {"loss": 0.633, "grad_norm": 0.753999650478363, "learning_rate": 0.0002, "epoch": 3.3292231812577064, "step": 2700}, {"loss": 0.5048, "grad_norm": 1.0271371603012085, "learning_rate": 0.0002, "epoch": 3.341553637484587, "step": 2710}, {"loss": 0.5233, "grad_norm": 0.9608535170555115, "learning_rate": 0.0002, "epoch": 3.3538840937114673, "step": 2720}, {"loss": 0.5102, "grad_norm": 0.7796488404273987, "learning_rate": 0.0002, "epoch": 3.3662145499383476, "step": 2730}, {"loss": 0.5172, "grad_norm": 0.5666437149047852, "learning_rate": 0.0002, "epoch": 3.3785450061652282, "step": 2740}, {"loss": 0.491, "grad_norm": 0.5462956428527832, "learning_rate": 0.0002, "epoch": 3.3908754623921085, "step": 2750}, {"loss": 0.5855, "grad_norm": 1.289099097251892, "learning_rate": 0.0002, "epoch": 3.4032059186189887, "step": 2760}, {"loss": 0.635, "grad_norm": 0.825566828250885, "learning_rate": 0.0002, "epoch": 3.4155363748458694, "step": 2770}, {"loss": 0.4998, "grad_norm": 0.8366670608520508, "learning_rate": 0.0002, "epoch": 3.4278668310727496, "step": 2780}, {"loss": 0.5732, "grad_norm": 1.0931549072265625, "learning_rate": 0.0002, "epoch": 3.4401972872996303, "step": 2790}, {"loss": 0.6093, "grad_norm": 0.9228858351707458, "learning_rate": 0.0002, "epoch": 3.4525277435265105, "step": 2800}, {"loss": 0.6089, "grad_norm": 1.3182806968688965, "learning_rate": 0.0002, "epoch": 3.4648581997533907, "step": 2810}, {"loss": 0.5665, "grad_norm": 0.8366976380348206, "learning_rate": 0.0002, "epoch": 3.4771886559802714, "step": 2820}, {"loss": 0.5666, "grad_norm": 0.8067695498466492, "learning_rate": 0.0002, "epoch": 3.4895191122071516, "step": 2830}, {"loss": 0.579, "grad_norm": 1.1163437366485596, "learning_rate": 0.0002, "epoch": 3.5018495684340323, "step": 2840}, {"loss": 0.5785, "grad_norm": 1.7196556329727173, "learning_rate": 0.0002, "epoch": 3.5141800246609125, "step": 2850}, {"loss": 0.5346, "grad_norm": 1.1267012357711792, "learning_rate": 0.0002, "epoch": 3.5265104808877927, "step": 2860}, {"loss": 0.447, "grad_norm": 0.7220137119293213, "learning_rate": 0.0002, "epoch": 3.5388409371146734, "step": 2870}, {"loss": 0.6099, "grad_norm": 0.914114773273468, "learning_rate": 0.0002, "epoch": 3.5511713933415536, "step": 2880}, {"loss": 0.6143, "grad_norm": 0.6193503141403198, "learning_rate": 0.0002, "epoch": 3.563501849568434, "step": 2890}, {"loss": 0.5171, "grad_norm": 0.6060135960578918, "learning_rate": 0.0002, "epoch": 3.5758323057953145, "step": 2900}, {"loss": 0.5659, "grad_norm": 1.0177327394485474, "learning_rate": 0.0002, "epoch": 3.5881627620221948, "step": 2910}, {"loss": 0.5711, "grad_norm": 0.5994468331336975, "learning_rate": 0.0002, "epoch": 3.600493218249075, "step": 2920}, {"loss": 0.6373, "grad_norm": 0.7450457215309143, "learning_rate": 0.0002, "epoch": 3.6128236744759556, "step": 2930}, {"loss": 0.4933, "grad_norm": 0.5825870037078857, "learning_rate": 0.0002, "epoch": 3.625154130702836, "step": 2940}, {"loss": 0.6016, "grad_norm": 0.6289743781089783, "learning_rate": 0.0002, "epoch": 3.6374845869297165, "step": 2950}, {"loss": 0.5507, "grad_norm": 0.7801929116249084, "learning_rate": 0.0002, "epoch": 3.6498150431565968, "step": 2960}, {"loss": 0.5695, "grad_norm": 1.1206634044647217, "learning_rate": 0.0002, "epoch": 3.6621454993834774, "step": 2970}, {"loss": 0.4985, "grad_norm": 0.6738817691802979, "learning_rate": 0.0002, "epoch": 3.6744759556103577, "step": 2980}, {"loss": 0.6209, "grad_norm": 1.1917344331741333, "learning_rate": 0.0002, "epoch": 3.686806411837238, "step": 2990}, {"loss": 0.5373, "grad_norm": 1.3738657236099243, "learning_rate": 0.0002, "epoch": 3.6991368680641186, "step": 3000}, {"loss": 0.5467, "grad_norm": 0.6642793416976929, "learning_rate": 0.0002, "epoch": 3.711467324290999, "step": 3010}, {"loss": 0.6243, "grad_norm": 0.9030995965003967, "learning_rate": 0.0002, "epoch": 3.723797780517879, "step": 3020}, {"loss": 0.592, "grad_norm": 1.0203914642333984, "learning_rate": 0.0002, "epoch": 3.7361282367447597, "step": 3030}, {"loss": 0.5453, "grad_norm": 0.648394763469696, "learning_rate": 0.0002, "epoch": 3.74845869297164, "step": 3040}, {"loss": 0.498, "grad_norm": 0.6304570436477661, "learning_rate": 0.0002, "epoch": 3.76078914919852, "step": 3050}, {"loss": 0.683, "grad_norm": 0.8286601901054382, "learning_rate": 0.0002, "epoch": 3.773119605425401, "step": 3060}, {"loss": 0.5302, "grad_norm": 0.906444251537323, "learning_rate": 0.0002, "epoch": 3.785450061652281, "step": 3070}, {"loss": 0.5345, "grad_norm": 1.4212149381637573, "learning_rate": 0.0002, "epoch": 3.7977805178791613, "step": 3080}, {"loss": 0.6403, "grad_norm": 0.7574319839477539, "learning_rate": 0.0002, "epoch": 3.810110974106042, "step": 3090}, {"loss": 0.5756, "grad_norm": 0.6534451246261597, "learning_rate": 0.0002, "epoch": 3.822441430332922, "step": 3100}, {"loss": 0.5306, "grad_norm": 0.7525447010993958, "learning_rate": 0.0002, "epoch": 3.834771886559803, "step": 3110}, {"loss": 0.5368, "grad_norm": 0.6513990759849548, "learning_rate": 0.0002, "epoch": 3.847102342786683, "step": 3120}, {"loss": 0.5492, "grad_norm": 0.7782694697380066, "learning_rate": 0.0002, "epoch": 3.8594327990135637, "step": 3130}, {"loss": 0.5727, "grad_norm": 0.7998530268669128, "learning_rate": 0.0002, "epoch": 3.871763255240444, "step": 3140}, {"loss": 0.5156, "grad_norm": 0.8045353293418884, "learning_rate": 0.0002, "epoch": 3.884093711467324, "step": 3150}, {"loss": 0.5341, "grad_norm": 0.8242645263671875, "learning_rate": 0.0002, "epoch": 3.896424167694205, "step": 3160}, {"loss": 0.5563, "grad_norm": 0.8302360773086548, "learning_rate": 0.0002, "epoch": 3.908754623921085, "step": 3170}, {"loss": 0.5793, "grad_norm": 0.8653109073638916, "learning_rate": 0.0002, "epoch": 3.9210850801479653, "step": 3180}, {"loss": 0.5219, "grad_norm": 0.6461338996887207, "learning_rate": 0.0002, "epoch": 3.933415536374846, "step": 3190}, {"loss": 0.6009, "grad_norm": 0.8267415165901184, "learning_rate": 0.0002, "epoch": 3.945745992601726, "step": 3200}, {"loss": 0.5956, "grad_norm": 1.1963194608688354, "learning_rate": 0.0002, "epoch": 3.9580764488286064, "step": 3210}, {"loss": 0.5692, "grad_norm": 0.7101966142654419, "learning_rate": 0.0002, "epoch": 3.970406905055487, "step": 3220}, {"loss": 0.5471, "grad_norm": 0.5931660532951355, "learning_rate": 0.0002, "epoch": 3.9827373612823673, "step": 3230}, {"loss": 0.5619, "grad_norm": 0.7465988993644714, "learning_rate": 0.0002, "epoch": 3.995067817509248, "step": 3240}, {"eval_loss": 1.4066498279571533, "eval_runtime": 95.7145, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 3244}, {"loss": 0.4948, "grad_norm": 0.9478800296783447, "learning_rate": 0.0002, "epoch": 4.007398273736128, "step": 3250}, {"loss": 0.4129, "grad_norm": 1.207059621810913, "learning_rate": 0.0002, "epoch": 4.019728729963009, "step": 3260}, {"loss": 0.3577, "grad_norm": 0.8984074592590332, "learning_rate": 0.0002, "epoch": 4.032059186189889, "step": 3270}, {"loss": 0.3798, "grad_norm": 0.8104140758514404, "learning_rate": 0.0002, "epoch": 4.044389642416769, "step": 3280}, {"loss": 0.3657, "grad_norm": 1.0875468254089355, "learning_rate": 0.0002, "epoch": 4.05672009864365, "step": 3290}, {"loss": 0.3703, "grad_norm": 0.8520309329032898, "learning_rate": 0.0002, "epoch": 4.06905055487053, "step": 3300}, {"loss": 0.3933, "grad_norm": 1.076735496520996, "learning_rate": 0.0002, "epoch": 4.0813810110974105, "step": 3310}, {"loss": 0.4422, "grad_norm": 0.7789369821548462, "learning_rate": 0.0002, "epoch": 4.093711467324291, "step": 3320}, {"loss": 0.4009, "grad_norm": 0.916862964630127, "learning_rate": 0.0002, "epoch": 4.106041923551172, "step": 3330}, {"loss": 0.3934, "grad_norm": 1.1251654624938965, "learning_rate": 0.0002, "epoch": 4.118372379778052, "step": 3340}, {"loss": 0.3651, "grad_norm": 0.9373420476913452, "learning_rate": 0.0002, "epoch": 4.130702836004932, "step": 3350}, {"loss": 0.384, "grad_norm": 1.03253972530365, "learning_rate": 0.0002, "epoch": 4.143033292231813, "step": 3360}, {"loss": 0.372, "grad_norm": 0.947023332118988, "learning_rate": 0.0002, "epoch": 4.155363748458693, "step": 3370}, {"loss": 0.4018, "grad_norm": 0.8709157109260559, "learning_rate": 0.0002, "epoch": 4.167694204685573, "step": 3380}, {"loss": 0.3754, "grad_norm": 0.930983304977417, "learning_rate": 0.0002, "epoch": 4.180024660912454, "step": 3390}, {"loss": 0.4248, "grad_norm": 1.092809796333313, "learning_rate": 0.0002, "epoch": 4.192355117139334, "step": 3400}, {"loss": 0.4453, "grad_norm": 0.8454303741455078, "learning_rate": 0.0002, "epoch": 4.2046855733662145, "step": 3410}, {"loss": 0.4198, "grad_norm": 0.957210123538971, "learning_rate": 0.0002, "epoch": 4.217016029593095, "step": 3420}, {"loss": 0.3743, "grad_norm": 0.854333758354187, "learning_rate": 0.0002, "epoch": 4.229346485819975, "step": 3430}, {"loss": 0.4041, "grad_norm": 1.0457639694213867, "learning_rate": 0.0002, "epoch": 4.241676942046856, "step": 3440}, {"loss": 0.3817, "grad_norm": 0.8972977995872498, "learning_rate": 0.0002, "epoch": 4.254007398273736, "step": 3450}, {"loss": 0.4445, "grad_norm": 1.0438238382339478, "learning_rate": 0.0002, "epoch": 4.266337854500616, "step": 3460}, {"loss": 0.4078, "grad_norm": 0.7000405192375183, "learning_rate": 0.0002, "epoch": 4.278668310727497, "step": 3470}, {"loss": 0.3718, "grad_norm": 1.0451240539550781, "learning_rate": 0.0002, "epoch": 4.290998766954377, "step": 3480}, {"loss": 0.4506, "grad_norm": 1.3339767456054688, "learning_rate": 0.0002, "epoch": 4.303329223181258, "step": 3490}, {"loss": 0.3999, "grad_norm": 0.7503946423530579, "learning_rate": 0.0002, "epoch": 4.315659679408138, "step": 3500}, {"loss": 0.4503, "grad_norm": 0.8443584442138672, "learning_rate": 0.0002, "epoch": 4.3279901356350186, "step": 3510}, {"loss": 0.3793, "grad_norm": 1.1681201457977295, "learning_rate": 0.0002, "epoch": 4.340320591861899, "step": 3520}, {"loss": 0.4462, "grad_norm": 1.078883171081543, "learning_rate": 0.0002, "epoch": 4.352651048088779, "step": 3530}, {"loss": 0.4216, "grad_norm": 0.6894834041595459, "learning_rate": 0.0002, "epoch": 4.36498150431566, "step": 3540}, {"loss": 0.4315, "grad_norm": 0.7059480547904968, "learning_rate": 0.0002, "epoch": 4.37731196054254, "step": 3550}, {"loss": 0.3821, "grad_norm": 1.1807256937026978, "learning_rate": 0.0002, "epoch": 4.38964241676942, "step": 3560}, {"loss": 0.4192, "grad_norm": 0.8341359496116638, "learning_rate": 0.0002, "epoch": 4.401972872996301, "step": 3570}, {"loss": 0.4123, "grad_norm": 1.0273033380508423, "learning_rate": 0.0002, "epoch": 4.4143033292231815, "step": 3580}, {"loss": 0.5018, "grad_norm": 0.6916454434394836, "learning_rate": 0.0002, "epoch": 4.426633785450061, "step": 3590}, {"loss": 0.3909, "grad_norm": 0.8210113644599915, "learning_rate": 0.0002, "epoch": 4.438964241676942, "step": 3600}, {"loss": 0.3893, "grad_norm": 1.0309500694274902, "learning_rate": 0.0002, "epoch": 4.451294697903823, "step": 3610}, {"loss": 0.3902, "grad_norm": 0.8847399353981018, "learning_rate": 0.0002, "epoch": 4.463625154130703, "step": 3620}, {"loss": 0.4198, "grad_norm": 1.668636679649353, "learning_rate": 0.0002, "epoch": 4.475955610357583, "step": 3630}, {"loss": 0.4075, "grad_norm": 1.3087958097457886, "learning_rate": 0.0002, "epoch": 4.488286066584464, "step": 3640}, {"loss": 0.4294, "grad_norm": 0.837852418422699, "learning_rate": 0.0002, "epoch": 4.500616522811344, "step": 3650}, {"loss": 0.4053, "grad_norm": 9.7662353515625, "learning_rate": 0.0002, "epoch": 4.512946979038224, "step": 3660}, {"loss": 0.4033, "grad_norm": 1.125719428062439, "learning_rate": 0.0002, "epoch": 4.525277435265105, "step": 3670}, {"loss": 0.4566, "grad_norm": 0.7755377292633057, "learning_rate": 0.0002, "epoch": 4.5376078914919855, "step": 3680}, {"loss": 0.4415, "grad_norm": 0.7185089588165283, "learning_rate": 0.0002, "epoch": 4.549938347718865, "step": 3690}, {"loss": 0.4616, "grad_norm": 1.182063102722168, "learning_rate": 0.0002, "epoch": 4.562268803945746, "step": 3700}, {"loss": 0.4572, "grad_norm": 1.001197338104248, "learning_rate": 0.0002, "epoch": 4.574599260172627, "step": 3710}, {"loss": 0.4493, "grad_norm": 0.9705429077148438, "learning_rate": 0.0002, "epoch": 4.586929716399506, "step": 3720}, {"loss": 0.42, "grad_norm": 0.7136746048927307, "learning_rate": 0.0002, "epoch": 4.599260172626387, "step": 3730}, {"loss": 0.3757, "grad_norm": 1.0004864931106567, "learning_rate": 0.0002, "epoch": 4.611590628853268, "step": 3740}, {"loss": 0.4418, "grad_norm": 1.3193715810775757, "learning_rate": 0.0002, "epoch": 4.623921085080148, "step": 3750}, {"loss": 0.4572, "grad_norm": 0.6945042014122009, "learning_rate": 0.0002, "epoch": 4.636251541307028, "step": 3760}, {"loss": 0.4255, "grad_norm": 0.8903936743736267, "learning_rate": 0.0002, "epoch": 4.648581997533909, "step": 3770}, {"loss": 0.3582, "grad_norm": 0.7960889339447021, "learning_rate": 0.0002, "epoch": 4.660912453760789, "step": 3780}, {"loss": 0.3864, "grad_norm": 1.0439172983169556, "learning_rate": 0.0002, "epoch": 4.673242909987669, "step": 3790}, {"loss": 0.4378, "grad_norm": 1.4546219110488892, "learning_rate": 0.0002, "epoch": 4.68557336621455, "step": 3800}, {"loss": 0.4191, "grad_norm": 0.8194343447685242, "learning_rate": 0.0002, "epoch": 4.697903822441431, "step": 3810}, {"loss": 0.4473, "grad_norm": 1.0727602243423462, "learning_rate": 0.0002, "epoch": 4.7102342786683105, "step": 3820}, {"loss": 0.4021, "grad_norm": 0.7785195708274841, "learning_rate": 0.0002, "epoch": 4.722564734895191, "step": 3830}, {"loss": 0.4252, "grad_norm": 0.846783459186554, "learning_rate": 0.0002, "epoch": 4.734895191122072, "step": 3840}, {"loss": 0.4647, "grad_norm": 1.0481648445129395, "learning_rate": 0.0002, "epoch": 4.747225647348952, "step": 3850}, {"loss": 0.4944, "grad_norm": 0.7324008941650391, "learning_rate": 0.0002, "epoch": 4.759556103575832, "step": 3860}, {"loss": 0.3831, "grad_norm": 1.06382417678833, "learning_rate": 0.0002, "epoch": 4.771886559802713, "step": 3870}, {"loss": 0.3934, "grad_norm": 0.9851241111755371, "learning_rate": 0.0002, "epoch": 4.784217016029594, "step": 3880}, {"loss": 0.5172, "grad_norm": 0.8215277791023254, "learning_rate": 0.0002, "epoch": 4.796547472256473, "step": 3890}, {"loss": 0.4437, "grad_norm": 0.9901723861694336, "learning_rate": 0.0002, "epoch": 4.808877928483354, "step": 3900}, {"loss": 0.4673, "grad_norm": 0.9149112701416016, "learning_rate": 0.0002, "epoch": 4.821208384710234, "step": 3910}, {"loss": 0.4295, "grad_norm": 0.9772973656654358, "learning_rate": 0.0002, "epoch": 4.8335388409371145, "step": 3920}, {"loss": 0.4346, "grad_norm": 0.8889636397361755, "learning_rate": 0.0002, "epoch": 4.845869297163995, "step": 3930}, {"loss": 0.421, "grad_norm": 1.3032807111740112, "learning_rate": 0.0002, "epoch": 4.858199753390876, "step": 3940}, {"loss": 0.434, "grad_norm": 0.8575899600982666, "learning_rate": 0.0002, "epoch": 4.870530209617756, "step": 3950}, {"loss": 0.4295, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 4.882860665844636, "step": 3960}, {"loss": 0.3633, "grad_norm": 1.041210651397705, "learning_rate": 0.0002, "epoch": 4.895191122071517, "step": 3970}, {"loss": 0.4104, "grad_norm": 0.9113056063652039, "learning_rate": 0.0002, "epoch": 4.907521578298397, "step": 3980}, {"loss": 0.4496, "grad_norm": 1.019347906112671, "learning_rate": 0.0002, "epoch": 4.919852034525277, "step": 3990}, {"loss": 0.457, "grad_norm": 0.7709218859672546, "learning_rate": 0.0002, "epoch": 4.932182490752158, "step": 4000}, {"loss": 0.4697, "grad_norm": 0.8891775608062744, "learning_rate": 0.0002, "epoch": 4.944512946979038, "step": 4010}, {"loss": 0.4436, "grad_norm": 1.0396920442581177, "learning_rate": 0.0002, "epoch": 4.9568434032059185, "step": 4020}, {"loss": 0.4251, "grad_norm": 0.9239833354949951, "learning_rate": 0.0002, "epoch": 4.969173859432799, "step": 4030}, {"loss": 0.5049, "grad_norm": 1.801400065422058, "learning_rate": 0.0002, "epoch": 4.981504315659679, "step": 4040}, {"loss": 0.4481, "grad_norm": 0.6194164752960205, "learning_rate": 0.0002, "epoch": 4.99383477188656, "step": 4050}, {"eval_loss": 1.544758915901184, "eval_runtime": 96.2573, "eval_samples_per_second": 4.53, "eval_steps_per_second": 0.571, "epoch": 5.0, "step": 4055}, {"loss": 0.3774, "grad_norm": 0.9918256998062134, "learning_rate": 0.0002, "epoch": 5.00616522811344, "step": 4060}, {"loss": 0.2887, "grad_norm": 1.4851351976394653, "learning_rate": 0.0002, "epoch": 5.018495684340321, "step": 4070}, {"loss": 0.2454, "grad_norm": 0.9237686395645142, "learning_rate": 0.0002, "epoch": 5.030826140567201, "step": 4080}, {"loss": 0.3072, "grad_norm": 1.2180852890014648, "learning_rate": 0.0002, "epoch": 5.0431565967940815, "step": 4090}, {"loss": 0.282, "grad_norm": 1.1247979402542114, "learning_rate": 0.0002, "epoch": 5.055487053020962, "step": 4100}, {"loss": 0.3108, "grad_norm": 1.2969884872436523, "learning_rate": 0.0002, "epoch": 5.067817509247842, "step": 4110}, {"loss": 0.2858, "grad_norm": 1.0183063745498657, "learning_rate": 0.0002, "epoch": 5.080147965474723, "step": 4120}, {"loss": 0.295, "grad_norm": 1.121330738067627, "learning_rate": 0.0002, "epoch": 5.092478421701603, "step": 4130}, {"loss": 0.2697, "grad_norm": 1.0748186111450195, "learning_rate": 0.0002, "epoch": 5.104808877928483, "step": 4140}, {"loss": 0.3414, "grad_norm": 1.103474736213684, "learning_rate": 0.0002, "epoch": 5.117139334155364, "step": 4150}, {"loss": 0.305, "grad_norm": 1.2251166105270386, "learning_rate": 0.0002, "epoch": 5.129469790382244, "step": 4160}, {"loss": 0.3131, "grad_norm": 0.920898973941803, "learning_rate": 0.0002, "epoch": 5.141800246609124, "step": 4170}, {"loss": 0.281, "grad_norm": 1.327542781829834, "learning_rate": 0.0002, "epoch": 5.154130702836005, "step": 4180}, {"loss": 0.3214, "grad_norm": 1.0677192211151123, "learning_rate": 0.0002, "epoch": 5.1664611590628855, "step": 4190}, {"loss": 0.2863, "grad_norm": 0.897241473197937, "learning_rate": 0.0002, "epoch": 5.178791615289766, "step": 4200}, {"loss": 0.2967, "grad_norm": 0.977457582950592, "learning_rate": 0.0002, "epoch": 5.191122071516646, "step": 4210}, {"loss": 0.3032, "grad_norm": 1.4115267992019653, "learning_rate": 0.0002, "epoch": 5.203452527743527, "step": 4220}, {"loss": 0.3279, "grad_norm": 1.097743034362793, "learning_rate": 0.0002, "epoch": 5.215782983970407, "step": 4230}, {"loss": 0.293, "grad_norm": 1.1095269918441772, "learning_rate": 0.0002, "epoch": 5.228113440197287, "step": 4240}, {"loss": 0.3544, "grad_norm": 1.3785479068756104, "learning_rate": 0.0002, "epoch": 5.240443896424168, "step": 4250}, {"loss": 0.3118, "grad_norm": 1.0298776626586914, "learning_rate": 0.0002, "epoch": 5.252774352651048, "step": 4260}, {"loss": 0.296, "grad_norm": 1.1592111587524414, "learning_rate": 0.0002, "epoch": 5.265104808877928, "step": 4270}, {"loss": 0.2878, "grad_norm": 1.2355743646621704, "learning_rate": 0.0002, "epoch": 5.277435265104809, "step": 4280}, {"loss": 0.3085, "grad_norm": 0.8543112874031067, "learning_rate": 0.0002, "epoch": 5.2897657213316895, "step": 4290}, {"loss": 0.3108, "grad_norm": 1.2953215837478638, "learning_rate": 0.0002, "epoch": 5.302096177558569, "step": 4300}, {"loss": 0.2912, "grad_norm": 1.1001787185668945, "learning_rate": 0.0002, "epoch": 5.31442663378545, "step": 4310}, {"loss": 0.3003, "grad_norm": 0.7476816773414612, "learning_rate": 0.0002, "epoch": 5.326757090012331, "step": 4320}, {"loss": 0.3247, "grad_norm": 0.8195574283599854, "learning_rate": 0.0002, "epoch": 5.3390875462392104, "step": 4330}, {"loss": 0.3035, "grad_norm": 0.9490262866020203, "learning_rate": 0.0002, "epoch": 5.351418002466091, "step": 4340}, {"loss": 0.2846, "grad_norm": 1.2201412916183472, "learning_rate": 0.0002, "epoch": 5.363748458692972, "step": 4350}, {"loss": 0.2644, "grad_norm": 1.0311479568481445, "learning_rate": 0.0002, "epoch": 5.376078914919852, "step": 4360}, {"loss": 0.3104, "grad_norm": 1.2097488641738892, "learning_rate": 0.0002, "epoch": 5.388409371146732, "step": 4370}, {"loss": 0.2977, "grad_norm": 1.140942096710205, "learning_rate": 0.0002, "epoch": 5.400739827373613, "step": 4380}, {"loss": 0.2975, "grad_norm": 0.8091890811920166, "learning_rate": 0.0002, "epoch": 5.413070283600494, "step": 4390}, {"loss": 0.3727, "grad_norm": 1.4467964172363281, "learning_rate": 0.0002, "epoch": 5.425400739827373, "step": 4400}, {"loss": 0.2979, "grad_norm": 1.0836058855056763, "learning_rate": 0.0002, "epoch": 5.437731196054254, "step": 4410}, {"loss": 0.2601, "grad_norm": 1.0515433549880981, "learning_rate": 0.0002, "epoch": 5.450061652281135, "step": 4420}, {"loss": 0.315, "grad_norm": 0.9603073000907898, "learning_rate": 0.0002, "epoch": 5.4623921085080145, "step": 4430}, {"loss": 0.3166, "grad_norm": 1.234609842300415, "learning_rate": 0.0002, "epoch": 5.474722564734895, "step": 4440}, {"loss": 0.3142, "grad_norm": 0.8881428837776184, "learning_rate": 0.0002, "epoch": 5.487053020961776, "step": 4450}, {"loss": 0.3725, "grad_norm": 1.1817275285720825, "learning_rate": 0.0002, "epoch": 5.499383477188656, "step": 4460}, {"loss": 0.2944, "grad_norm": 1.213993787765503, "learning_rate": 0.0002, "epoch": 5.511713933415536, "step": 4470}, {"loss": 0.3136, "grad_norm": 1.0501725673675537, "learning_rate": 0.0002, "epoch": 5.524044389642417, "step": 4480}, {"loss": 0.306, "grad_norm": 1.5061579942703247, "learning_rate": 0.0002, "epoch": 5.536374845869297, "step": 4490}, {"loss": 0.3226, "grad_norm": 1.1171475648880005, "learning_rate": 0.0002, "epoch": 5.548705302096177, "step": 4500}, {"loss": 0.3624, "grad_norm": 1.1147594451904297, "learning_rate": 0.0002, "epoch": 5.561035758323058, "step": 4510}, {"loss": 0.3435, "grad_norm": 1.0600544214248657, "learning_rate": 0.0002, "epoch": 5.573366214549939, "step": 4520}, {"loss": 0.3268, "grad_norm": 1.247870922088623, "learning_rate": 0.0002, "epoch": 5.5856966707768185, "step": 4530}, {"loss": 0.3168, "grad_norm": 0.9425561428070068, "learning_rate": 0.0002, "epoch": 5.598027127003699, "step": 4540}, {"loss": 0.3119, "grad_norm": 1.1111550331115723, "learning_rate": 0.0002, "epoch": 5.61035758323058, "step": 4550}, {"loss": 0.3389, "grad_norm": 1.743268609046936, "learning_rate": 0.0002, "epoch": 5.62268803945746, "step": 4560}, {"loss": 0.31, "grad_norm": 1.3522645235061646, "learning_rate": 0.0002, "epoch": 5.63501849568434, "step": 4570}, {"loss": 0.3121, "grad_norm": 0.7354221343994141, "learning_rate": 0.0002, "epoch": 5.647348951911221, "step": 4580}, {"loss": 0.3693, "grad_norm": 1.050743818283081, "learning_rate": 0.0002, "epoch": 5.659679408138101, "step": 4590}, {"loss": 0.3449, "grad_norm": 1.1302396059036255, "learning_rate": 0.0002, "epoch": 5.6720098643649814, "step": 4600}, {"loss": 0.3211, "grad_norm": 0.8774183392524719, "learning_rate": 0.0002, "epoch": 5.684340320591862, "step": 4610}, {"loss": 0.3501, "grad_norm": 1.090781569480896, "learning_rate": 0.0002, "epoch": 5.696670776818742, "step": 4620}, {"loss": 0.3211, "grad_norm": 0.9177733063697815, "learning_rate": 0.0002, "epoch": 5.709001233045623, "step": 4630}, {"loss": 0.3439, "grad_norm": 0.9985341429710388, "learning_rate": 0.0002, "epoch": 5.721331689272503, "step": 4640}, {"loss": 0.3323, "grad_norm": 1.0230613946914673, "learning_rate": 0.0002, "epoch": 5.733662145499384, "step": 4650}, {"loss": 0.3525, "grad_norm": 0.944656252861023, "learning_rate": 0.0002, "epoch": 5.745992601726264, "step": 4660}, {"loss": 0.3191, "grad_norm": 0.8162471652030945, "learning_rate": 0.0002, "epoch": 5.758323057953144, "step": 4670}, {"loss": 0.4011, "grad_norm": 1.0500398874282837, "learning_rate": 0.0002, "epoch": 5.770653514180025, "step": 4680}, {"loss": 0.3452, "grad_norm": 0.9487981796264648, "learning_rate": 0.0002, "epoch": 5.782983970406905, "step": 4690}, {"loss": 0.2942, "grad_norm": 1.1856540441513062, "learning_rate": 0.0002, "epoch": 5.7953144266337855, "step": 4700}, {"loss": 0.3107, "grad_norm": 1.2583396434783936, "learning_rate": 0.0002, "epoch": 5.807644882860666, "step": 4710}, {"loss": 0.3223, "grad_norm": 1.2532602548599243, "learning_rate": 0.0002, "epoch": 5.819975339087546, "step": 4720}, {"loss": 0.3253, "grad_norm": 1.115236520767212, "learning_rate": 0.0002, "epoch": 5.832305795314427, "step": 4730}, {"loss": 0.3539, "grad_norm": 1.2245537042617798, "learning_rate": 0.0002, "epoch": 5.844636251541307, "step": 4740}, {"loss": 0.3171, "grad_norm": 1.1964094638824463, "learning_rate": 0.0002, "epoch": 5.856966707768187, "step": 4750}, {"loss": 0.3623, "grad_norm": 1.0833805799484253, "learning_rate": 0.0002, "epoch": 5.869297163995068, "step": 4760}, {"loss": 0.3511, "grad_norm": 1.0694046020507812, "learning_rate": 0.0002, "epoch": 5.881627620221948, "step": 4770}, {"loss": 0.3266, "grad_norm": 0.9947936534881592, "learning_rate": 0.0002, "epoch": 5.893958076448829, "step": 4780}, {"loss": 0.316, "grad_norm": 1.175716519355774, "learning_rate": 0.0002, "epoch": 5.906288532675709, "step": 4790}, {"loss": 0.3609, "grad_norm": 0.7717352509498596, "learning_rate": 0.0002, "epoch": 5.9186189889025895, "step": 4800}, {"loss": 0.3058, "grad_norm": 1.2906442880630493, "learning_rate": 0.0002, "epoch": 5.930949445129469, "step": 4810}, {"loss": 0.3187, "grad_norm": 1.2416284084320068, "learning_rate": 0.0002, "epoch": 5.94327990135635, "step": 4820}, {"loss": 0.337, "grad_norm": 1.3066956996917725, "learning_rate": 0.0002, "epoch": 5.955610357583231, "step": 4830}, {"loss": 0.3167, "grad_norm": 1.0872026681900024, "learning_rate": 0.0002, "epoch": 5.967940813810111, "step": 4840}, {"loss": 0.3262, "grad_norm": 1.1941101551055908, "learning_rate": 0.0002, "epoch": 5.980271270036991, "step": 4850}, {"loss": 0.3234, "grad_norm": 1.1126095056533813, "learning_rate": 0.0002, "epoch": 5.992601726263872, "step": 4860}, {"eval_loss": 1.748323917388916, "eval_runtime": 97.7488, "eval_samples_per_second": 4.46, "eval_steps_per_second": 0.563, "epoch": 6.0, "step": 4866}, {"loss": 0.2774, "grad_norm": 1.3631165027618408, "learning_rate": 0.0002, "epoch": 6.0049321824907524, "step": 4870}, {"loss": 0.2399, "grad_norm": 1.2631664276123047, "learning_rate": 0.0002, "epoch": 6.017262638717632, "step": 4880}, {"loss": 0.2177, "grad_norm": 0.7073080539703369, "learning_rate": 0.0002, "epoch": 6.029593094944513, "step": 4890}, {"loss": 0.215, "grad_norm": 0.7856091856956482, "learning_rate": 0.0002, "epoch": 6.041923551171394, "step": 4900}, {"loss": 0.1999, "grad_norm": 1.145540475845337, "learning_rate": 0.0002, "epoch": 6.054254007398273, "step": 4910}, {"loss": 0.2084, "grad_norm": 1.1742334365844727, "learning_rate": 0.0002, "epoch": 6.066584463625154, "step": 4920}, {"loss": 0.2342, "grad_norm": 0.8043994903564453, "learning_rate": 0.0002, "epoch": 6.078914919852035, "step": 4930}, {"loss": 0.2454, "grad_norm": 1.1877652406692505, "learning_rate": 0.0002, "epoch": 6.0912453760789145, "step": 4940}, {"loss": 0.1908, "grad_norm": 0.7624953985214233, "learning_rate": 0.0002, "epoch": 6.103575832305795, "step": 4950}, {"loss": 0.2254, "grad_norm": 1.0403119325637817, "learning_rate": 0.0002, "epoch": 6.115906288532676, "step": 4960}, {"loss": 0.2274, "grad_norm": 1.2040252685546875, "learning_rate": 0.0002, "epoch": 6.1282367447595565, "step": 4970}, {"loss": 0.2199, "grad_norm": 0.6242546439170837, "learning_rate": 0.0002, "epoch": 6.140567200986436, "step": 4980}, {"loss": 0.27, "grad_norm": 1.1394767761230469, "learning_rate": 0.0002, "epoch": 6.152897657213317, "step": 4990}, {"loss": 0.2377, "grad_norm": 1.3760257959365845, "learning_rate": 0.0002, "epoch": 6.165228113440198, "step": 5000}, {"loss": 0.2331, "grad_norm": 1.0707697868347168, "learning_rate": 0.0002, "epoch": 6.177558569667077, "step": 5010}, {"loss": 0.2311, "grad_norm": 1.288072109222412, "learning_rate": 0.0002, "epoch": 6.189889025893958, "step": 5020}, {"loss": 0.2276, "grad_norm": 1.1479463577270508, "learning_rate": 0.0002, "epoch": 6.202219482120839, "step": 5030}, {"loss": 0.2294, "grad_norm": 0.905891478061676, "learning_rate": 0.0002, "epoch": 6.2145499383477185, "step": 5040}, {"loss": 0.2575, "grad_norm": 1.0354516506195068, "learning_rate": 0.0002, "epoch": 6.226880394574599, "step": 5050}, {"loss": 0.2259, "grad_norm": 1.312671184539795, "learning_rate": 0.0002, "epoch": 6.23921085080148, "step": 5060}, {"loss": 0.2281, "grad_norm": 1.614709734916687, "learning_rate": 0.0002, "epoch": 6.25154130702836, "step": 5070}, {"loss": 0.2388, "grad_norm": 1.0864229202270508, "learning_rate": 0.0002, "epoch": 6.26387176325524, "step": 5080}, {"loss": 0.2014, "grad_norm": 1.0401391983032227, "learning_rate": 0.0002, "epoch": 6.276202219482121, "step": 5090}, {"loss": 0.2419, "grad_norm": 1.2187728881835938, "learning_rate": 0.0002, "epoch": 6.288532675709002, "step": 5100}, {"loss": 0.2144, "grad_norm": 0.9474364519119263, "learning_rate": 0.0002, "epoch": 6.300863131935881, "step": 5110}, {"loss": 0.238, "grad_norm": 1.1228716373443604, "learning_rate": 0.0002, "epoch": 6.313193588162762, "step": 5120}, {"loss": 0.2556, "grad_norm": 0.9294499754905701, "learning_rate": 0.0002, "epoch": 6.325524044389643, "step": 5130}, {"loss": 0.2384, "grad_norm": 1.0521048307418823, "learning_rate": 0.0002, "epoch": 6.337854500616523, "step": 5140}, {"loss": 0.2444, "grad_norm": 1.2406890392303467, "learning_rate": 0.0002, "epoch": 6.350184956843403, "step": 5150}, {"loss": 0.2301, "grad_norm": 1.2972853183746338, "learning_rate": 0.0002, "epoch": 6.362515413070284, "step": 5160}, {"loss": 0.2574, "grad_norm": 0.8772842288017273, "learning_rate": 0.0002, "epoch": 6.374845869297164, "step": 5170}, {"loss": 0.2337, "grad_norm": 1.050349473953247, "learning_rate": 0.0002, "epoch": 6.387176325524044, "step": 5180}, {"loss": 0.2593, "grad_norm": 0.9432134032249451, "learning_rate": 0.0002, "epoch": 6.399506781750925, "step": 5190}, {"loss": 0.2546, "grad_norm": 1.11045241355896, "learning_rate": 0.0002, "epoch": 6.411837237977805, "step": 5200}, {"loss": 0.268, "grad_norm": 1.117530345916748, "learning_rate": 0.0002, "epoch": 6.4241676942046855, "step": 5210}, {"loss": 0.25, "grad_norm": 1.4194035530090332, "learning_rate": 0.0002, "epoch": 6.436498150431566, "step": 5220}, {"loss": 0.2335, "grad_norm": 1.063950777053833, "learning_rate": 0.0002, "epoch": 6.448828606658447, "step": 5230}, {"loss": 0.2299, "grad_norm": 1.2946349382400513, "learning_rate": 0.0002, "epoch": 6.461159062885327, "step": 5240}, {"loss": 0.242, "grad_norm": 1.5237880945205688, "learning_rate": 0.0002, "epoch": 6.473489519112207, "step": 5250}, {"loss": 0.255, "grad_norm": 1.1915720701217651, "learning_rate": 0.0002, "epoch": 6.485819975339088, "step": 5260}, {"loss": 0.2357, "grad_norm": 1.0779626369476318, "learning_rate": 0.0002, "epoch": 6.498150431565968, "step": 5270}, {"loss": 0.2476, "grad_norm": 0.8255738019943237, "learning_rate": 0.0002, "epoch": 6.510480887792848, "step": 5280}, {"loss": 0.267, "grad_norm": 1.275174856185913, "learning_rate": 0.0002, "epoch": 6.522811344019729, "step": 5290}, {"loss": 0.2217, "grad_norm": 1.0878815650939941, "learning_rate": 0.0002, "epoch": 6.535141800246609, "step": 5300}, {"loss": 0.2462, "grad_norm": 1.2594236135482788, "learning_rate": 0.0002, "epoch": 6.5474722564734895, "step": 5310}, {"loss": 0.2457, "grad_norm": 0.9919610619544983, "learning_rate": 0.0002, "epoch": 6.55980271270037, "step": 5320}, {"loss": 0.2933, "grad_norm": 1.3703680038452148, "learning_rate": 0.0002, "epoch": 6.57213316892725, "step": 5330}, {"loss": 0.2453, "grad_norm": 1.403140902519226, "learning_rate": 0.0002, "epoch": 6.584463625154131, "step": 5340}, {"loss": 0.2584, "grad_norm": 1.3477165699005127, "learning_rate": 0.0002, "epoch": 6.596794081381011, "step": 5350}, {"loss": 0.2853, "grad_norm": 1.3145594596862793, "learning_rate": 0.0002, "epoch": 6.609124537607892, "step": 5360}, {"loss": 0.246, "grad_norm": 0.9048973321914673, "learning_rate": 0.0002, "epoch": 6.621454993834772, "step": 5370}, {"loss": 0.2646, "grad_norm": 1.4123972654342651, "learning_rate": 0.0002, "epoch": 6.633785450061652, "step": 5380}, {"loss": 0.272, "grad_norm": 1.3584848642349243, "learning_rate": 0.0002, "epoch": 6.646115906288532, "step": 5390}, {"loss": 0.2663, "grad_norm": 1.2085801362991333, "learning_rate": 0.0002, "epoch": 6.658446362515413, "step": 5400}, {"loss": 0.2796, "grad_norm": 1.9293283224105835, "learning_rate": 0.0002, "epoch": 6.670776818742294, "step": 5410}, {"loss": 0.2412, "grad_norm": 1.3658782243728638, "learning_rate": 0.0002, "epoch": 6.683107274969174, "step": 5420}, {"loss": 0.2442, "grad_norm": 1.2004997730255127, "learning_rate": 0.0002, "epoch": 6.695437731196054, "step": 5430}, {"loss": 0.2693, "grad_norm": 1.0671268701553345, "learning_rate": 0.0002, "epoch": 6.707768187422935, "step": 5440}, {"loss": 0.2216, "grad_norm": 0.8877466320991516, "learning_rate": 0.0002, "epoch": 6.720098643649815, "step": 5450}, {"loss": 0.2678, "grad_norm": 1.2843106985092163, "learning_rate": 0.0002, "epoch": 6.732429099876695, "step": 5460}, {"loss": 0.2418, "grad_norm": 1.0663448572158813, "learning_rate": 0.0002, "epoch": 6.744759556103576, "step": 5470}, {"loss": 0.2402, "grad_norm": 1.3155773878097534, "learning_rate": 0.0002, "epoch": 6.7570900123304565, "step": 5480}, {"loss": 0.2559, "grad_norm": 1.8862448930740356, "learning_rate": 0.0002, "epoch": 6.769420468557336, "step": 5490}, {"loss": 0.2651, "grad_norm": 1.165061116218567, "learning_rate": 0.0002, "epoch": 6.781750924784217, "step": 5500}, {"loss": 0.2342, "grad_norm": 1.0968598127365112, "learning_rate": 0.0002, "epoch": 6.794081381011098, "step": 5510}, {"loss": 0.2453, "grad_norm": 0.9448091983795166, "learning_rate": 0.0002, "epoch": 6.806411837237977, "step": 5520}, {"loss": 0.2609, "grad_norm": 1.400767207145691, "learning_rate": 0.0002, "epoch": 6.818742293464858, "step": 5530}, {"loss": 0.2642, "grad_norm": 1.1031112670898438, "learning_rate": 0.0002, "epoch": 6.831072749691739, "step": 5540}, {"loss": 0.2534, "grad_norm": 1.2436904907226562, "learning_rate": 0.0002, "epoch": 6.843403205918619, "step": 5550}, {"loss": 0.2601, "grad_norm": 1.0987974405288696, "learning_rate": 0.0002, "epoch": 6.855733662145499, "step": 5560}, {"loss": 0.2622, "grad_norm": 0.8656415939331055, "learning_rate": 0.0002, "epoch": 6.86806411837238, "step": 5570}, {"loss": 0.2585, "grad_norm": 1.2153927087783813, "learning_rate": 0.0002, "epoch": 6.8803945745992605, "step": 5580}, {"loss": 0.2888, "grad_norm": 1.111377477645874, "learning_rate": 0.0002, "epoch": 6.89272503082614, "step": 5590}, {"loss": 0.2569, "grad_norm": 1.0041896104812622, "learning_rate": 0.0002, "epoch": 6.905055487053021, "step": 5600}, {"loss": 0.2654, "grad_norm": 1.0638413429260254, "learning_rate": 0.0002, "epoch": 6.917385943279902, "step": 5610}, {"loss": 0.2364, "grad_norm": 0.9756764769554138, "learning_rate": 0.0002, "epoch": 6.929716399506781, "step": 5620}, {"loss": 0.2756, "grad_norm": 1.153550624847412, "learning_rate": 0.0002, "epoch": 6.942046855733662, "step": 5630}, {"loss": 0.2732, "grad_norm": 1.3393985033035278, "learning_rate": 0.0002, "epoch": 6.954377311960543, "step": 5640}, {"loss": 0.2793, "grad_norm": 1.3233463764190674, "learning_rate": 0.0002, "epoch": 6.9667077681874225, "step": 5650}, {"loss": 0.2593, "grad_norm": 1.1693105697631836, "learning_rate": 0.0002, "epoch": 6.979038224414303, "step": 5660}, {"loss": 0.278, "grad_norm": 0.7186262607574463, "learning_rate": 0.0002, "epoch": 6.991368680641184, "step": 5670}]} +{"epoch": 8.0, "step": 6488, "epoch_duration": 2527.4715588092804, "total_accumulated_duration": 19626.680991649628, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19860.224609375}, "avg_memory_reserved": {"GPU_0": 28774.0}, "peak_memory_reserved": {"GPU_0": 28774.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-5885-sd-10000/checkpoint-1622", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.3601, "grad_norm": 0.8258164525032043, "learning_rate": 0.0002, "epoch": 0.012330456226880395, "step": 10}, {"loss": 1.7317, "grad_norm": 0.4577729105949402, "learning_rate": 0.0002, "epoch": 0.02466091245376079, "step": 20}, {"loss": 1.4729, "grad_norm": 0.639807939529419, "learning_rate": 0.0002, "epoch": 0.036991368680641186, "step": 30}, {"loss": 1.2088, "grad_norm": 0.5311757922172546, "learning_rate": 0.0002, "epoch": 0.04932182490752158, "step": 40}, {"loss": 1.3149, "grad_norm": 0.386595219373703, "learning_rate": 0.0002, "epoch": 0.06165228113440197, "step": 50}, {"loss": 1.1657, "grad_norm": 0.4401357173919678, "learning_rate": 0.0002, "epoch": 0.07398273736128237, "step": 60}, {"loss": 1.1022, "grad_norm": 0.3234352171421051, "learning_rate": 0.0002, "epoch": 0.08631319358816276, "step": 70}, {"loss": 1.3738, "grad_norm": 0.29643672704696655, "learning_rate": 0.0002, "epoch": 0.09864364981504316, "step": 80}, {"loss": 1.1929, "grad_norm": 0.2941012382507324, "learning_rate": 0.0002, "epoch": 0.11097410604192355, "step": 90}, {"loss": 1.2067, "grad_norm": 0.5498173832893372, "learning_rate": 0.0002, "epoch": 0.12330456226880394, "step": 100}, {"loss": 1.142, "grad_norm": 0.2545783519744873, "learning_rate": 0.0002, "epoch": 0.13563501849568435, "step": 110}, {"loss": 1.0297, "grad_norm": 0.2984241247177124, "learning_rate": 0.0002, "epoch": 0.14796547472256474, "step": 120}, {"loss": 1.1591, "grad_norm": 0.2710968852043152, "learning_rate": 0.0002, "epoch": 0.16029593094944514, "step": 130}, {"loss": 1.0427, "grad_norm": 0.2817152142524719, "learning_rate": 0.0002, "epoch": 0.17262638717632553, "step": 140}, {"loss": 1.0638, "grad_norm": 0.41083765029907227, "learning_rate": 0.0002, "epoch": 0.18495684340320592, "step": 150}, {"loss": 1.3537, "grad_norm": 0.36536213755607605, "learning_rate": 0.0002, "epoch": 0.19728729963008632, "step": 160}, {"loss": 1.0132, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "epoch": 0.2096177558569667, "step": 170}, {"loss": 0.9555, "grad_norm": 0.27403146028518677, "learning_rate": 0.0002, "epoch": 0.2219482120838471, "step": 180}, {"loss": 1.0769, "grad_norm": 0.4446810483932495, "learning_rate": 0.0002, "epoch": 0.2342786683107275, "step": 190}, {"loss": 1.0588, "grad_norm": 0.5295385718345642, "learning_rate": 0.0002, "epoch": 0.2466091245376079, "step": 200}, {"loss": 1.0827, "grad_norm": 0.311404824256897, "learning_rate": 0.0002, "epoch": 0.2589395807644883, "step": 210}, {"loss": 1.1455, "grad_norm": 0.2448509782552719, "learning_rate": 0.0002, "epoch": 0.2712700369913687, "step": 220}, {"loss": 1.0929, "grad_norm": 0.6507014036178589, "learning_rate": 0.0002, "epoch": 0.2836004932182491, "step": 230}, {"loss": 0.9875, "grad_norm": 0.2339320331811905, "learning_rate": 0.0002, "epoch": 0.2959309494451295, "step": 240}, {"loss": 0.9211, "grad_norm": 0.8210226893424988, "learning_rate": 0.0002, "epoch": 0.3082614056720099, "step": 250}, {"loss": 1.161, "grad_norm": 0.27473965287208557, "learning_rate": 0.0002, "epoch": 0.3205918618988903, "step": 260}, {"loss": 1.0218, "grad_norm": 0.3051395118236542, "learning_rate": 0.0002, "epoch": 0.33292231812577067, "step": 270}, {"loss": 1.0286, "grad_norm": 0.3037777245044708, "learning_rate": 0.0002, "epoch": 0.34525277435265106, "step": 280}, {"loss": 1.144, "grad_norm": 0.2748974859714508, "learning_rate": 0.0002, "epoch": 0.35758323057953145, "step": 290}, {"loss": 1.0858, "grad_norm": 0.23656068742275238, "learning_rate": 0.0002, "epoch": 0.36991368680641185, "step": 300}, {"loss": 1.0564, "grad_norm": 0.2523384094238281, "learning_rate": 0.0002, "epoch": 0.38224414303329224, "step": 310}, {"loss": 1.3091, "grad_norm": 0.27848055958747864, "learning_rate": 0.0002, "epoch": 0.39457459926017263, "step": 320}, {"loss": 1.0252, "grad_norm": 0.3204525411128998, "learning_rate": 0.0002, "epoch": 0.406905055487053, "step": 330}, {"loss": 0.9855, "grad_norm": 0.3459707498550415, "learning_rate": 0.0002, "epoch": 0.4192355117139334, "step": 340}, {"loss": 1.1111, "grad_norm": 0.2458430379629135, "learning_rate": 0.0002, "epoch": 0.4315659679408138, "step": 350}, {"loss": 1.1493, "grad_norm": 0.5022910237312317, "learning_rate": 0.0002, "epoch": 0.4438964241676942, "step": 360}, {"loss": 1.1272, "grad_norm": 0.27076372504234314, "learning_rate": 0.0002, "epoch": 0.4562268803945746, "step": 370}, {"loss": 1.1927, "grad_norm": 0.6489047408103943, "learning_rate": 0.0002, "epoch": 0.468557336621455, "step": 380}, {"loss": 0.9501, "grad_norm": 0.3324144184589386, "learning_rate": 0.0002, "epoch": 0.4808877928483354, "step": 390}, {"loss": 1.2012, "grad_norm": 0.32813116908073425, "learning_rate": 0.0002, "epoch": 0.4932182490752158, "step": 400}, {"loss": 1.1135, "grad_norm": 0.25295355916023254, "learning_rate": 0.0002, "epoch": 0.5055487053020962, "step": 410}, {"loss": 0.9477, "grad_norm": 0.2912578880786896, "learning_rate": 0.0002, "epoch": 0.5178791615289766, "step": 420}, {"loss": 1.0121, "grad_norm": 0.34780189394950867, "learning_rate": 0.0002, "epoch": 0.530209617755857, "step": 430}, {"loss": 0.9296, "grad_norm": 0.24604526162147522, "learning_rate": 0.0002, "epoch": 0.5425400739827374, "step": 440}, {"loss": 1.253, "grad_norm": 0.32759982347488403, "learning_rate": 0.0002, "epoch": 0.5548705302096177, "step": 450}, {"loss": 1.1925, "grad_norm": 0.40810221433639526, "learning_rate": 0.0002, "epoch": 0.5672009864364982, "step": 460}, {"loss": 1.174, "grad_norm": 0.3590679466724396, "learning_rate": 0.0002, "epoch": 0.5795314426633785, "step": 470}, {"loss": 1.2223, "grad_norm": 0.5656213760375977, "learning_rate": 0.0002, "epoch": 0.591861898890259, "step": 480}, {"loss": 1.1936, "grad_norm": 0.30830657482147217, "learning_rate": 0.0002, "epoch": 0.6041923551171393, "step": 490}, {"loss": 1.1873, "grad_norm": 0.317905455827713, "learning_rate": 0.0002, "epoch": 0.6165228113440198, "step": 500}, {"loss": 0.9805, "grad_norm": 0.3254566490650177, "learning_rate": 0.0002, "epoch": 0.6288532675709001, "step": 510}, {"loss": 1.0384, "grad_norm": 0.29187721014022827, "learning_rate": 0.0002, "epoch": 0.6411837237977805, "step": 520}, {"loss": 1.2526, "grad_norm": 0.3439238965511322, "learning_rate": 0.0002, "epoch": 0.6535141800246609, "step": 530}, {"loss": 1.0698, "grad_norm": 0.20970556139945984, "learning_rate": 0.0002, "epoch": 0.6658446362515413, "step": 540}, {"loss": 1.1047, "grad_norm": 0.4022853374481201, "learning_rate": 0.0002, "epoch": 0.6781750924784217, "step": 550}, {"loss": 1.0684, "grad_norm": 0.2235759049654007, "learning_rate": 0.0002, "epoch": 0.6905055487053021, "step": 560}, {"loss": 1.2339, "grad_norm": 0.33849895000457764, "learning_rate": 0.0002, "epoch": 0.7028360049321825, "step": 570}, {"loss": 1.1929, "grad_norm": 0.34745967388153076, "learning_rate": 0.0002, "epoch": 0.7151664611590629, "step": 580}, {"loss": 1.1158, "grad_norm": 0.26041269302368164, "learning_rate": 0.0002, "epoch": 0.7274969173859432, "step": 590}, {"loss": 1.2134, "grad_norm": 0.3804777264595032, "learning_rate": 0.0002, "epoch": 0.7398273736128237, "step": 600}, {"loss": 1.0606, "grad_norm": 0.2456253319978714, "learning_rate": 0.0002, "epoch": 0.752157829839704, "step": 610}, {"loss": 1.0638, "grad_norm": 0.37838423252105713, "learning_rate": 0.0002, "epoch": 0.7644882860665845, "step": 620}, {"loss": 1.0556, "grad_norm": 0.28105494379997253, "learning_rate": 0.0002, "epoch": 0.7768187422934648, "step": 630}, {"loss": 1.0672, "grad_norm": 0.2774018943309784, "learning_rate": 0.0002, "epoch": 0.7891491985203453, "step": 640}, {"loss": 0.9978, "grad_norm": 1.8184229135513306, "learning_rate": 0.0002, "epoch": 0.8014796547472256, "step": 650}, {"loss": 1.1038, "grad_norm": 0.3325096070766449, "learning_rate": 0.0002, "epoch": 0.813810110974106, "step": 660}, {"loss": 1.083, "grad_norm": 0.2686693072319031, "learning_rate": 0.0002, "epoch": 0.8261405672009864, "step": 670}, {"loss": 1.1308, "grad_norm": 0.3271431624889374, "learning_rate": 0.0002, "epoch": 0.8384710234278668, "step": 680}, {"loss": 1.1116, "grad_norm": 2.359999656677246, "learning_rate": 0.0002, "epoch": 0.8508014796547472, "step": 690}, {"loss": 1.0782, "grad_norm": 0.46242964267730713, "learning_rate": 0.0002, "epoch": 0.8631319358816276, "step": 700}, {"loss": 0.95, "grad_norm": 0.34731170535087585, "learning_rate": 0.0002, "epoch": 0.8754623921085081, "step": 710}, {"loss": 1.2236, "grad_norm": 0.39381715655326843, "learning_rate": 0.0002, "epoch": 0.8877928483353884, "step": 720}, {"loss": 1.1319, "grad_norm": 0.43496373295783997, "learning_rate": 0.0002, "epoch": 0.9001233045622689, "step": 730}, {"loss": 1.0979, "grad_norm": 0.32243210077285767, "learning_rate": 0.0002, "epoch": 0.9124537607891492, "step": 740}, {"loss": 0.9913, "grad_norm": 0.30396756529808044, "learning_rate": 0.0002, "epoch": 0.9247842170160296, "step": 750}, {"loss": 1.1141, "grad_norm": 0.4461122751235962, "learning_rate": 0.0002, "epoch": 0.93711467324291, "step": 760}, {"loss": 1.0049, "grad_norm": 0.24081681668758392, "learning_rate": 0.0002, "epoch": 0.9494451294697904, "step": 770}, {"loss": 1.0966, "grad_norm": 0.27461910247802734, "learning_rate": 0.0002, "epoch": 0.9617755856966708, "step": 780}, {"loss": 0.9942, "grad_norm": 0.3325668275356293, "learning_rate": 0.0002, "epoch": 0.9741060419235512, "step": 790}, {"loss": 1.0506, "grad_norm": 0.24046339094638824, "learning_rate": 0.0002, "epoch": 0.9864364981504316, "step": 800}, {"loss": 0.9989, "grad_norm": 0.42950066924095154, "learning_rate": 0.0002, "epoch": 0.998766954377312, "step": 810}, {"eval_loss": 1.246457576751709, "eval_runtime": 98.7974, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.557, "epoch": 1.0, "step": 811}, {"loss": 1.0536, "grad_norm": 0.26760655641555786, "learning_rate": 0.0002, "epoch": 1.0110974106041923, "step": 820}, {"loss": 0.9722, "grad_norm": 0.4640820026397705, "learning_rate": 0.0002, "epoch": 1.0234278668310728, "step": 830}, {"loss": 0.9392, "grad_norm": 0.2699166238307953, "learning_rate": 0.0002, "epoch": 1.0357583230579532, "step": 840}, {"loss": 0.9335, "grad_norm": 0.3441709578037262, "learning_rate": 0.0002, "epoch": 1.0480887792848335, "step": 850}, {"loss": 0.9083, "grad_norm": 0.299934983253479, "learning_rate": 0.0002, "epoch": 1.060419235511714, "step": 860}, {"loss": 0.9416, "grad_norm": 0.2980666160583496, "learning_rate": 0.0002, "epoch": 1.0727496917385944, "step": 870}, {"loss": 0.94, "grad_norm": 0.3131714463233948, "learning_rate": 0.0002, "epoch": 1.0850801479654748, "step": 880}, {"loss": 0.9288, "grad_norm": 0.29881617426872253, "learning_rate": 0.0002, "epoch": 1.097410604192355, "step": 890}, {"loss": 0.998, "grad_norm": 0.29870888590812683, "learning_rate": 0.0002, "epoch": 1.1097410604192355, "step": 900}, {"loss": 0.9924, "grad_norm": 0.5735140442848206, "learning_rate": 0.0002, "epoch": 1.122071516646116, "step": 910}, {"loss": 1.0694, "grad_norm": 0.33159002661705017, "learning_rate": 0.0002, "epoch": 1.1344019728729964, "step": 920}, {"loss": 1.0069, "grad_norm": 1.235399842262268, "learning_rate": 0.0002, "epoch": 1.1467324290998766, "step": 930}, {"loss": 1.0315, "grad_norm": 0.27469736337661743, "learning_rate": 0.0002, "epoch": 1.159062885326757, "step": 940}, {"loss": 0.9386, "grad_norm": 0.29130664467811584, "learning_rate": 0.0002, "epoch": 1.1713933415536375, "step": 950}, {"loss": 0.8919, "grad_norm": 0.3730354607105255, "learning_rate": 0.0002, "epoch": 1.183723797780518, "step": 960}, {"loss": 0.9988, "grad_norm": 0.5973590612411499, "learning_rate": 0.0002, "epoch": 1.1960542540073984, "step": 970}, {"loss": 0.9525, "grad_norm": 0.39631304144859314, "learning_rate": 0.0002, "epoch": 1.2083847102342786, "step": 980}, {"loss": 0.9217, "grad_norm": 0.849051296710968, "learning_rate": 0.0002, "epoch": 1.220715166461159, "step": 990}, {"loss": 1.0903, "grad_norm": 0.4390525817871094, "learning_rate": 0.0002, "epoch": 1.2330456226880395, "step": 1000}, {"loss": 0.9018, "grad_norm": 0.30423852801322937, "learning_rate": 0.0002, "epoch": 1.2453760789149197, "step": 1010}, {"loss": 1.0128, "grad_norm": 0.34736061096191406, "learning_rate": 0.0002, "epoch": 1.2577065351418002, "step": 1020}, {"loss": 0.9026, "grad_norm": 0.3421604037284851, "learning_rate": 0.0002, "epoch": 1.2700369913686806, "step": 1030}, {"loss": 0.8485, "grad_norm": 0.544170081615448, "learning_rate": 0.0002, "epoch": 1.282367447595561, "step": 1040}, {"loss": 0.9591, "grad_norm": 0.5128790736198425, "learning_rate": 0.0002, "epoch": 1.2946979038224415, "step": 1050}, {"loss": 0.9214, "grad_norm": 0.443344384431839, "learning_rate": 0.0002, "epoch": 1.3070283600493218, "step": 1060}, {"loss": 0.9367, "grad_norm": 0.6380868554115295, "learning_rate": 0.0002, "epoch": 1.3193588162762022, "step": 1070}, {"loss": 0.9849, "grad_norm": 0.4638073146343231, "learning_rate": 0.0002, "epoch": 1.3316892725030827, "step": 1080}, {"loss": 0.8645, "grad_norm": 0.32406893372535706, "learning_rate": 0.0002, "epoch": 1.344019728729963, "step": 1090}, {"loss": 0.8278, "grad_norm": 0.3955065608024597, "learning_rate": 0.0002, "epoch": 1.3563501849568433, "step": 1100}, {"loss": 0.9306, "grad_norm": 0.3489246666431427, "learning_rate": 0.0002, "epoch": 1.3686806411837238, "step": 1110}, {"loss": 1.0138, "grad_norm": 0.48451653122901917, "learning_rate": 0.0002, "epoch": 1.3810110974106042, "step": 1120}, {"loss": 0.9165, "grad_norm": 0.3652360439300537, "learning_rate": 0.0002, "epoch": 1.3933415536374847, "step": 1130}, {"loss": 0.9576, "grad_norm": 1.3097436428070068, "learning_rate": 0.0002, "epoch": 1.405672009864365, "step": 1140}, {"loss": 0.8115, "grad_norm": 0.3647715449333191, "learning_rate": 0.0002, "epoch": 1.4180024660912454, "step": 1150}, {"loss": 0.8573, "grad_norm": 0.37248560786247253, "learning_rate": 0.0002, "epoch": 1.4303329223181258, "step": 1160}, {"loss": 0.936, "grad_norm": 0.4639643430709839, "learning_rate": 0.0002, "epoch": 1.442663378545006, "step": 1170}, {"loss": 0.9511, "grad_norm": 0.5455219745635986, "learning_rate": 0.0002, "epoch": 1.4549938347718865, "step": 1180}, {"loss": 0.8611, "grad_norm": 0.38862571120262146, "learning_rate": 0.0002, "epoch": 1.467324290998767, "step": 1190}, {"loss": 0.8681, "grad_norm": 0.37586215138435364, "learning_rate": 0.0002, "epoch": 1.4796547472256474, "step": 1200}, {"loss": 0.8673, "grad_norm": 0.46244436502456665, "learning_rate": 0.0002, "epoch": 1.4919852034525278, "step": 1210}, {"loss": 0.9388, "grad_norm": 0.3570359945297241, "learning_rate": 0.0002, "epoch": 1.504315659679408, "step": 1220}, {"loss": 0.971, "grad_norm": 0.28393083810806274, "learning_rate": 0.0002, "epoch": 1.5166461159062885, "step": 1230}, {"loss": 0.9296, "grad_norm": 0.5672869682312012, "learning_rate": 0.0002, "epoch": 1.528976572133169, "step": 1240}, {"loss": 0.8787, "grad_norm": 0.41605108976364136, "learning_rate": 0.0002, "epoch": 1.5413070283600492, "step": 1250}, {"loss": 0.8744, "grad_norm": 0.40657493472099304, "learning_rate": 0.0002, "epoch": 1.5536374845869299, "step": 1260}, {"loss": 0.9046, "grad_norm": 0.43672341108322144, "learning_rate": 0.0002, "epoch": 1.56596794081381, "step": 1270}, {"loss": 0.8586, "grad_norm": 0.3065410554409027, "learning_rate": 0.0002, "epoch": 1.5782983970406905, "step": 1280}, {"loss": 0.9499, "grad_norm": 0.37826645374298096, "learning_rate": 0.0002, "epoch": 1.590628853267571, "step": 1290}, {"loss": 0.901, "grad_norm": 0.42307335138320923, "learning_rate": 0.0002, "epoch": 1.6029593094944512, "step": 1300}, {"loss": 0.8673, "grad_norm": 0.3648843467235565, "learning_rate": 0.0002, "epoch": 1.6152897657213316, "step": 1310}, {"loss": 0.9302, "grad_norm": 0.8921076059341431, "learning_rate": 0.0002, "epoch": 1.627620221948212, "step": 1320}, {"loss": 0.9378, "grad_norm": 0.37522226572036743, "learning_rate": 0.0002, "epoch": 1.6399506781750923, "step": 1330}, {"loss": 0.8921, "grad_norm": 0.7489957809448242, "learning_rate": 0.0002, "epoch": 1.652281134401973, "step": 1340}, {"loss": 0.9297, "grad_norm": 0.31733131408691406, "learning_rate": 0.0002, "epoch": 1.6646115906288532, "step": 1350}, {"loss": 0.907, "grad_norm": 0.3249478340148926, "learning_rate": 0.0002, "epoch": 1.6769420468557337, "step": 1360}, {"loss": 1.0197, "grad_norm": 0.3178001344203949, "learning_rate": 0.0002, "epoch": 1.6892725030826141, "step": 1370}, {"loss": 1.0781, "grad_norm": 0.5674093961715698, "learning_rate": 0.0002, "epoch": 1.7016029593094943, "step": 1380}, {"loss": 0.8972, "grad_norm": 0.35272449254989624, "learning_rate": 0.0002, "epoch": 1.7139334155363748, "step": 1390}, {"loss": 0.9346, "grad_norm": 0.5778217911720276, "learning_rate": 0.0002, "epoch": 1.7262638717632552, "step": 1400}, {"loss": 0.9099, "grad_norm": 0.33561450242996216, "learning_rate": 0.0002, "epoch": 1.7385943279901355, "step": 1410}, {"loss": 0.8636, "grad_norm": 0.31735464930534363, "learning_rate": 0.0002, "epoch": 1.7509247842170161, "step": 1420}, {"loss": 0.982, "grad_norm": 1.0612670183181763, "learning_rate": 0.0002, "epoch": 1.7632552404438964, "step": 1430}, {"loss": 0.8224, "grad_norm": 0.5442509651184082, "learning_rate": 0.0002, "epoch": 1.7755856966707768, "step": 1440}, {"loss": 0.9275, "grad_norm": 0.7471332550048828, "learning_rate": 0.0002, "epoch": 1.7879161528976573, "step": 1450}, {"loss": 0.9389, "grad_norm": 0.4323609173297882, "learning_rate": 0.0002, "epoch": 1.8002466091245375, "step": 1460}, {"loss": 0.8247, "grad_norm": 0.47796759009361267, "learning_rate": 0.0002, "epoch": 1.8125770653514182, "step": 1470}, {"loss": 0.9395, "grad_norm": 0.3348400592803955, "learning_rate": 0.0002, "epoch": 1.8249075215782984, "step": 1480}, {"loss": 0.9793, "grad_norm": 0.3354550898075104, "learning_rate": 0.0002, "epoch": 1.8372379778051788, "step": 1490}, {"loss": 0.8581, "grad_norm": 0.5988477468490601, "learning_rate": 0.0002, "epoch": 1.8495684340320593, "step": 1500}, {"loss": 0.9268, "grad_norm": 0.5222318172454834, "learning_rate": 0.0002, "epoch": 1.8618988902589395, "step": 1510}, {"loss": 0.8846, "grad_norm": 0.5246642827987671, "learning_rate": 0.0002, "epoch": 1.87422934648582, "step": 1520}, {"loss": 0.9317, "grad_norm": 0.3164594769477844, "learning_rate": 0.0002, "epoch": 1.8865598027127004, "step": 1530}, {"loss": 0.9961, "grad_norm": 0.3496174216270447, "learning_rate": 0.0002, "epoch": 1.8988902589395806, "step": 1540}, {"loss": 0.9057, "grad_norm": 0.8863359689712524, "learning_rate": 0.0002, "epoch": 1.9112207151664613, "step": 1550}, {"loss": 0.9405, "grad_norm": 0.3587026298046112, "learning_rate": 0.0002, "epoch": 1.9235511713933415, "step": 1560}, {"loss": 0.8335, "grad_norm": 0.6052881479263306, "learning_rate": 0.0002, "epoch": 1.935881627620222, "step": 1570}, {"loss": 0.8805, "grad_norm": 0.567269504070282, "learning_rate": 0.0002, "epoch": 1.9482120838471024, "step": 1580}, {"loss": 0.9581, "grad_norm": 0.45184487104415894, "learning_rate": 0.0002, "epoch": 1.9605425400739827, "step": 1590}, {"loss": 0.9147, "grad_norm": 0.5028569102287292, "learning_rate": 0.0002, "epoch": 1.972872996300863, "step": 1600}, {"loss": 0.75, "grad_norm": 0.4677547216415405, "learning_rate": 0.0002, "epoch": 1.9852034525277436, "step": 1610}, {"loss": 0.8469, "grad_norm": 0.35106056928634644, "learning_rate": 0.0002, "epoch": 1.9975339087546238, "step": 1620}, {"eval_loss": 1.238026738166809, "eval_runtime": 95.4287, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.576, "epoch": 2.0, "step": 1622}, {"loss": 0.7629, "grad_norm": 0.444060355424881, "learning_rate": 0.0002, "epoch": 2.0098643649815044, "step": 1630}, {"loss": 0.772, "grad_norm": 0.627570390701294, "learning_rate": 0.0002, "epoch": 2.0221948212083847, "step": 1640}, {"loss": 0.6186, "grad_norm": 0.38737839460372925, "learning_rate": 0.0002, "epoch": 2.034525277435265, "step": 1650}, {"loss": 0.7734, "grad_norm": 0.4300459623336792, "learning_rate": 0.0002, "epoch": 2.0468557336621456, "step": 1660}, {"loss": 0.6943, "grad_norm": 0.43037715554237366, "learning_rate": 0.0002, "epoch": 2.059186189889026, "step": 1670}, {"loss": 0.6588, "grad_norm": 0.40772515535354614, "learning_rate": 0.0002, "epoch": 2.0715166461159065, "step": 1680}, {"loss": 0.8105, "grad_norm": 0.5295451879501343, "learning_rate": 0.0002, "epoch": 2.0838471023427867, "step": 1690}, {"loss": 0.7717, "grad_norm": 0.7452750205993652, "learning_rate": 0.0002, "epoch": 2.096177558569667, "step": 1700}, {"loss": 0.7458, "grad_norm": 0.809183657169342, "learning_rate": 0.0002, "epoch": 2.1085080147965476, "step": 1710}, {"loss": 0.7398, "grad_norm": 0.4597688913345337, "learning_rate": 0.0002, "epoch": 2.120838471023428, "step": 1720}, {"loss": 0.6856, "grad_norm": 0.806919276714325, "learning_rate": 0.0002, "epoch": 2.133168927250308, "step": 1730}, {"loss": 0.679, "grad_norm": 0.3755643665790558, "learning_rate": 0.0002, "epoch": 2.1454993834771887, "step": 1740}, {"loss": 0.7938, "grad_norm": 0.5882734060287476, "learning_rate": 0.0002, "epoch": 2.157829839704069, "step": 1750}, {"loss": 0.6782, "grad_norm": 0.692960798740387, "learning_rate": 0.0002, "epoch": 2.1701602959309496, "step": 1760}, {"loss": 0.7195, "grad_norm": 0.4737096428871155, "learning_rate": 0.0002, "epoch": 2.18249075215783, "step": 1770}, {"loss": 0.758, "grad_norm": 0.6637021899223328, "learning_rate": 0.0002, "epoch": 2.19482120838471, "step": 1780}, {"loss": 0.7139, "grad_norm": 0.9109764099121094, "learning_rate": 0.0002, "epoch": 2.2071516646115907, "step": 1790}, {"loss": 0.7373, "grad_norm": 0.4137539267539978, "learning_rate": 0.0002, "epoch": 2.219482120838471, "step": 1800}, {"loss": 0.7266, "grad_norm": 0.44995415210723877, "learning_rate": 0.0002, "epoch": 2.2318125770653516, "step": 1810}, {"loss": 0.7663, "grad_norm": 0.5985036492347717, "learning_rate": 0.0002, "epoch": 2.244143033292232, "step": 1820}, {"loss": 0.7502, "grad_norm": 0.7549490332603455, "learning_rate": 0.0002, "epoch": 2.256473489519112, "step": 1830}, {"loss": 0.7452, "grad_norm": 0.4490937888622284, "learning_rate": 0.0002, "epoch": 2.2688039457459928, "step": 1840}, {"loss": 0.7531, "grad_norm": 0.38859808444976807, "learning_rate": 0.0002, "epoch": 2.281134401972873, "step": 1850}, {"loss": 0.7278, "grad_norm": 1.0704916715621948, "learning_rate": 0.0002, "epoch": 2.293464858199753, "step": 1860}, {"loss": 0.7143, "grad_norm": 0.4647100865840912, "learning_rate": 0.0002, "epoch": 2.305795314426634, "step": 1870}, {"loss": 0.7146, "grad_norm": 0.6181163787841797, "learning_rate": 0.0002, "epoch": 2.318125770653514, "step": 1880}, {"loss": 0.7689, "grad_norm": 0.9241904020309448, "learning_rate": 0.0002, "epoch": 2.3304562268803943, "step": 1890}, {"loss": 0.7294, "grad_norm": 0.39101317524909973, "learning_rate": 0.0002, "epoch": 2.342786683107275, "step": 1900}, {"loss": 0.7079, "grad_norm": 0.49442458152770996, "learning_rate": 0.0002, "epoch": 2.3551171393341552, "step": 1910}, {"loss": 0.7586, "grad_norm": 0.4864824414253235, "learning_rate": 0.0002, "epoch": 2.367447595561036, "step": 1920}, {"loss": 0.7434, "grad_norm": 0.5427613854408264, "learning_rate": 0.0002, "epoch": 2.379778051787916, "step": 1930}, {"loss": 0.8423, "grad_norm": 0.7164974808692932, "learning_rate": 0.0002, "epoch": 2.392108508014797, "step": 1940}, {"loss": 0.6888, "grad_norm": 0.562979519367218, "learning_rate": 0.0002, "epoch": 2.404438964241677, "step": 1950}, {"loss": 0.7692, "grad_norm": 0.5631861090660095, "learning_rate": 0.0002, "epoch": 2.4167694204685573, "step": 1960}, {"loss": 0.67, "grad_norm": 0.4895121157169342, "learning_rate": 0.0002, "epoch": 2.429099876695438, "step": 1970}, {"loss": 0.7735, "grad_norm": 0.45674824714660645, "learning_rate": 0.0002, "epoch": 2.441430332922318, "step": 1980}, {"loss": 0.685, "grad_norm": 1.1424206495285034, "learning_rate": 0.0002, "epoch": 2.4537607891491984, "step": 1990}, {"loss": 0.7627, "grad_norm": 0.6314579844474792, "learning_rate": 0.0002, "epoch": 2.466091245376079, "step": 2000}, {"loss": 0.7118, "grad_norm": 0.5481605529785156, "learning_rate": 0.0002, "epoch": 2.4784217016029593, "step": 2010}, {"loss": 0.6947, "grad_norm": 0.4671579599380493, "learning_rate": 0.0002, "epoch": 2.4907521578298395, "step": 2020}, {"loss": 0.7377, "grad_norm": 0.7621194124221802, "learning_rate": 0.0002, "epoch": 2.50308261405672, "step": 2030}, {"loss": 0.69, "grad_norm": 0.38983288407325745, "learning_rate": 0.0002, "epoch": 2.5154130702836004, "step": 2040}, {"loss": 0.8381, "grad_norm": 0.6341150999069214, "learning_rate": 0.0002, "epoch": 2.5277435265104806, "step": 2050}, {"loss": 0.773, "grad_norm": 0.7151971459388733, "learning_rate": 0.0002, "epoch": 2.5400739827373613, "step": 2060}, {"loss": 0.6733, "grad_norm": 0.9665895104408264, "learning_rate": 0.0002, "epoch": 2.5524044389642415, "step": 2070}, {"loss": 0.7791, "grad_norm": 0.9572727680206299, "learning_rate": 0.0002, "epoch": 2.564734895191122, "step": 2080}, {"loss": 0.7205, "grad_norm": 1.1970765590667725, "learning_rate": 0.0002, "epoch": 2.5770653514180024, "step": 2090}, {"loss": 0.6736, "grad_norm": 0.5505942702293396, "learning_rate": 0.0002, "epoch": 2.589395807644883, "step": 2100}, {"loss": 0.673, "grad_norm": 0.5903949737548828, "learning_rate": 0.0002, "epoch": 2.6017262638717633, "step": 2110}, {"loss": 0.678, "grad_norm": 0.45640307664871216, "learning_rate": 0.0002, "epoch": 2.6140567200986435, "step": 2120}, {"loss": 0.6562, "grad_norm": 0.8763944506645203, "learning_rate": 0.0002, "epoch": 2.626387176325524, "step": 2130}, {"loss": 0.6484, "grad_norm": 0.4472963213920593, "learning_rate": 0.0002, "epoch": 2.6387176325524044, "step": 2140}, {"loss": 0.7702, "grad_norm": 0.5335086584091187, "learning_rate": 0.0002, "epoch": 2.6510480887792847, "step": 2150}, {"loss": 0.6851, "grad_norm": 0.805263340473175, "learning_rate": 0.0002, "epoch": 2.6633785450061653, "step": 2160}, {"loss": 0.7026, "grad_norm": 0.6332727670669556, "learning_rate": 0.0002, "epoch": 2.6757090012330456, "step": 2170}, {"loss": 0.7925, "grad_norm": 0.8667435646057129, "learning_rate": 0.0002, "epoch": 2.688039457459926, "step": 2180}, {"loss": 0.8044, "grad_norm": 0.5638955235481262, "learning_rate": 0.0002, "epoch": 2.7003699136868065, "step": 2190}, {"loss": 0.7117, "grad_norm": 0.4176250696182251, "learning_rate": 0.0002, "epoch": 2.7127003699136867, "step": 2200}, {"loss": 0.6932, "grad_norm": 0.6013461351394653, "learning_rate": 0.0002, "epoch": 2.7250308261405674, "step": 2210}, {"loss": 0.7843, "grad_norm": 0.553961992263794, "learning_rate": 0.0002, "epoch": 2.7373612823674476, "step": 2220}, {"loss": 0.8633, "grad_norm": 0.4710180461406708, "learning_rate": 0.0002, "epoch": 2.7496917385943282, "step": 2230}, {"loss": 0.7469, "grad_norm": 0.8141706585884094, "learning_rate": 0.0002, "epoch": 2.7620221948212085, "step": 2240}, {"loss": 0.7086, "grad_norm": 0.7449556589126587, "learning_rate": 0.0002, "epoch": 2.7743526510480887, "step": 2250}, {"loss": 0.6933, "grad_norm": 0.5366780757904053, "learning_rate": 0.0002, "epoch": 2.7866831072749694, "step": 2260}, {"loss": 0.7192, "grad_norm": 0.5316720604896545, "learning_rate": 0.0002, "epoch": 2.7990135635018496, "step": 2270}, {"loss": 0.6212, "grad_norm": 0.4598459005355835, "learning_rate": 0.0002, "epoch": 2.81134401972873, "step": 2280}, {"loss": 0.7024, "grad_norm": 0.6852091550827026, "learning_rate": 0.0002, "epoch": 2.8236744759556105, "step": 2290}, {"loss": 0.7357, "grad_norm": 0.8040902018547058, "learning_rate": 0.0002, "epoch": 2.8360049321824907, "step": 2300}, {"loss": 0.7563, "grad_norm": 0.46976321935653687, "learning_rate": 0.0002, "epoch": 2.848335388409371, "step": 2310}, {"loss": 0.731, "grad_norm": 0.5214090347290039, "learning_rate": 0.0002, "epoch": 2.8606658446362516, "step": 2320}, {"loss": 0.6687, "grad_norm": 0.5323054790496826, "learning_rate": 0.0002, "epoch": 2.872996300863132, "step": 2330}, {"loss": 0.7895, "grad_norm": 0.6842264533042908, "learning_rate": 0.0002, "epoch": 2.885326757090012, "step": 2340}, {"loss": 0.7737, "grad_norm": 0.9157055616378784, "learning_rate": 0.0002, "epoch": 2.8976572133168927, "step": 2350}, {"loss": 0.7217, "grad_norm": 0.5253258347511292, "learning_rate": 0.0002, "epoch": 2.909987669543773, "step": 2360}, {"loss": 0.7162, "grad_norm": 0.4937705099582672, "learning_rate": 0.0002, "epoch": 2.9223181257706536, "step": 2370}, {"loss": 0.7008, "grad_norm": 0.48762989044189453, "learning_rate": 0.0002, "epoch": 2.934648581997534, "step": 2380}, {"loss": 0.8086, "grad_norm": 0.544335126876831, "learning_rate": 0.0002, "epoch": 2.9469790382244145, "step": 2390}, {"loss": 0.643, "grad_norm": 0.4847845435142517, "learning_rate": 0.0002, "epoch": 2.9593094944512948, "step": 2400}, {"loss": 0.7757, "grad_norm": 0.4787445366382599, "learning_rate": 0.0002, "epoch": 2.971639950678175, "step": 2410}, {"loss": 0.7678, "grad_norm": 1.022318959236145, "learning_rate": 0.0002, "epoch": 2.9839704069050557, "step": 2420}, {"loss": 0.6548, "grad_norm": 0.4987848103046417, "learning_rate": 0.0002, "epoch": 2.996300863131936, "step": 2430}, {"eval_loss": 1.2936296463012695, "eval_runtime": 94.7897, "eval_samples_per_second": 4.6, "eval_steps_per_second": 0.58, "epoch": 3.0, "step": 2433}, {"loss": 0.6073, "grad_norm": 0.5562372803688049, "learning_rate": 0.0002, "epoch": 3.008631319358816, "step": 2440}, {"loss": 0.5181, "grad_norm": 1.133402705192566, "learning_rate": 0.0002, "epoch": 3.020961775585697, "step": 2450}, {"loss": 0.5333, "grad_norm": 0.6480470299720764, "learning_rate": 0.0002, "epoch": 3.033292231812577, "step": 2460}, {"loss": 0.4828, "grad_norm": 0.8989138007164001, "learning_rate": 0.0002, "epoch": 3.0456226880394572, "step": 2470}, {"loss": 0.5097, "grad_norm": 0.8257461786270142, "learning_rate": 0.0002, "epoch": 3.057953144266338, "step": 2480}, {"loss": 0.6229, "grad_norm": 0.6813381910324097, "learning_rate": 0.0002, "epoch": 3.070283600493218, "step": 2490}, {"loss": 0.531, "grad_norm": 0.6989586353302002, "learning_rate": 0.0002, "epoch": 3.082614056720099, "step": 2500}, {"loss": 0.54, "grad_norm": 0.7992092967033386, "learning_rate": 0.0002, "epoch": 3.094944512946979, "step": 2510}, {"loss": 0.5054, "grad_norm": 0.698077917098999, "learning_rate": 0.0002, "epoch": 3.1072749691738593, "step": 2520}, {"loss": 0.5064, "grad_norm": 0.5699033141136169, "learning_rate": 0.0002, "epoch": 3.11960542540074, "step": 2530}, {"loss": 0.6088, "grad_norm": 0.6142355799674988, "learning_rate": 0.0002, "epoch": 3.13193588162762, "step": 2540}, {"loss": 0.585, "grad_norm": 0.7089933753013611, "learning_rate": 0.0002, "epoch": 3.144266337854501, "step": 2550}, {"loss": 0.5373, "grad_norm": 1.0107015371322632, "learning_rate": 0.0002, "epoch": 3.156596794081381, "step": 2560}, {"loss": 0.5429, "grad_norm": 0.568138837814331, "learning_rate": 0.0002, "epoch": 3.1689272503082613, "step": 2570}, {"loss": 0.5897, "grad_norm": 0.9960416555404663, "learning_rate": 0.0002, "epoch": 3.181257706535142, "step": 2580}, {"loss": 0.5211, "grad_norm": 0.6277595162391663, "learning_rate": 0.0002, "epoch": 3.193588162762022, "step": 2590}, {"loss": 0.5787, "grad_norm": 0.681083619594574, "learning_rate": 0.0002, "epoch": 3.2059186189889024, "step": 2600}, {"loss": 0.5166, "grad_norm": 0.5816057324409485, "learning_rate": 0.0002, "epoch": 3.218249075215783, "step": 2610}, {"loss": 0.545, "grad_norm": 0.7197734117507935, "learning_rate": 0.0002, "epoch": 3.2305795314426633, "step": 2620}, {"loss": 0.614, "grad_norm": 0.6524068117141724, "learning_rate": 0.0002, "epoch": 3.242909987669544, "step": 2630}, {"loss": 0.5456, "grad_norm": 1.273668646812439, "learning_rate": 0.0002, "epoch": 3.255240443896424, "step": 2640}, {"loss": 0.5266, "grad_norm": 0.6950451731681824, "learning_rate": 0.0002, "epoch": 3.2675709001233044, "step": 2650}, {"loss": 0.5194, "grad_norm": 0.8029071688652039, "learning_rate": 0.0002, "epoch": 3.279901356350185, "step": 2660}, {"loss": 0.5729, "grad_norm": 0.7464073896408081, "learning_rate": 0.0002, "epoch": 3.2922318125770653, "step": 2670}, {"loss": 0.5366, "grad_norm": 0.8342001438140869, "learning_rate": 0.0002, "epoch": 3.304562268803946, "step": 2680}, {"loss": 0.5413, "grad_norm": 0.5629868507385254, "learning_rate": 0.0002, "epoch": 3.316892725030826, "step": 2690}, {"loss": 0.633, "grad_norm": 0.753999650478363, "learning_rate": 0.0002, "epoch": 3.3292231812577064, "step": 2700}, {"loss": 0.5048, "grad_norm": 1.0271371603012085, "learning_rate": 0.0002, "epoch": 3.341553637484587, "step": 2710}, {"loss": 0.5233, "grad_norm": 0.9608535170555115, "learning_rate": 0.0002, "epoch": 3.3538840937114673, "step": 2720}, {"loss": 0.5102, "grad_norm": 0.7796488404273987, "learning_rate": 0.0002, "epoch": 3.3662145499383476, "step": 2730}, {"loss": 0.5172, "grad_norm": 0.5666437149047852, "learning_rate": 0.0002, "epoch": 3.3785450061652282, "step": 2740}, {"loss": 0.491, "grad_norm": 0.5462956428527832, "learning_rate": 0.0002, "epoch": 3.3908754623921085, "step": 2750}, {"loss": 0.5855, "grad_norm": 1.289099097251892, "learning_rate": 0.0002, "epoch": 3.4032059186189887, "step": 2760}, {"loss": 0.635, "grad_norm": 0.825566828250885, "learning_rate": 0.0002, "epoch": 3.4155363748458694, "step": 2770}, {"loss": 0.4998, "grad_norm": 0.8366670608520508, "learning_rate": 0.0002, "epoch": 3.4278668310727496, "step": 2780}, {"loss": 0.5732, "grad_norm": 1.0931549072265625, "learning_rate": 0.0002, "epoch": 3.4401972872996303, "step": 2790}, {"loss": 0.6093, "grad_norm": 0.9228858351707458, "learning_rate": 0.0002, "epoch": 3.4525277435265105, "step": 2800}, {"loss": 0.6089, "grad_norm": 1.3182806968688965, "learning_rate": 0.0002, "epoch": 3.4648581997533907, "step": 2810}, {"loss": 0.5665, "grad_norm": 0.8366976380348206, "learning_rate": 0.0002, "epoch": 3.4771886559802714, "step": 2820}, {"loss": 0.5666, "grad_norm": 0.8067695498466492, "learning_rate": 0.0002, "epoch": 3.4895191122071516, "step": 2830}, {"loss": 0.579, "grad_norm": 1.1163437366485596, "learning_rate": 0.0002, "epoch": 3.5018495684340323, "step": 2840}, {"loss": 0.5785, "grad_norm": 1.7196556329727173, "learning_rate": 0.0002, "epoch": 3.5141800246609125, "step": 2850}, {"loss": 0.5346, "grad_norm": 1.1267012357711792, "learning_rate": 0.0002, "epoch": 3.5265104808877927, "step": 2860}, {"loss": 0.447, "grad_norm": 0.7220137119293213, "learning_rate": 0.0002, "epoch": 3.5388409371146734, "step": 2870}, {"loss": 0.6099, "grad_norm": 0.914114773273468, "learning_rate": 0.0002, "epoch": 3.5511713933415536, "step": 2880}, {"loss": 0.6143, "grad_norm": 0.6193503141403198, "learning_rate": 0.0002, "epoch": 3.563501849568434, "step": 2890}, {"loss": 0.5171, "grad_norm": 0.6060135960578918, "learning_rate": 0.0002, "epoch": 3.5758323057953145, "step": 2900}, {"loss": 0.5659, "grad_norm": 1.0177327394485474, "learning_rate": 0.0002, "epoch": 3.5881627620221948, "step": 2910}, {"loss": 0.5711, "grad_norm": 0.5994468331336975, "learning_rate": 0.0002, "epoch": 3.600493218249075, "step": 2920}, {"loss": 0.6373, "grad_norm": 0.7450457215309143, "learning_rate": 0.0002, "epoch": 3.6128236744759556, "step": 2930}, {"loss": 0.4933, "grad_norm": 0.5825870037078857, "learning_rate": 0.0002, "epoch": 3.625154130702836, "step": 2940}, {"loss": 0.6016, "grad_norm": 0.6289743781089783, "learning_rate": 0.0002, "epoch": 3.6374845869297165, "step": 2950}, {"loss": 0.5507, "grad_norm": 0.7801929116249084, "learning_rate": 0.0002, "epoch": 3.6498150431565968, "step": 2960}, {"loss": 0.5695, "grad_norm": 1.1206634044647217, "learning_rate": 0.0002, "epoch": 3.6621454993834774, "step": 2970}, {"loss": 0.4985, "grad_norm": 0.6738817691802979, "learning_rate": 0.0002, "epoch": 3.6744759556103577, "step": 2980}, {"loss": 0.6209, "grad_norm": 1.1917344331741333, "learning_rate": 0.0002, "epoch": 3.686806411837238, "step": 2990}, {"loss": 0.5373, "grad_norm": 1.3738657236099243, "learning_rate": 0.0002, "epoch": 3.6991368680641186, "step": 3000}, {"loss": 0.5467, "grad_norm": 0.6642793416976929, "learning_rate": 0.0002, "epoch": 3.711467324290999, "step": 3010}, {"loss": 0.6243, "grad_norm": 0.9030995965003967, "learning_rate": 0.0002, "epoch": 3.723797780517879, "step": 3020}, {"loss": 0.592, "grad_norm": 1.0203914642333984, "learning_rate": 0.0002, "epoch": 3.7361282367447597, "step": 3030}, {"loss": 0.5453, "grad_norm": 0.648394763469696, "learning_rate": 0.0002, "epoch": 3.74845869297164, "step": 3040}, {"loss": 0.498, "grad_norm": 0.6304570436477661, "learning_rate": 0.0002, "epoch": 3.76078914919852, "step": 3050}, {"loss": 0.683, "grad_norm": 0.8286601901054382, "learning_rate": 0.0002, "epoch": 3.773119605425401, "step": 3060}, {"loss": 0.5302, "grad_norm": 0.906444251537323, "learning_rate": 0.0002, "epoch": 3.785450061652281, "step": 3070}, {"loss": 0.5345, "grad_norm": 1.4212149381637573, "learning_rate": 0.0002, "epoch": 3.7977805178791613, "step": 3080}, {"loss": 0.6403, "grad_norm": 0.7574319839477539, "learning_rate": 0.0002, "epoch": 3.810110974106042, "step": 3090}, {"loss": 0.5756, "grad_norm": 0.6534451246261597, "learning_rate": 0.0002, "epoch": 3.822441430332922, "step": 3100}, {"loss": 0.5306, "grad_norm": 0.7525447010993958, "learning_rate": 0.0002, "epoch": 3.834771886559803, "step": 3110}, {"loss": 0.5368, "grad_norm": 0.6513990759849548, "learning_rate": 0.0002, "epoch": 3.847102342786683, "step": 3120}, {"loss": 0.5492, "grad_norm": 0.7782694697380066, "learning_rate": 0.0002, "epoch": 3.8594327990135637, "step": 3130}, {"loss": 0.5727, "grad_norm": 0.7998530268669128, "learning_rate": 0.0002, "epoch": 3.871763255240444, "step": 3140}, {"loss": 0.5156, "grad_norm": 0.8045353293418884, "learning_rate": 0.0002, "epoch": 3.884093711467324, "step": 3150}, {"loss": 0.5341, "grad_norm": 0.8242645263671875, "learning_rate": 0.0002, "epoch": 3.896424167694205, "step": 3160}, {"loss": 0.5563, "grad_norm": 0.8302360773086548, "learning_rate": 0.0002, "epoch": 3.908754623921085, "step": 3170}, {"loss": 0.5793, "grad_norm": 0.8653109073638916, "learning_rate": 0.0002, "epoch": 3.9210850801479653, "step": 3180}, {"loss": 0.5219, "grad_norm": 0.6461338996887207, "learning_rate": 0.0002, "epoch": 3.933415536374846, "step": 3190}, {"loss": 0.6009, "grad_norm": 0.8267415165901184, "learning_rate": 0.0002, "epoch": 3.945745992601726, "step": 3200}, {"loss": 0.5956, "grad_norm": 1.1963194608688354, "learning_rate": 0.0002, "epoch": 3.9580764488286064, "step": 3210}, {"loss": 0.5692, "grad_norm": 0.7101966142654419, "learning_rate": 0.0002, "epoch": 3.970406905055487, "step": 3220}, {"loss": 0.5471, "grad_norm": 0.5931660532951355, "learning_rate": 0.0002, "epoch": 3.9827373612823673, "step": 3230}, {"loss": 0.5619, "grad_norm": 0.7465988993644714, "learning_rate": 0.0002, "epoch": 3.995067817509248, "step": 3240}, {"eval_loss": 1.4066498279571533, "eval_runtime": 95.7145, "eval_samples_per_second": 4.555, "eval_steps_per_second": 0.575, "epoch": 4.0, "step": 3244}, {"loss": 0.4948, "grad_norm": 0.9478800296783447, "learning_rate": 0.0002, "epoch": 4.007398273736128, "step": 3250}, {"loss": 0.4129, "grad_norm": 1.207059621810913, "learning_rate": 0.0002, "epoch": 4.019728729963009, "step": 3260}, {"loss": 0.3577, "grad_norm": 0.8984074592590332, "learning_rate": 0.0002, "epoch": 4.032059186189889, "step": 3270}, {"loss": 0.3798, "grad_norm": 0.8104140758514404, "learning_rate": 0.0002, "epoch": 4.044389642416769, "step": 3280}, {"loss": 0.3657, "grad_norm": 1.0875468254089355, "learning_rate": 0.0002, "epoch": 4.05672009864365, "step": 3290}, {"loss": 0.3703, "grad_norm": 0.8520309329032898, "learning_rate": 0.0002, "epoch": 4.06905055487053, "step": 3300}, {"loss": 0.3933, "grad_norm": 1.076735496520996, "learning_rate": 0.0002, "epoch": 4.0813810110974105, "step": 3310}, {"loss": 0.4422, "grad_norm": 0.7789369821548462, "learning_rate": 0.0002, "epoch": 4.093711467324291, "step": 3320}, {"loss": 0.4009, "grad_norm": 0.916862964630127, "learning_rate": 0.0002, "epoch": 4.106041923551172, "step": 3330}, {"loss": 0.3934, "grad_norm": 1.1251654624938965, "learning_rate": 0.0002, "epoch": 4.118372379778052, "step": 3340}, {"loss": 0.3651, "grad_norm": 0.9373420476913452, "learning_rate": 0.0002, "epoch": 4.130702836004932, "step": 3350}, {"loss": 0.384, "grad_norm": 1.03253972530365, "learning_rate": 0.0002, "epoch": 4.143033292231813, "step": 3360}, {"loss": 0.372, "grad_norm": 0.947023332118988, "learning_rate": 0.0002, "epoch": 4.155363748458693, "step": 3370}, {"loss": 0.4018, "grad_norm": 0.8709157109260559, "learning_rate": 0.0002, "epoch": 4.167694204685573, "step": 3380}, {"loss": 0.3754, "grad_norm": 0.930983304977417, "learning_rate": 0.0002, "epoch": 4.180024660912454, "step": 3390}, {"loss": 0.4248, "grad_norm": 1.092809796333313, "learning_rate": 0.0002, "epoch": 4.192355117139334, "step": 3400}, {"loss": 0.4453, "grad_norm": 0.8454303741455078, "learning_rate": 0.0002, "epoch": 4.2046855733662145, "step": 3410}, {"loss": 0.4198, "grad_norm": 0.957210123538971, "learning_rate": 0.0002, "epoch": 4.217016029593095, "step": 3420}, {"loss": 0.3743, "grad_norm": 0.854333758354187, "learning_rate": 0.0002, "epoch": 4.229346485819975, "step": 3430}, {"loss": 0.4041, "grad_norm": 1.0457639694213867, "learning_rate": 0.0002, "epoch": 4.241676942046856, "step": 3440}, {"loss": 0.3817, "grad_norm": 0.8972977995872498, "learning_rate": 0.0002, "epoch": 4.254007398273736, "step": 3450}, {"loss": 0.4445, "grad_norm": 1.0438238382339478, "learning_rate": 0.0002, "epoch": 4.266337854500616, "step": 3460}, {"loss": 0.4078, "grad_norm": 0.7000405192375183, "learning_rate": 0.0002, "epoch": 4.278668310727497, "step": 3470}, {"loss": 0.3718, "grad_norm": 1.0451240539550781, "learning_rate": 0.0002, "epoch": 4.290998766954377, "step": 3480}, {"loss": 0.4506, "grad_norm": 1.3339767456054688, "learning_rate": 0.0002, "epoch": 4.303329223181258, "step": 3490}, {"loss": 0.3999, "grad_norm": 0.7503946423530579, "learning_rate": 0.0002, "epoch": 4.315659679408138, "step": 3500}, {"loss": 0.4503, "grad_norm": 0.8443584442138672, "learning_rate": 0.0002, "epoch": 4.3279901356350186, "step": 3510}, {"loss": 0.3793, "grad_norm": 1.1681201457977295, "learning_rate": 0.0002, "epoch": 4.340320591861899, "step": 3520}, {"loss": 0.4462, "grad_norm": 1.078883171081543, "learning_rate": 0.0002, "epoch": 4.352651048088779, "step": 3530}, {"loss": 0.4216, "grad_norm": 0.6894834041595459, "learning_rate": 0.0002, "epoch": 4.36498150431566, "step": 3540}, {"loss": 0.4315, "grad_norm": 0.7059480547904968, "learning_rate": 0.0002, "epoch": 4.37731196054254, "step": 3550}, {"loss": 0.3821, "grad_norm": 1.1807256937026978, "learning_rate": 0.0002, "epoch": 4.38964241676942, "step": 3560}, {"loss": 0.4192, "grad_norm": 0.8341359496116638, "learning_rate": 0.0002, "epoch": 4.401972872996301, "step": 3570}, {"loss": 0.4123, "grad_norm": 1.0273033380508423, "learning_rate": 0.0002, "epoch": 4.4143033292231815, "step": 3580}, {"loss": 0.5018, "grad_norm": 0.6916454434394836, "learning_rate": 0.0002, "epoch": 4.426633785450061, "step": 3590}, {"loss": 0.3909, "grad_norm": 0.8210113644599915, "learning_rate": 0.0002, "epoch": 4.438964241676942, "step": 3600}, {"loss": 0.3893, "grad_norm": 1.0309500694274902, "learning_rate": 0.0002, "epoch": 4.451294697903823, "step": 3610}, {"loss": 0.3902, "grad_norm": 0.8847399353981018, "learning_rate": 0.0002, "epoch": 4.463625154130703, "step": 3620}, {"loss": 0.4198, "grad_norm": 1.668636679649353, "learning_rate": 0.0002, "epoch": 4.475955610357583, "step": 3630}, {"loss": 0.4075, "grad_norm": 1.3087958097457886, "learning_rate": 0.0002, "epoch": 4.488286066584464, "step": 3640}, {"loss": 0.4294, "grad_norm": 0.837852418422699, "learning_rate": 0.0002, "epoch": 4.500616522811344, "step": 3650}, {"loss": 0.4053, "grad_norm": 9.7662353515625, "learning_rate": 0.0002, "epoch": 4.512946979038224, "step": 3660}, {"loss": 0.4033, "grad_norm": 1.125719428062439, "learning_rate": 0.0002, "epoch": 4.525277435265105, "step": 3670}, {"loss": 0.4566, "grad_norm": 0.7755377292633057, "learning_rate": 0.0002, "epoch": 4.5376078914919855, "step": 3680}, {"loss": 0.4415, "grad_norm": 0.7185089588165283, "learning_rate": 0.0002, "epoch": 4.549938347718865, "step": 3690}, {"loss": 0.4616, "grad_norm": 1.182063102722168, "learning_rate": 0.0002, "epoch": 4.562268803945746, "step": 3700}, {"loss": 0.4572, "grad_norm": 1.001197338104248, "learning_rate": 0.0002, "epoch": 4.574599260172627, "step": 3710}, {"loss": 0.4493, "grad_norm": 0.9705429077148438, "learning_rate": 0.0002, "epoch": 4.586929716399506, "step": 3720}, {"loss": 0.42, "grad_norm": 0.7136746048927307, "learning_rate": 0.0002, "epoch": 4.599260172626387, "step": 3730}, {"loss": 0.3757, "grad_norm": 1.0004864931106567, "learning_rate": 0.0002, "epoch": 4.611590628853268, "step": 3740}, {"loss": 0.4418, "grad_norm": 1.3193715810775757, "learning_rate": 0.0002, "epoch": 4.623921085080148, "step": 3750}, {"loss": 0.4572, "grad_norm": 0.6945042014122009, "learning_rate": 0.0002, "epoch": 4.636251541307028, "step": 3760}, {"loss": 0.4255, "grad_norm": 0.8903936743736267, "learning_rate": 0.0002, "epoch": 4.648581997533909, "step": 3770}, {"loss": 0.3582, "grad_norm": 0.7960889339447021, "learning_rate": 0.0002, "epoch": 4.660912453760789, "step": 3780}, {"loss": 0.3864, "grad_norm": 1.0439172983169556, "learning_rate": 0.0002, "epoch": 4.673242909987669, "step": 3790}, {"loss": 0.4378, "grad_norm": 1.4546219110488892, "learning_rate": 0.0002, "epoch": 4.68557336621455, "step": 3800}, {"loss": 0.4191, "grad_norm": 0.8194343447685242, "learning_rate": 0.0002, "epoch": 4.697903822441431, "step": 3810}, {"loss": 0.4473, "grad_norm": 1.0727602243423462, "learning_rate": 0.0002, "epoch": 4.7102342786683105, "step": 3820}, {"loss": 0.4021, "grad_norm": 0.7785195708274841, "learning_rate": 0.0002, "epoch": 4.722564734895191, "step": 3830}, {"loss": 0.4252, "grad_norm": 0.846783459186554, "learning_rate": 0.0002, "epoch": 4.734895191122072, "step": 3840}, {"loss": 0.4647, "grad_norm": 1.0481648445129395, "learning_rate": 0.0002, "epoch": 4.747225647348952, "step": 3850}, {"loss": 0.4944, "grad_norm": 0.7324008941650391, "learning_rate": 0.0002, "epoch": 4.759556103575832, "step": 3860}, {"loss": 0.3831, "grad_norm": 1.06382417678833, "learning_rate": 0.0002, "epoch": 4.771886559802713, "step": 3870}, {"loss": 0.3934, "grad_norm": 0.9851241111755371, "learning_rate": 0.0002, "epoch": 4.784217016029594, "step": 3880}, {"loss": 0.5172, "grad_norm": 0.8215277791023254, "learning_rate": 0.0002, "epoch": 4.796547472256473, "step": 3890}, {"loss": 0.4437, "grad_norm": 0.9901723861694336, "learning_rate": 0.0002, "epoch": 4.808877928483354, "step": 3900}, {"loss": 0.4673, "grad_norm": 0.9149112701416016, "learning_rate": 0.0002, "epoch": 4.821208384710234, "step": 3910}, {"loss": 0.4295, "grad_norm": 0.9772973656654358, "learning_rate": 0.0002, "epoch": 4.8335388409371145, "step": 3920}, {"loss": 0.4346, "grad_norm": 0.8889636397361755, "learning_rate": 0.0002, "epoch": 4.845869297163995, "step": 3930}, {"loss": 0.421, "grad_norm": 1.3032807111740112, "learning_rate": 0.0002, "epoch": 4.858199753390876, "step": 3940}, {"loss": 0.434, "grad_norm": 0.8575899600982666, "learning_rate": 0.0002, "epoch": 4.870530209617756, "step": 3950}, {"loss": 0.4295, "grad_norm": 1.04326331615448, "learning_rate": 0.0002, "epoch": 4.882860665844636, "step": 3960}, {"loss": 0.3633, "grad_norm": 1.041210651397705, "learning_rate": 0.0002, "epoch": 4.895191122071517, "step": 3970}, {"loss": 0.4104, "grad_norm": 0.9113056063652039, "learning_rate": 0.0002, "epoch": 4.907521578298397, "step": 3980}, {"loss": 0.4496, "grad_norm": 1.019347906112671, "learning_rate": 0.0002, "epoch": 4.919852034525277, "step": 3990}, {"loss": 0.457, "grad_norm": 0.7709218859672546, "learning_rate": 0.0002, "epoch": 4.932182490752158, "step": 4000}, {"loss": 0.4697, "grad_norm": 0.8891775608062744, "learning_rate": 0.0002, "epoch": 4.944512946979038, "step": 4010}, {"loss": 0.4436, "grad_norm": 1.0396920442581177, "learning_rate": 0.0002, "epoch": 4.9568434032059185, "step": 4020}, {"loss": 0.4251, "grad_norm": 0.9239833354949951, "learning_rate": 0.0002, "epoch": 4.969173859432799, "step": 4030}, {"loss": 0.5049, "grad_norm": 1.801400065422058, "learning_rate": 0.0002, "epoch": 4.981504315659679, "step": 4040}, {"loss": 0.4481, "grad_norm": 0.6194164752960205, "learning_rate": 0.0002, "epoch": 4.99383477188656, "step": 4050}, {"eval_loss": 1.544758915901184, "eval_runtime": 96.2573, "eval_samples_per_second": 4.53, "eval_steps_per_second": 0.571, "epoch": 5.0, "step": 4055}, {"loss": 0.3774, "grad_norm": 0.9918256998062134, "learning_rate": 0.0002, "epoch": 5.00616522811344, "step": 4060}, {"loss": 0.2887, "grad_norm": 1.4851351976394653, "learning_rate": 0.0002, "epoch": 5.018495684340321, "step": 4070}, {"loss": 0.2454, "grad_norm": 0.9237686395645142, "learning_rate": 0.0002, "epoch": 5.030826140567201, "step": 4080}, {"loss": 0.3072, "grad_norm": 1.2180852890014648, "learning_rate": 0.0002, "epoch": 5.0431565967940815, "step": 4090}, {"loss": 0.282, "grad_norm": 1.1247979402542114, "learning_rate": 0.0002, "epoch": 5.055487053020962, "step": 4100}, {"loss": 0.3108, "grad_norm": 1.2969884872436523, "learning_rate": 0.0002, "epoch": 5.067817509247842, "step": 4110}, {"loss": 0.2858, "grad_norm": 1.0183063745498657, "learning_rate": 0.0002, "epoch": 5.080147965474723, "step": 4120}, {"loss": 0.295, "grad_norm": 1.121330738067627, "learning_rate": 0.0002, "epoch": 5.092478421701603, "step": 4130}, {"loss": 0.2697, "grad_norm": 1.0748186111450195, "learning_rate": 0.0002, "epoch": 5.104808877928483, "step": 4140}, {"loss": 0.3414, "grad_norm": 1.103474736213684, "learning_rate": 0.0002, "epoch": 5.117139334155364, "step": 4150}, {"loss": 0.305, "grad_norm": 1.2251166105270386, "learning_rate": 0.0002, "epoch": 5.129469790382244, "step": 4160}, {"loss": 0.3131, "grad_norm": 0.920898973941803, "learning_rate": 0.0002, "epoch": 5.141800246609124, "step": 4170}, {"loss": 0.281, "grad_norm": 1.327542781829834, "learning_rate": 0.0002, "epoch": 5.154130702836005, "step": 4180}, {"loss": 0.3214, "grad_norm": 1.0677192211151123, "learning_rate": 0.0002, "epoch": 5.1664611590628855, "step": 4190}, {"loss": 0.2863, "grad_norm": 0.897241473197937, "learning_rate": 0.0002, "epoch": 5.178791615289766, "step": 4200}, {"loss": 0.2967, "grad_norm": 0.977457582950592, "learning_rate": 0.0002, "epoch": 5.191122071516646, "step": 4210}, {"loss": 0.3032, "grad_norm": 1.4115267992019653, "learning_rate": 0.0002, "epoch": 5.203452527743527, "step": 4220}, {"loss": 0.3279, "grad_norm": 1.097743034362793, "learning_rate": 0.0002, "epoch": 5.215782983970407, "step": 4230}, {"loss": 0.293, "grad_norm": 1.1095269918441772, "learning_rate": 0.0002, "epoch": 5.228113440197287, "step": 4240}, {"loss": 0.3544, "grad_norm": 1.3785479068756104, "learning_rate": 0.0002, "epoch": 5.240443896424168, "step": 4250}, {"loss": 0.3118, "grad_norm": 1.0298776626586914, "learning_rate": 0.0002, "epoch": 5.252774352651048, "step": 4260}, {"loss": 0.296, "grad_norm": 1.1592111587524414, "learning_rate": 0.0002, "epoch": 5.265104808877928, "step": 4270}, {"loss": 0.2878, "grad_norm": 1.2355743646621704, "learning_rate": 0.0002, "epoch": 5.277435265104809, "step": 4280}, {"loss": 0.3085, "grad_norm": 0.8543112874031067, "learning_rate": 0.0002, "epoch": 5.2897657213316895, "step": 4290}, {"loss": 0.3108, "grad_norm": 1.2953215837478638, "learning_rate": 0.0002, "epoch": 5.302096177558569, "step": 4300}, {"loss": 0.2912, "grad_norm": 1.1001787185668945, "learning_rate": 0.0002, "epoch": 5.31442663378545, "step": 4310}, {"loss": 0.3003, "grad_norm": 0.7476816773414612, "learning_rate": 0.0002, "epoch": 5.326757090012331, "step": 4320}, {"loss": 0.3247, "grad_norm": 0.8195574283599854, "learning_rate": 0.0002, "epoch": 5.3390875462392104, "step": 4330}, {"loss": 0.3035, "grad_norm": 0.9490262866020203, "learning_rate": 0.0002, "epoch": 5.351418002466091, "step": 4340}, {"loss": 0.2846, "grad_norm": 1.2201412916183472, "learning_rate": 0.0002, "epoch": 5.363748458692972, "step": 4350}, {"loss": 0.2644, "grad_norm": 1.0311479568481445, "learning_rate": 0.0002, "epoch": 5.376078914919852, "step": 4360}, {"loss": 0.3104, "grad_norm": 1.2097488641738892, "learning_rate": 0.0002, "epoch": 5.388409371146732, "step": 4370}, {"loss": 0.2977, "grad_norm": 1.140942096710205, "learning_rate": 0.0002, "epoch": 5.400739827373613, "step": 4380}, {"loss": 0.2975, "grad_norm": 0.8091890811920166, "learning_rate": 0.0002, "epoch": 5.413070283600494, "step": 4390}, {"loss": 0.3727, "grad_norm": 1.4467964172363281, "learning_rate": 0.0002, "epoch": 5.425400739827373, "step": 4400}, {"loss": 0.2979, "grad_norm": 1.0836058855056763, "learning_rate": 0.0002, "epoch": 5.437731196054254, "step": 4410}, {"loss": 0.2601, "grad_norm": 1.0515433549880981, "learning_rate": 0.0002, "epoch": 5.450061652281135, "step": 4420}, {"loss": 0.315, "grad_norm": 0.9603073000907898, "learning_rate": 0.0002, "epoch": 5.4623921085080145, "step": 4430}, {"loss": 0.3166, "grad_norm": 1.234609842300415, "learning_rate": 0.0002, "epoch": 5.474722564734895, "step": 4440}, {"loss": 0.3142, "grad_norm": 0.8881428837776184, "learning_rate": 0.0002, "epoch": 5.487053020961776, "step": 4450}, {"loss": 0.3725, "grad_norm": 1.1817275285720825, "learning_rate": 0.0002, "epoch": 5.499383477188656, "step": 4460}, {"loss": 0.2944, "grad_norm": 1.213993787765503, "learning_rate": 0.0002, "epoch": 5.511713933415536, "step": 4470}, {"loss": 0.3136, "grad_norm": 1.0501725673675537, "learning_rate": 0.0002, "epoch": 5.524044389642417, "step": 4480}, {"loss": 0.306, "grad_norm": 1.5061579942703247, "learning_rate": 0.0002, "epoch": 5.536374845869297, "step": 4490}, {"loss": 0.3226, "grad_norm": 1.1171475648880005, "learning_rate": 0.0002, "epoch": 5.548705302096177, "step": 4500}, {"loss": 0.3624, "grad_norm": 1.1147594451904297, "learning_rate": 0.0002, "epoch": 5.561035758323058, "step": 4510}, {"loss": 0.3435, "grad_norm": 1.0600544214248657, "learning_rate": 0.0002, "epoch": 5.573366214549939, "step": 4520}, {"loss": 0.3268, "grad_norm": 1.247870922088623, "learning_rate": 0.0002, "epoch": 5.5856966707768185, "step": 4530}, {"loss": 0.3168, "grad_norm": 0.9425561428070068, "learning_rate": 0.0002, "epoch": 5.598027127003699, "step": 4540}, {"loss": 0.3119, "grad_norm": 1.1111550331115723, "learning_rate": 0.0002, "epoch": 5.61035758323058, "step": 4550}, {"loss": 0.3389, "grad_norm": 1.743268609046936, "learning_rate": 0.0002, "epoch": 5.62268803945746, "step": 4560}, {"loss": 0.31, "grad_norm": 1.3522645235061646, "learning_rate": 0.0002, "epoch": 5.63501849568434, "step": 4570}, {"loss": 0.3121, "grad_norm": 0.7354221343994141, "learning_rate": 0.0002, "epoch": 5.647348951911221, "step": 4580}, {"loss": 0.3693, "grad_norm": 1.050743818283081, "learning_rate": 0.0002, "epoch": 5.659679408138101, "step": 4590}, {"loss": 0.3449, "grad_norm": 1.1302396059036255, "learning_rate": 0.0002, "epoch": 5.6720098643649814, "step": 4600}, {"loss": 0.3211, "grad_norm": 0.8774183392524719, "learning_rate": 0.0002, "epoch": 5.684340320591862, "step": 4610}, {"loss": 0.3501, "grad_norm": 1.090781569480896, "learning_rate": 0.0002, "epoch": 5.696670776818742, "step": 4620}, {"loss": 0.3211, "grad_norm": 0.9177733063697815, "learning_rate": 0.0002, "epoch": 5.709001233045623, "step": 4630}, {"loss": 0.3439, "grad_norm": 0.9985341429710388, "learning_rate": 0.0002, "epoch": 5.721331689272503, "step": 4640}, {"loss": 0.3323, "grad_norm": 1.0230613946914673, "learning_rate": 0.0002, "epoch": 5.733662145499384, "step": 4650}, {"loss": 0.3525, "grad_norm": 0.944656252861023, "learning_rate": 0.0002, "epoch": 5.745992601726264, "step": 4660}, {"loss": 0.3191, "grad_norm": 0.8162471652030945, "learning_rate": 0.0002, "epoch": 5.758323057953144, "step": 4670}, {"loss": 0.4011, "grad_norm": 1.0500398874282837, "learning_rate": 0.0002, "epoch": 5.770653514180025, "step": 4680}, {"loss": 0.3452, "grad_norm": 0.9487981796264648, "learning_rate": 0.0002, "epoch": 5.782983970406905, "step": 4690}, {"loss": 0.2942, "grad_norm": 1.1856540441513062, "learning_rate": 0.0002, "epoch": 5.7953144266337855, "step": 4700}, {"loss": 0.3107, "grad_norm": 1.2583396434783936, "learning_rate": 0.0002, "epoch": 5.807644882860666, "step": 4710}, {"loss": 0.3223, "grad_norm": 1.2532602548599243, "learning_rate": 0.0002, "epoch": 5.819975339087546, "step": 4720}, {"loss": 0.3253, "grad_norm": 1.115236520767212, "learning_rate": 0.0002, "epoch": 5.832305795314427, "step": 4730}, {"loss": 0.3539, "grad_norm": 1.2245537042617798, "learning_rate": 0.0002, "epoch": 5.844636251541307, "step": 4740}, {"loss": 0.3171, "grad_norm": 1.1964094638824463, "learning_rate": 0.0002, "epoch": 5.856966707768187, "step": 4750}, {"loss": 0.3623, "grad_norm": 1.0833805799484253, "learning_rate": 0.0002, "epoch": 5.869297163995068, "step": 4760}, {"loss": 0.3511, "grad_norm": 1.0694046020507812, "learning_rate": 0.0002, "epoch": 5.881627620221948, "step": 4770}, {"loss": 0.3266, "grad_norm": 0.9947936534881592, "learning_rate": 0.0002, "epoch": 5.893958076448829, "step": 4780}, {"loss": 0.316, "grad_norm": 1.175716519355774, "learning_rate": 0.0002, "epoch": 5.906288532675709, "step": 4790}, {"loss": 0.3609, "grad_norm": 0.7717352509498596, "learning_rate": 0.0002, "epoch": 5.9186189889025895, "step": 4800}, {"loss": 0.3058, "grad_norm": 1.2906442880630493, "learning_rate": 0.0002, "epoch": 5.930949445129469, "step": 4810}, {"loss": 0.3187, "grad_norm": 1.2416284084320068, "learning_rate": 0.0002, "epoch": 5.94327990135635, "step": 4820}, {"loss": 0.337, "grad_norm": 1.3066956996917725, "learning_rate": 0.0002, "epoch": 5.955610357583231, "step": 4830}, {"loss": 0.3167, "grad_norm": 1.0872026681900024, "learning_rate": 0.0002, "epoch": 5.967940813810111, "step": 4840}, {"loss": 0.3262, "grad_norm": 1.1941101551055908, "learning_rate": 0.0002, "epoch": 5.980271270036991, "step": 4850}, {"loss": 0.3234, "grad_norm": 1.1126095056533813, "learning_rate": 0.0002, "epoch": 5.992601726263872, "step": 4860}, {"eval_loss": 1.748323917388916, "eval_runtime": 97.7488, "eval_samples_per_second": 4.46, "eval_steps_per_second": 0.563, "epoch": 6.0, "step": 4866}, {"loss": 0.2774, "grad_norm": 1.3631165027618408, "learning_rate": 0.0002, "epoch": 6.0049321824907524, "step": 4870}, {"loss": 0.2399, "grad_norm": 1.2631664276123047, "learning_rate": 0.0002, "epoch": 6.017262638717632, "step": 4880}, {"loss": 0.2177, "grad_norm": 0.7073080539703369, "learning_rate": 0.0002, "epoch": 6.029593094944513, "step": 4890}, {"loss": 0.215, "grad_norm": 0.7856091856956482, "learning_rate": 0.0002, "epoch": 6.041923551171394, "step": 4900}, {"loss": 0.1999, "grad_norm": 1.145540475845337, "learning_rate": 0.0002, "epoch": 6.054254007398273, "step": 4910}, {"loss": 0.2084, "grad_norm": 1.1742334365844727, "learning_rate": 0.0002, "epoch": 6.066584463625154, "step": 4920}, {"loss": 0.2342, "grad_norm": 0.8043994903564453, "learning_rate": 0.0002, "epoch": 6.078914919852035, "step": 4930}, {"loss": 0.2454, "grad_norm": 1.1877652406692505, "learning_rate": 0.0002, "epoch": 6.0912453760789145, "step": 4940}, {"loss": 0.1908, "grad_norm": 0.7624953985214233, "learning_rate": 0.0002, "epoch": 6.103575832305795, "step": 4950}, {"loss": 0.2254, "grad_norm": 1.0403119325637817, "learning_rate": 0.0002, "epoch": 6.115906288532676, "step": 4960}, {"loss": 0.2274, "grad_norm": 1.2040252685546875, "learning_rate": 0.0002, "epoch": 6.1282367447595565, "step": 4970}, {"loss": 0.2199, "grad_norm": 0.6242546439170837, "learning_rate": 0.0002, "epoch": 6.140567200986436, "step": 4980}, {"loss": 0.27, "grad_norm": 1.1394767761230469, "learning_rate": 0.0002, "epoch": 6.152897657213317, "step": 4990}, {"loss": 0.2377, "grad_norm": 1.3760257959365845, "learning_rate": 0.0002, "epoch": 6.165228113440198, "step": 5000}, {"loss": 0.2331, "grad_norm": 1.0707697868347168, "learning_rate": 0.0002, "epoch": 6.177558569667077, "step": 5010}, {"loss": 0.2311, "grad_norm": 1.288072109222412, "learning_rate": 0.0002, "epoch": 6.189889025893958, "step": 5020}, {"loss": 0.2276, "grad_norm": 1.1479463577270508, "learning_rate": 0.0002, "epoch": 6.202219482120839, "step": 5030}, {"loss": 0.2294, "grad_norm": 0.905891478061676, "learning_rate": 0.0002, "epoch": 6.2145499383477185, "step": 5040}, {"loss": 0.2575, "grad_norm": 1.0354516506195068, "learning_rate": 0.0002, "epoch": 6.226880394574599, "step": 5050}, {"loss": 0.2259, "grad_norm": 1.312671184539795, "learning_rate": 0.0002, "epoch": 6.23921085080148, "step": 5060}, {"loss": 0.2281, "grad_norm": 1.614709734916687, "learning_rate": 0.0002, "epoch": 6.25154130702836, "step": 5070}, {"loss": 0.2388, "grad_norm": 1.0864229202270508, "learning_rate": 0.0002, "epoch": 6.26387176325524, "step": 5080}, {"loss": 0.2014, "grad_norm": 1.0401391983032227, "learning_rate": 0.0002, "epoch": 6.276202219482121, "step": 5090}, {"loss": 0.2419, "grad_norm": 1.2187728881835938, "learning_rate": 0.0002, "epoch": 6.288532675709002, "step": 5100}, {"loss": 0.2144, "grad_norm": 0.9474364519119263, "learning_rate": 0.0002, "epoch": 6.300863131935881, "step": 5110}, {"loss": 0.238, "grad_norm": 1.1228716373443604, "learning_rate": 0.0002, "epoch": 6.313193588162762, "step": 5120}, {"loss": 0.2556, "grad_norm": 0.9294499754905701, "learning_rate": 0.0002, "epoch": 6.325524044389643, "step": 5130}, {"loss": 0.2384, "grad_norm": 1.0521048307418823, "learning_rate": 0.0002, "epoch": 6.337854500616523, "step": 5140}, {"loss": 0.2444, "grad_norm": 1.2406890392303467, "learning_rate": 0.0002, "epoch": 6.350184956843403, "step": 5150}, {"loss": 0.2301, "grad_norm": 1.2972853183746338, "learning_rate": 0.0002, "epoch": 6.362515413070284, "step": 5160}, {"loss": 0.2574, "grad_norm": 0.8772842288017273, "learning_rate": 0.0002, "epoch": 6.374845869297164, "step": 5170}, {"loss": 0.2337, "grad_norm": 1.050349473953247, "learning_rate": 0.0002, "epoch": 6.387176325524044, "step": 5180}, {"loss": 0.2593, "grad_norm": 0.9432134032249451, "learning_rate": 0.0002, "epoch": 6.399506781750925, "step": 5190}, {"loss": 0.2546, "grad_norm": 1.11045241355896, "learning_rate": 0.0002, "epoch": 6.411837237977805, "step": 5200}, {"loss": 0.268, "grad_norm": 1.117530345916748, "learning_rate": 0.0002, "epoch": 6.4241676942046855, "step": 5210}, {"loss": 0.25, "grad_norm": 1.4194035530090332, "learning_rate": 0.0002, "epoch": 6.436498150431566, "step": 5220}, {"loss": 0.2335, "grad_norm": 1.063950777053833, "learning_rate": 0.0002, "epoch": 6.448828606658447, "step": 5230}, {"loss": 0.2299, "grad_norm": 1.2946349382400513, "learning_rate": 0.0002, "epoch": 6.461159062885327, "step": 5240}, {"loss": 0.242, "grad_norm": 1.5237880945205688, "learning_rate": 0.0002, "epoch": 6.473489519112207, "step": 5250}, {"loss": 0.255, "grad_norm": 1.1915720701217651, "learning_rate": 0.0002, "epoch": 6.485819975339088, "step": 5260}, {"loss": 0.2357, "grad_norm": 1.0779626369476318, "learning_rate": 0.0002, "epoch": 6.498150431565968, "step": 5270}, {"loss": 0.2476, "grad_norm": 0.8255738019943237, "learning_rate": 0.0002, "epoch": 6.510480887792848, "step": 5280}, {"loss": 0.267, "grad_norm": 1.275174856185913, "learning_rate": 0.0002, "epoch": 6.522811344019729, "step": 5290}, {"loss": 0.2217, "grad_norm": 1.0878815650939941, "learning_rate": 0.0002, "epoch": 6.535141800246609, "step": 5300}, {"loss": 0.2462, "grad_norm": 1.2594236135482788, "learning_rate": 0.0002, "epoch": 6.5474722564734895, "step": 5310}, {"loss": 0.2457, "grad_norm": 0.9919610619544983, "learning_rate": 0.0002, "epoch": 6.55980271270037, "step": 5320}, {"loss": 0.2933, "grad_norm": 1.3703680038452148, "learning_rate": 0.0002, "epoch": 6.57213316892725, "step": 5330}, {"loss": 0.2453, "grad_norm": 1.403140902519226, "learning_rate": 0.0002, "epoch": 6.584463625154131, "step": 5340}, {"loss": 0.2584, "grad_norm": 1.3477165699005127, "learning_rate": 0.0002, "epoch": 6.596794081381011, "step": 5350}, {"loss": 0.2853, "grad_norm": 1.3145594596862793, "learning_rate": 0.0002, "epoch": 6.609124537607892, "step": 5360}, {"loss": 0.246, "grad_norm": 0.9048973321914673, "learning_rate": 0.0002, "epoch": 6.621454993834772, "step": 5370}, {"loss": 0.2646, "grad_norm": 1.4123972654342651, "learning_rate": 0.0002, "epoch": 6.633785450061652, "step": 5380}, {"loss": 0.272, "grad_norm": 1.3584848642349243, "learning_rate": 0.0002, "epoch": 6.646115906288532, "step": 5390}, {"loss": 0.2663, "grad_norm": 1.2085801362991333, "learning_rate": 0.0002, "epoch": 6.658446362515413, "step": 5400}, {"loss": 0.2796, "grad_norm": 1.9293283224105835, "learning_rate": 0.0002, "epoch": 6.670776818742294, "step": 5410}, {"loss": 0.2412, "grad_norm": 1.3658782243728638, "learning_rate": 0.0002, "epoch": 6.683107274969174, "step": 5420}, {"loss": 0.2442, "grad_norm": 1.2004997730255127, "learning_rate": 0.0002, "epoch": 6.695437731196054, "step": 5430}, {"loss": 0.2693, "grad_norm": 1.0671268701553345, "learning_rate": 0.0002, "epoch": 6.707768187422935, "step": 5440}, {"loss": 0.2216, "grad_norm": 0.8877466320991516, "learning_rate": 0.0002, "epoch": 6.720098643649815, "step": 5450}, {"loss": 0.2678, "grad_norm": 1.2843106985092163, "learning_rate": 0.0002, "epoch": 6.732429099876695, "step": 5460}, {"loss": 0.2418, "grad_norm": 1.0663448572158813, "learning_rate": 0.0002, "epoch": 6.744759556103576, "step": 5470}, {"loss": 0.2402, "grad_norm": 1.3155773878097534, "learning_rate": 0.0002, "epoch": 6.7570900123304565, "step": 5480}, {"loss": 0.2559, "grad_norm": 1.8862448930740356, "learning_rate": 0.0002, "epoch": 6.769420468557336, "step": 5490}, {"loss": 0.2651, "grad_norm": 1.165061116218567, "learning_rate": 0.0002, "epoch": 6.781750924784217, "step": 5500}, {"loss": 0.2342, "grad_norm": 1.0968598127365112, "learning_rate": 0.0002, "epoch": 6.794081381011098, "step": 5510}, {"loss": 0.2453, "grad_norm": 0.9448091983795166, "learning_rate": 0.0002, "epoch": 6.806411837237977, "step": 5520}, {"loss": 0.2609, "grad_norm": 1.400767207145691, "learning_rate": 0.0002, "epoch": 6.818742293464858, "step": 5530}, {"loss": 0.2642, "grad_norm": 1.1031112670898438, "learning_rate": 0.0002, "epoch": 6.831072749691739, "step": 5540}, {"loss": 0.2534, "grad_norm": 1.2436904907226562, "learning_rate": 0.0002, "epoch": 6.843403205918619, "step": 5550}, {"loss": 0.2601, "grad_norm": 1.0987974405288696, "learning_rate": 0.0002, "epoch": 6.855733662145499, "step": 5560}, {"loss": 0.2622, "grad_norm": 0.8656415939331055, "learning_rate": 0.0002, "epoch": 6.86806411837238, "step": 5570}, {"loss": 0.2585, "grad_norm": 1.2153927087783813, "learning_rate": 0.0002, "epoch": 6.8803945745992605, "step": 5580}, {"loss": 0.2888, "grad_norm": 1.111377477645874, "learning_rate": 0.0002, "epoch": 6.89272503082614, "step": 5590}, {"loss": 0.2569, "grad_norm": 1.0041896104812622, "learning_rate": 0.0002, "epoch": 6.905055487053021, "step": 5600}, {"loss": 0.2654, "grad_norm": 1.0638413429260254, "learning_rate": 0.0002, "epoch": 6.917385943279902, "step": 5610}, {"loss": 0.2364, "grad_norm": 0.9756764769554138, "learning_rate": 0.0002, "epoch": 6.929716399506781, "step": 5620}, {"loss": 0.2756, "grad_norm": 1.153550624847412, "learning_rate": 0.0002, "epoch": 6.942046855733662, "step": 5630}, {"loss": 0.2732, "grad_norm": 1.3393985033035278, "learning_rate": 0.0002, "epoch": 6.954377311960543, "step": 5640}, {"loss": 0.2793, "grad_norm": 1.3233463764190674, "learning_rate": 0.0002, "epoch": 6.9667077681874225, "step": 5650}, {"loss": 0.2593, "grad_norm": 1.1693105697631836, "learning_rate": 0.0002, "epoch": 6.979038224414303, "step": 5660}, {"loss": 0.278, "grad_norm": 0.7186262607574463, "learning_rate": 0.0002, "epoch": 6.991368680641184, "step": 5670}, {"eval_loss": 1.936746597290039, "eval_runtime": 99.4259, "eval_samples_per_second": 4.385, "eval_steps_per_second": 0.553, "epoch": 7.0, "step": 5677}, {"loss": 0.2573, "grad_norm": 0.9832284450531006, "learning_rate": 0.0002, "epoch": 7.003699136868065, "step": 5680}, {"loss": 0.1879, "grad_norm": 1.6794530153274536, "learning_rate": 0.0002, "epoch": 7.016029593094944, "step": 5690}, {"loss": 0.1612, "grad_norm": 1.0405313968658447, "learning_rate": 0.0002, "epoch": 7.028360049321825, "step": 5700}, {"loss": 0.1623, "grad_norm": 0.8833287954330444, "learning_rate": 0.0002, "epoch": 7.040690505548706, "step": 5710}, {"loss": 0.1666, "grad_norm": 1.081743597984314, "learning_rate": 0.0002, "epoch": 7.0530209617755855, "step": 5720}, {"loss": 0.1952, "grad_norm": 1.1786993741989136, "learning_rate": 0.0002, "epoch": 7.065351418002466, "step": 5730}, {"loss": 0.1618, "grad_norm": 1.219215989112854, "learning_rate": 0.0002, "epoch": 7.077681874229347, "step": 5740}, {"loss": 0.165, "grad_norm": 0.882033109664917, "learning_rate": 0.0002, "epoch": 7.090012330456227, "step": 5750}, {"loss": 0.1801, "grad_norm": 1.0919346809387207, "learning_rate": 0.0002, "epoch": 7.102342786683107, "step": 5760}, {"loss": 0.1914, "grad_norm": 1.2448198795318604, "learning_rate": 0.0002, "epoch": 7.114673242909988, "step": 5770}, {"loss": 0.1826, "grad_norm": 0.8977628350257874, "learning_rate": 0.0002, "epoch": 7.127003699136868, "step": 5780}, {"loss": 0.2119, "grad_norm": 1.1030590534210205, "learning_rate": 0.0002, "epoch": 7.139334155363748, "step": 5790}, {"loss": 0.1841, "grad_norm": 0.9050454497337341, "learning_rate": 0.0002, "epoch": 7.151664611590629, "step": 5800}, {"loss": 0.1765, "grad_norm": 1.2709665298461914, "learning_rate": 0.0002, "epoch": 7.163995067817509, "step": 5810}, {"loss": 0.2069, "grad_norm": 1.7741143703460693, "learning_rate": 0.0002, "epoch": 7.1763255240443895, "step": 5820}, {"loss": 0.2189, "grad_norm": 1.040995478630066, "learning_rate": 0.0002, "epoch": 7.18865598027127, "step": 5830}, {"loss": 0.1869, "grad_norm": 0.995246171951294, "learning_rate": 0.0002, "epoch": 7.200986436498151, "step": 5840}, {"loss": 0.1727, "grad_norm": 0.962523341178894, "learning_rate": 0.0002, "epoch": 7.213316892725031, "step": 5850}, {"loss": 0.2051, "grad_norm": 1.2010393142700195, "learning_rate": 0.0002, "epoch": 7.225647348951911, "step": 5860}, {"loss": 0.19, "grad_norm": 1.1749597787857056, "learning_rate": 0.0002, "epoch": 7.237977805178792, "step": 5870}, {"loss": 0.1799, "grad_norm": 1.0654889345169067, "learning_rate": 0.0002, "epoch": 7.250308261405672, "step": 5880}, {"loss": 0.2184, "grad_norm": 0.761138379573822, "learning_rate": 0.0002, "epoch": 7.262638717632552, "step": 5890}, {"loss": 0.1828, "grad_norm": 0.9512502551078796, "learning_rate": 0.0002, "epoch": 7.274969173859433, "step": 5900}, {"loss": 0.1655, "grad_norm": 0.7542949318885803, "learning_rate": 0.0002, "epoch": 7.287299630086313, "step": 5910}, {"loss": 0.1985, "grad_norm": 0.7638646364212036, "learning_rate": 0.0002, "epoch": 7.2996300863131935, "step": 5920}, {"loss": 0.1953, "grad_norm": 1.162330985069275, "learning_rate": 0.0002, "epoch": 7.311960542540074, "step": 5930}, {"loss": 0.204, "grad_norm": 1.5835925340652466, "learning_rate": 0.0002, "epoch": 7.324290998766954, "step": 5940}, {"loss": 0.178, "grad_norm": 1.0043281316757202, "learning_rate": 0.0002, "epoch": 7.336621454993835, "step": 5950}, {"loss": 0.1819, "grad_norm": 1.2750244140625, "learning_rate": 0.0002, "epoch": 7.348951911220715, "step": 5960}, {"loss": 0.1917, "grad_norm": 0.8582083582878113, "learning_rate": 0.0002, "epoch": 7.361282367447595, "step": 5970}, {"loss": 0.2022, "grad_norm": 1.0025495290756226, "learning_rate": 0.0002, "epoch": 7.373612823674476, "step": 5980}, {"loss": 0.1971, "grad_norm": 1.030452847480774, "learning_rate": 0.0002, "epoch": 7.3859432799013565, "step": 5990}, {"loss": 0.2026, "grad_norm": 0.9436936378479004, "learning_rate": 0.0002, "epoch": 7.398273736128237, "step": 6000}, {"loss": 0.1847, "grad_norm": 1.3259925842285156, "learning_rate": 0.0002, "epoch": 7.410604192355117, "step": 6010}, {"loss": 0.1794, "grad_norm": 0.884767472743988, "learning_rate": 0.0002, "epoch": 7.422934648581998, "step": 6020}, {"loss": 0.1712, "grad_norm": 0.8467209339141846, "learning_rate": 0.0002, "epoch": 7.435265104808878, "step": 6030}, {"loss": 0.2025, "grad_norm": 0.9294904470443726, "learning_rate": 0.0002, "epoch": 7.447595561035758, "step": 6040}, {"loss": 0.1808, "grad_norm": 1.2054014205932617, "learning_rate": 0.0002, "epoch": 7.459926017262639, "step": 6050}, {"loss": 0.193, "grad_norm": 0.9458960294723511, "learning_rate": 0.0002, "epoch": 7.472256473489519, "step": 6060}, {"loss": 0.1762, "grad_norm": 1.0876508951187134, "learning_rate": 0.0002, "epoch": 7.484586929716399, "step": 6070}, {"loss": 0.1999, "grad_norm": 1.110326886177063, "learning_rate": 0.0002, "epoch": 7.49691738594328, "step": 6080}, {"loss": 0.1854, "grad_norm": 1.1584968566894531, "learning_rate": 0.0002, "epoch": 7.5092478421701605, "step": 6090}, {"loss": 0.2059, "grad_norm": 1.0806410312652588, "learning_rate": 0.0002, "epoch": 7.52157829839704, "step": 6100}, {"loss": 0.2042, "grad_norm": 0.9162251353263855, "learning_rate": 0.0002, "epoch": 7.533908754623921, "step": 6110}, {"loss": 0.1981, "grad_norm": 1.044049620628357, "learning_rate": 0.0002, "epoch": 7.546239210850802, "step": 6120}, {"loss": 0.1857, "grad_norm": 0.9524619579315186, "learning_rate": 0.0002, "epoch": 7.558569667077682, "step": 6130}, {"loss": 0.2113, "grad_norm": 1.0031976699829102, "learning_rate": 0.0002, "epoch": 7.570900123304562, "step": 6140}, {"loss": 0.2008, "grad_norm": 1.342751383781433, "learning_rate": 0.0002, "epoch": 7.583230579531443, "step": 6150}, {"loss": 0.1895, "grad_norm": 1.4278815984725952, "learning_rate": 0.0002, "epoch": 7.595561035758323, "step": 6160}, {"loss": 0.2473, "grad_norm": 1.6231565475463867, "learning_rate": 0.0002, "epoch": 7.607891491985203, "step": 6170}, {"loss": 0.1861, "grad_norm": 1.0082448720932007, "learning_rate": 0.0002, "epoch": 7.620221948212084, "step": 6180}, {"loss": 0.1945, "grad_norm": 1.1605384349822998, "learning_rate": 0.0002, "epoch": 7.6325524044389645, "step": 6190}, {"loss": 0.196, "grad_norm": 1.3302881717681885, "learning_rate": 0.0002, "epoch": 7.644882860665844, "step": 6200}, {"loss": 0.2169, "grad_norm": 1.3318504095077515, "learning_rate": 0.0002, "epoch": 7.657213316892725, "step": 6210}, {"loss": 0.1872, "grad_norm": 1.265977144241333, "learning_rate": 0.0002, "epoch": 7.669543773119606, "step": 6220}, {"loss": 0.1917, "grad_norm": 1.3964512348175049, "learning_rate": 0.0002, "epoch": 7.6818742293464854, "step": 6230}, {"loss": 0.2244, "grad_norm": 1.148972988128662, "learning_rate": 0.0002, "epoch": 7.694204685573366, "step": 6240}, {"loss": 0.2038, "grad_norm": 0.8778917193412781, "learning_rate": 0.0002, "epoch": 7.706535141800247, "step": 6250}, {"loss": 0.2399, "grad_norm": 1.3537850379943848, "learning_rate": 0.0002, "epoch": 7.7188655980271275, "step": 6260}, {"loss": 0.194, "grad_norm": 0.8741335868835449, "learning_rate": 0.0002, "epoch": 7.731196054254007, "step": 6270}, {"loss": 0.2178, "grad_norm": 0.9642979502677917, "learning_rate": 0.0002, "epoch": 7.743526510480888, "step": 6280}, {"loss": 0.2328, "grad_norm": 1.4556978940963745, "learning_rate": 0.0002, "epoch": 7.755856966707768, "step": 6290}, {"loss": 0.2309, "grad_norm": 1.1485596895217896, "learning_rate": 0.0002, "epoch": 7.768187422934648, "step": 6300}, {"loss": 0.2178, "grad_norm": 1.2361459732055664, "learning_rate": 0.0002, "epoch": 7.780517879161529, "step": 6310}, {"loss": 0.2291, "grad_norm": 1.0271167755126953, "learning_rate": 0.0002, "epoch": 7.79284833538841, "step": 6320}, {"loss": 0.2112, "grad_norm": 1.2584497928619385, "learning_rate": 0.0002, "epoch": 7.8051787916152895, "step": 6330}, {"loss": 0.2098, "grad_norm": 0.9013339877128601, "learning_rate": 0.0002, "epoch": 7.81750924784217, "step": 6340}, {"loss": 0.1927, "grad_norm": 1.1033759117126465, "learning_rate": 0.0002, "epoch": 7.829839704069051, "step": 6350}, {"loss": 0.2294, "grad_norm": 1.4669054746627808, "learning_rate": 0.0002, "epoch": 7.842170160295931, "step": 6360}, {"loss": 0.2176, "grad_norm": 1.3915599584579468, "learning_rate": 0.0002, "epoch": 7.854500616522811, "step": 6370}, {"loss": 0.2294, "grad_norm": 1.6034538745880127, "learning_rate": 0.0002, "epoch": 7.866831072749692, "step": 6380}, {"loss": 0.2244, "grad_norm": 1.3022582530975342, "learning_rate": 0.0002, "epoch": 7.879161528976573, "step": 6390}, {"loss": 0.2316, "grad_norm": 1.0695449113845825, "learning_rate": 0.0002, "epoch": 7.891491985203452, "step": 6400}, {"loss": 0.2203, "grad_norm": 1.1082428693771362, "learning_rate": 0.0002, "epoch": 7.903822441430333, "step": 6410}, {"loss": 0.2102, "grad_norm": 0.9848728775978088, "learning_rate": 0.0002, "epoch": 7.916152897657213, "step": 6420}, {"loss": 0.2049, "grad_norm": 0.8668254017829895, "learning_rate": 0.0002, "epoch": 7.9284833538840935, "step": 6430}, {"loss": 0.2054, "grad_norm": 0.9431440234184265, "learning_rate": 0.0002, "epoch": 7.940813810110974, "step": 6440}, {"loss": 0.2364, "grad_norm": 1.3903534412384033, "learning_rate": 0.0002, "epoch": 7.953144266337855, "step": 6450}, {"loss": 0.2193, "grad_norm": 1.111591100692749, "learning_rate": 0.0002, "epoch": 7.965474722564735, "step": 6460}, {"loss": 0.2178, "grad_norm": 0.9858004450798035, "learning_rate": 0.0002, "epoch": 7.977805178791615, "step": 6470}, {"loss": 0.208, "grad_norm": 0.9721771478652954, "learning_rate": 0.0002, "epoch": 7.990135635018496, "step": 6480}]}