diff --git a/README.md b/README.md index bc5f30d6632ac0efdc7be2e9095e9e9579af2e33..92fd7bc40ba324c8a9c1897eeba2c94b46e9f526 100644 --- a/README.md +++ b/README.md @@ -1,199 +1,12 @@ ---- -library_name: transformers -tags: [] ---- +# dissimilar_FullFT -# Model Card for Model ID +A LLaMA model fine-tuned on the QA_CODE_SUMMARIZATION dataset. - +- **LoRA**: Not used (full fine-tuning) +- **LoRA Rank**: N/A +- **Tasks**: QA_CODE_SUMMARIZATION +- **Base Model**: LLaMA 3.2 1B (`meta-llama/Llama-3.2-1B`) +- **Optimizer**: AdamW +- **Batch Size**: 4 - - -## Model Details - -### Model Description - - - -This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated. - -- **Developed by:** [More Information Needed] -- **Funded by [optional]:** [More Information Needed] -- **Shared by [optional]:** [More Information Needed] -- **Model type:** [More Information Needed] -- **Language(s) (NLP):** [More Information Needed] -- **License:** [More Information Needed] -- **Finetuned from model [optional]:** [More Information Needed] - -### Model Sources [optional] - - - -- **Repository:** [More Information Needed] -- **Paper [optional]:** [More Information Needed] -- **Demo [optional]:** [More Information Needed] - -## Uses - - - -### Direct Use - - - -[More Information Needed] - -### Downstream Use [optional] - - - -[More Information Needed] - -### Out-of-Scope Use - - - -[More Information Needed] - -## Bias, Risks, and Limitations - - - -[More Information Needed] - -### Recommendations - - - -Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. - -## How to Get Started with the Model - -Use the code below to get started with the model. 
- -[More Information Needed] - -## Training Details - -### Training Data - - - -[More Information Needed] - -### Training Procedure - - - -#### Preprocessing [optional] - -[More Information Needed] - - -#### Training Hyperparameters - -- **Training regime:** [More Information Needed] - -#### Speeds, Sizes, Times [optional] - - - -[More Information Needed] - -## Evaluation - - - -### Testing Data, Factors & Metrics - -#### Testing Data - - - -[More Information Needed] - -#### Factors - - - -[More Information Needed] - -#### Metrics - - - -[More Information Needed] - -### Results - -[More Information Needed] - -#### Summary - - - -## Model Examination [optional] - - - -[More Information Needed] - -## Environmental Impact - - - -Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). - -- **Hardware Type:** [More Information Needed] -- **Hours used:** [More Information Needed] -- **Cloud Provider:** [More Information Needed] -- **Compute Region:** [More Information Needed] -- **Carbon Emitted:** [More Information Needed] - -## Technical Specifications [optional] - -### Model Architecture and Objective - -[More Information Needed] - -### Compute Infrastructure - -[More Information Needed] - -#### Hardware - -[More Information Needed] - -#### Software - -[More Information Needed] - -## Citation [optional] - - - -**BibTeX:** - -[More Information Needed] - -**APA:** - -[More Information Needed] - -## Glossary [optional] - - - -[More Information Needed] - -## More Information [optional] - -[More Information Needed] - -## Model Card Authors [optional] - -[More Information Needed] - -## Model Card Contact - -[More Information Needed] \ No newline at end of file +Trained using the 🤗 Transformers `Trainer` API. 
diff --git a/checkpoint-11200/config.json b/checkpoint-11200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-11200/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-11200/generation_config.json b/checkpoint-11200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-11200/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-11200/model.safetensors b/checkpoint-11200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..580671808c4249e1990f5e89ce49b22db982bc15 --- /dev/null +++ b/checkpoint-11200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d90debf519ff58fbced5650010c2f0af86ac8df928b02ec227c4bcdc210ed88 +size 2471645608 diff --git 
a/checkpoint-11200/optimizer.pt b/checkpoint-11200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0340daae653a0cd24dce4ac448e063cffe0485a --- /dev/null +++ b/checkpoint-11200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb01368b519db3fdb91bdd1ec7763020533514514df96d0cc7e012c51efd118 +size 4943382114 diff --git a/checkpoint-11200/rng_state.pth b/checkpoint-11200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b90a80289e8b4a1364c141ec2bd718026fcf6df6 --- /dev/null +++ b/checkpoint-11200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b274076e93b274478bebe3dfb954b3391ae67cd9a9156c93fe95545bd14ca5c +size 14244 diff --git a/checkpoint-11200/scheduler.pt b/checkpoint-11200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d158af99238a27cea6416ed01bcfeda1468a4852 --- /dev/null +++ b/checkpoint-11200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43fbf940fdad622d543be6e79ee80a93eace452ff6fc2bd4f2a917b2a611852 +size 1064 diff --git a/checkpoint-11200/special_tokens_map.json b/checkpoint-11200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-11200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-11200/tokenizer.json b/checkpoint-11200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-11200/tokenizer.json @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-11200/tokenizer_config.json b/checkpoint-11200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-11200/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": 
"<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": 
"<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": 
"<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-11200/trainer_state.json 
b/checkpoint-11200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f5af3ed8d944464b1d2feaf242a030a9b1a880ad --- /dev/null +++ b/checkpoint-11200/trainer_state.json @@ -0,0 +1,817 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5973333333333334, + "eval_steps": 500, + "global_step": 11200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 
9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + 
"grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + 
"grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + 
"learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 
6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + 
"grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 
0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.357192702066688e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-11200/training_args.bin 
b/checkpoint-11200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-11200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-16800/config.json b/checkpoint-16800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-16800/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-16800/generation_config.json b/checkpoint-16800/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-16800/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-16800/model.safetensors 
b/checkpoint-16800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..efa96cb2af78f85f81b6c51f072ecab2585c1577 --- /dev/null +++ b/checkpoint-16800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb489c55e347b0636fb8b40554f8c0328f8dfd269b47d76c7292c30028ba3c1 +size 2471645608 diff --git a/checkpoint-16800/optimizer.pt b/checkpoint-16800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9aaee1a762ac37ee0b638f498f9da71fa39f1b5 --- /dev/null +++ b/checkpoint-16800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58d85d02c8eaa817fb42e7b7c16e63486d208a6d2a67f00a7898d6e457d8ab76 +size 4943382114 diff --git a/checkpoint-16800/rng_state.pth b/checkpoint-16800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b90a80289e8b4a1364c141ec2bd718026fcf6df6 --- /dev/null +++ b/checkpoint-16800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b274076e93b274478bebe3dfb954b3391ae67cd9a9156c93fe95545bd14ca5c +size 14244 diff --git a/checkpoint-16800/scheduler.pt b/checkpoint-16800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..09a82def0608f4f1ac5867c56e4a8f4c24cd9123 --- /dev/null +++ b/checkpoint-16800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:563a15aed2aab0d13458a1d1ed48e8abddbe4c9cb9763b2e1022121331f75d56 +size 1064 diff --git a/checkpoint-16800/special_tokens_map.json b/checkpoint-16800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-16800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-16800/tokenizer.json b/checkpoint-16800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-16800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-16800/tokenizer_config.json b/checkpoint-16800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-16800/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-16800/trainer_state.json b/checkpoint-16800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e76e5d5aa73244483185619049c919e4f52d4683 --- /dev/null +++ b/checkpoint-16800/trainer_state.json @@ -0,0 +1,1209 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.896, + "eval_steps": 500, + "global_step": 16800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 
13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + 
"grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + 
"grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + 
"grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + 
"grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, 
+ "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 
0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + 
{ + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 
2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 
8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + 
"learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.035789053100032e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-16800/training_args.bin b/checkpoint-16800/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-16800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-22400/config.json b/checkpoint-22400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-22400/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-22400/generation_config.json b/checkpoint-22400/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-22400/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-22400/model.safetensors b/checkpoint-22400/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..245078c57fd3379dc0c0eb975b373c2d6251e88c --- /dev/null +++ b/checkpoint-22400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85afa4f30a0cfc7af736e1fcfb9ead1260d52aac8c8ea3c4022aa7952858afe0 +size 2471645608 diff --git a/checkpoint-22400/optimizer.pt b/checkpoint-22400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0c655bf7da48c294bc1a33873156b9a4fa5adcb --- /dev/null +++ b/checkpoint-22400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c14801432237fc4ed2c224553b8fa93805b5bf95050edc0c2abbedfb42d018 +size 4943382114 diff --git a/checkpoint-22400/rng_state.pth b/checkpoint-22400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d92002db8004af01b90f2f4177e1273a6a0e7cc --- /dev/null +++ b/checkpoint-22400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b0db50774a6c9a505c003900f29da4a6b6e4931ce68e70206b79caf6492446 +size 14244 diff --git a/checkpoint-22400/scheduler.pt b/checkpoint-22400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..decbcc49b3806b5c9a2921a34552ecff71c3a0f1 --- /dev/null +++ b/checkpoint-22400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b127796410352623d0dd7e566131a49ed6113c718c0e7d94a39f512ecdd67a70 +size 1064 diff --git a/checkpoint-22400/special_tokens_map.json b/checkpoint-22400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-22400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-22400/tokenizer.json b/checkpoint-22400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-22400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-22400/tokenizer_config.json b/checkpoint-22400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-22400/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": 
"<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": 
"<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": 
"<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + 
"clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-22400/trainer_state.json b/checkpoint-22400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5de95872e0290abf4c1c02846b8cca27a7cb461 --- /dev/null +++ b/checkpoint-22400/trainer_state.json @@ -0,0 +1,1601 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.1946666666666665, + "eval_steps": 500, + "global_step": 22400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + 
"step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + 
"loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + 
"loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + 
"step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 
2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + 
"loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 
9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + 
"learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + 
"grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + 
"epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + 
{ + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 
17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 
1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + 
"loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + 
"learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0714385404133376e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-22400/training_args.bin b/checkpoint-22400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-22400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-28000/config.json b/checkpoint-28000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-28000/config.json @@ -0,0 
+1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-28000/generation_config.json b/checkpoint-28000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-28000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-28000/model.safetensors b/checkpoint-28000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10c51b0ec1f770f9f618d4f08c1009bbc4f2af73 --- /dev/null +++ b/checkpoint-28000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccffbecc635f89e1e5d5b32f451a26e3e055c995147b55a954ca7700af91d53d +size 2471645608 diff --git a/checkpoint-28000/optimizer.pt b/checkpoint-28000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2001d88d9c029ac22d415ee782aefa15bc5eba2 --- /dev/null +++ b/checkpoint-28000/optimizer.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:97025d2b17a21bd0abbe5564ac32e3dec41fa0b6aa2aded186cad7a92ed40970 +size 4943382114 diff --git a/checkpoint-28000/rng_state.pth b/checkpoint-28000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d92002db8004af01b90f2f4177e1273a6a0e7cc --- /dev/null +++ b/checkpoint-28000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b0db50774a6c9a505c003900f29da4a6b6e4931ce68e70206b79caf6492446 +size 14244 diff --git a/checkpoint-28000/scheduler.pt b/checkpoint-28000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4527d0f5fea4d692dadd15f9196b3861cabe2a97 --- /dev/null +++ b/checkpoint-28000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f28b382bfc8eddc8ec5360742ee3b7c90574a41681bed25f48cd4b9a094fd41 +size 1064 diff --git a/checkpoint-28000/special_tokens_map.json b/checkpoint-28000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-28000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-28000/tokenizer.json b/checkpoint-28000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-28000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-28000/tokenizer_config.json b/checkpoint-28000/tokenizer_config.json new 
file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-28000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": 
"<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + 
"content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": 
"<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": 
"<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-28000/trainer_state.json b/checkpoint-28000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f78b1d701e1e872f9ea8b52c0df4bf01cab5519e --- /dev/null +++ b/checkpoint-28000/trainer_state.json @@ -0,0 +1,1993 @@ +{ + 
"best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4933333333333334, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + 
"epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { 
+ "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + 
}, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 
0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + 
"epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, 
+ { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 
+ }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, 
+ "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 
8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + 
"learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + 
"learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + 
"grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 
1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + 
"epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 
1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 
6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + "learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 
6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { 
+ "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + 
"stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.339298175516672e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-28000/training_args.bin b/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-33600/config.json b/checkpoint-33600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-33600/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-33600/generation_config.json b/checkpoint-33600/generation_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-33600/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-33600/model.safetensors b/checkpoint-33600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c9fbe1238305401dfe2027b6b07d8d92024214cc --- /dev/null +++ b/checkpoint-33600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b86fc20dcfa3faaa17e9c5f8a92305591b0ac52170cc706cf65b3870ad4351 +size 2471645608 diff --git a/checkpoint-33600/optimizer.pt b/checkpoint-33600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..52fd9ee87536f5854332932d892f00e3ea94bbc8 --- /dev/null +++ b/checkpoint-33600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8acc588e71f25cbd7b465ef6b586779eedc075417672866d278857fcdc71538 +size 4943382114 diff --git a/checkpoint-33600/rng_state.pth b/checkpoint-33600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8d92002db8004af01b90f2f4177e1273a6a0e7cc --- /dev/null +++ b/checkpoint-33600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b0db50774a6c9a505c003900f29da4a6b6e4931ce68e70206b79caf6492446 +size 14244 diff --git a/checkpoint-33600/scheduler.pt b/checkpoint-33600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..12b6a79a6f2faf29b627389748cb1a43b88eba02 --- /dev/null +++ b/checkpoint-33600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6093beba822844de95d58fc05a03de386ec7f8d8fb3284f404948d6fe0df1f +size 1064 diff --git a/checkpoint-33600/special_tokens_map.json 
b/checkpoint-33600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-33600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-33600/tokenizer.json b/checkpoint-33600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-33600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-33600/tokenizer_config.json b/checkpoint-33600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-33600/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-33600/trainer_state.json b/checkpoint-33600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..99b6eb700986ec89f2744bc7c10e90e5b1240c23 --- /dev/null +++ b/checkpoint-33600/trainer_state.json @@ -0,0 +1,2385 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.792, + "eval_steps": 500, + "global_step": 33600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, 
+ "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + 
"grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + 
"grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + 
"grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + 
"grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 
0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + 
"epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 
+ }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 
2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 
8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + 
"learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 
6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, 
+ "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 
1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + 
"step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 
6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + 
"learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 
1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { + "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + 
{ + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + "learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + 
"step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + "loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + "step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + 
"loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + "learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + "step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + "learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 
1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + 
}, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6071578106200064e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-33600/training_args.bin b/checkpoint-33600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-33600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-39200/config.json b/checkpoint-39200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-39200/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + 
"max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-39200/generation_config.json b/checkpoint-39200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-39200/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-39200/model.safetensors b/checkpoint-39200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68ef89f57705f7ab5be53bc1ab17c11108ba1ead --- /dev/null +++ b/checkpoint-39200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5afe43809090098decd242fc7cbcadcaaff8f9741c591e83883d6653e84a0ed5 +size 2471645608 diff --git a/checkpoint-39200/optimizer.pt b/checkpoint-39200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4e61bbf430ecc4536eddbb30b9e358df7a4704c6 --- /dev/null +++ b/checkpoint-39200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43b17e0aec11b8d42bbe19f71db52e6bc4e4a3e1cd3c25e5a5018fc891550c77 +size 4943382114 diff --git a/checkpoint-39200/rng_state.pth b/checkpoint-39200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..95cad945c935ebaeeca5e461007867b6e155022e --- 
/dev/null +++ b/checkpoint-39200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adbebcf39742024134d36312cc62baede0eb396a36041797f643dcac19c1b2 +size 14244 diff --git a/checkpoint-39200/scheduler.pt b/checkpoint-39200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..db8af28902be52ff910ca3a6422399ef29fd2a55 --- /dev/null +++ b/checkpoint-39200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad525b3cd9d0ad54a570483e15a19a7847d543fba5304a42835e80a4aad5f227 +size 1064 diff --git a/checkpoint-39200/special_tokens_map.json b/checkpoint-39200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-39200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-39200/tokenizer.json b/checkpoint-39200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-39200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-39200/tokenizer_config.json b/checkpoint-39200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-39200/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": 
"<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + 
"content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": 
"<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": 
"<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": 
"<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-39200/trainer_state.json b/checkpoint-39200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aaf7014c266aadc68f5d8cbc926da1027f99d5b8 --- /dev/null +++ b/checkpoint-39200/trainer_state.json @@ -0,0 +1,2777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0906666666666665, + "eval_steps": 500, + "global_step": 39200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 
1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + 
"learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + 
"learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + 
"learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + 
"learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + 
"learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + 
"grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 
0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + 
}, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 
2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + 
"loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 
7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + 
"learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + 
"grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + 
"epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 
1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 
6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + "learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 
6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { + "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 
1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + "learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + 
"epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + "step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + "loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + 
"step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + "loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + "learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + "step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + 
"learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 
1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + }, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 7.96875, + "learning_rate": 4.454320987654321e-05, + "loss": 1.6492, + "step": 33700 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 10.625, + "learning_rate": 4.434567901234568e-05, + "loss": 1.5981, + "step": 33800 + }, + { + "epoch": 1.808, + "grad_norm": 5.65625, + "learning_rate": 4.414814814814815e-05, + "loss": 1.5606, + "step": 33900 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 4.90625, + "learning_rate": 4.3950617283950617e-05, + "loss": 1.5981, + "step": 34000 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 6.0, + "learning_rate": 4.375308641975309e-05, + "loss": 1.5976, + "step": 34100 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.5625, + "learning_rate": 4.355555555555556e-05, + "loss": 
1.6783, + "step": 34200 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 6.96875, + "learning_rate": 4.335802469135803e-05, + "loss": 1.6716, + "step": 34300 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 4.6875, + "learning_rate": 4.31604938271605e-05, + "loss": 1.5989, + "step": 34400 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.71875, + "learning_rate": 4.296296296296296e-05, + "loss": 1.6317, + "step": 34500 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 5.78125, + "learning_rate": 4.2765432098765434e-05, + "loss": 1.6327, + "step": 34600 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 5.59375, + "learning_rate": 4.2567901234567905e-05, + "loss": 1.5324, + "step": 34700 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 7.65625, + "learning_rate": 4.237037037037037e-05, + "loss": 1.6141, + "step": 34800 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 9.4375, + "learning_rate": 4.217283950617284e-05, + "loss": 1.6398, + "step": 34900 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 8.875, + "learning_rate": 4.197530864197531e-05, + "loss": 1.5835, + "step": 35000 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 12.0625, + "learning_rate": 4.177777777777778e-05, + "loss": 1.633, + "step": 35100 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 8.375, + "learning_rate": 4.158024691358025e-05, + "loss": 1.6851, + "step": 35200 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 11.5625, + "learning_rate": 4.138271604938272e-05, + "loss": 1.6436, + "step": 35300 + }, + { + "epoch": 1.888, + "grad_norm": 7.78125, + "learning_rate": 4.1185185185185186e-05, + "loss": 1.6268, + "step": 35400 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 11.1875, + "learning_rate": 4.0987654320987657e-05, + "loss": 1.5537, + "step": 35500 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 10.8125, + "learning_rate": 4.079012345679013e-05, + "loss": 1.6954, + "step": 35600 + }, + { + "epoch": 1.904, + 
"grad_norm": 10.625, + "learning_rate": 4.059259259259259e-05, + "loss": 1.6122, + "step": 35700 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 4.4375, + "learning_rate": 4.039506172839506e-05, + "loss": 1.6308, + "step": 35800 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 5.4375, + "learning_rate": 4.019753086419753e-05, + "loss": 1.6331, + "step": 35900 + }, + { + "epoch": 1.92, + "grad_norm": 5.125, + "learning_rate": 4e-05, + "loss": 1.5898, + "step": 36000 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 13.5625, + "learning_rate": 3.9802469135802474e-05, + "loss": 1.6748, + "step": 36100 + }, + { + "epoch": 1.9306666666666668, + "grad_norm": 5.40625, + "learning_rate": 3.960493827160494e-05, + "loss": 1.6326, + "step": 36200 + }, + { + "epoch": 1.936, + "grad_norm": 8.0, + "learning_rate": 3.940740740740741e-05, + "loss": 1.6027, + "step": 36300 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 12.625, + "learning_rate": 3.920987654320988e-05, + "loss": 1.5298, + "step": 36400 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 5.875, + "learning_rate": 3.901234567901234e-05, + "loss": 1.6354, + "step": 36500 + }, + { + "epoch": 1.952, + "grad_norm": 5.40625, + "learning_rate": 3.8814814814814814e-05, + "loss": 1.6155, + "step": 36600 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 5.15625, + "learning_rate": 3.8617283950617285e-05, + "loss": 1.6524, + "step": 36700 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 8.0625, + "learning_rate": 3.8419753086419755e-05, + "loss": 1.6594, + "step": 36800 + }, + { + "epoch": 1.968, + "grad_norm": 11.0, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.6397, + "step": 36900 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 6.96875, + "learning_rate": 3.80246913580247e-05, + "loss": 1.6208, + "step": 37000 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 9.125, + "learning_rate": 3.782716049382716e-05, + "loss": 1.5995, + "step": 37100 + }, + { + "epoch": 
1.984, + "grad_norm": 8.8125, + "learning_rate": 3.762962962962963e-05, + "loss": 1.59, + "step": 37200 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 8.1875, + "learning_rate": 3.74320987654321e-05, + "loss": 1.6343, + "step": 37300 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 7.65625, + "learning_rate": 3.7234567901234566e-05, + "loss": 1.6007, + "step": 37400 + }, + { + "epoch": 2.0, + "grad_norm": 6.125, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.6295, + "step": 37500 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 6.65625, + "learning_rate": 3.683950617283951e-05, + "loss": 1.2317, + "step": 37600 + }, + { + "epoch": 2.010666666666667, + "grad_norm": 6.9375, + "learning_rate": 3.664197530864198e-05, + "loss": 1.3769, + "step": 37700 + }, + { + "epoch": 2.016, + "grad_norm": 5.46875, + "learning_rate": 3.644444444444445e-05, + "loss": 1.3206, + "step": 37800 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 7.3125, + "learning_rate": 3.624691358024692e-05, + "loss": 1.2903, + "step": 37900 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.604938271604938e-05, + "loss": 1.3443, + "step": 38000 + }, + { + "epoch": 2.032, + "grad_norm": 6.375, + "learning_rate": 3.5851851851851854e-05, + "loss": 1.291, + "step": 38100 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 7.84375, + "learning_rate": 3.5654320987654325e-05, + "loss": 1.2552, + "step": 38200 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 8.9375, + "learning_rate": 3.545679012345679e-05, + "loss": 1.2883, + "step": 38300 + }, + { + "epoch": 2.048, + "grad_norm": 6.09375, + "learning_rate": 3.525925925925926e-05, + "loss": 1.2755, + "step": 38400 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 6.0625, + "learning_rate": 3.506172839506173e-05, + "loss": 1.3612, + "step": 38500 + }, + { + "epoch": 2.058666666666667, + "grad_norm": 8.625, + "learning_rate": 3.48641975308642e-05, + "loss": 1.2394, + "step": 38600 + }, + 
{ + "epoch": 2.064, + "grad_norm": 8.25, + "learning_rate": 3.466666666666667e-05, + "loss": 1.3005, + "step": 38700 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 7.125, + "learning_rate": 3.4469135802469135e-05, + "loss": 1.3219, + "step": 38800 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.4271604938271606e-05, + "loss": 1.3388, + "step": 38900 + }, + { + "epoch": 2.08, + "grad_norm": 7.4375, + "learning_rate": 3.4074074074074077e-05, + "loss": 1.3317, + "step": 39000 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 5.15625, + "learning_rate": 3.387654320987654e-05, + "loss": 1.2546, + "step": 39100 + }, + { + "epoch": 2.0906666666666665, + "grad_norm": 6.71875, + "learning_rate": 3.367901234567901e-05, + "loss": 1.3502, + "step": 39200 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8750174457233408e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-39200/training_args.bin b/checkpoint-39200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-39200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-44800/config.json b/checkpoint-44800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-44800/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + 
"LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-44800/generation_config.json b/checkpoint-44800/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-44800/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-44800/model.safetensors b/checkpoint-44800/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4efc72f8a93107e97331b06ce4bd554c064a134 --- /dev/null +++ b/checkpoint-44800/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e36b2c9258b8cb32ae6b64618e81465db17135dc7d134248dc538b8b86ab354 +size 2471645608 diff --git a/checkpoint-44800/optimizer.pt b/checkpoint-44800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4e68d2684783e941e4eeff9aacfc30ce22c9ede --- /dev/null +++ b/checkpoint-44800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:27eb6eba60f3a4c71172ba156e0f42bf150aded7b7b0fbc8a1230928b764b40f +size 4943382114 diff --git a/checkpoint-44800/rng_state.pth b/checkpoint-44800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..95cad945c935ebaeeca5e461007867b6e155022e --- /dev/null +++ b/checkpoint-44800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adbebcf39742024134d36312cc62baede0eb396a36041797f643dcac19c1b2 +size 14244 diff --git a/checkpoint-44800/scheduler.pt b/checkpoint-44800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5fe08527f8b11f5c23caf36ba3dc01bcab88344d --- /dev/null +++ b/checkpoint-44800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990386d9cc92efce7711a882040daa46259c0138048a02f85f6aebd660864c1c +size 1064 diff --git a/checkpoint-44800/special_tokens_map.json b/checkpoint-44800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-44800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-44800/tokenizer.json b/checkpoint-44800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-44800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-44800/tokenizer_config.json b/checkpoint-44800/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-44800/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": 
"<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": 
"<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": 
"<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-44800/trainer_state.json b/checkpoint-44800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1498cc9092870a6cf5087be12b34b2721ec165de --- /dev/null +++ b/checkpoint-44800/trainer_state.json @@ -0,0 +1,3169 @@ +{ + 
"best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.389333333333333, + "eval_steps": 500, + "global_step": 44800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + 
"epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { 
+ "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + 
}, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 
0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + 
"epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, 
+ { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 
+ }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, 
+ "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 
8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + 
"learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + 
"learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + 
"grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 
1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + 
"epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 
1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 
6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + "learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 
6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { 
+ "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 
28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + "learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + "step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + 
"loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + "step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + "loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + 
"learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + "step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + "learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 
1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + }, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 7.96875, + "learning_rate": 4.454320987654321e-05, + "loss": 1.6492, + "step": 33700 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 10.625, + "learning_rate": 4.434567901234568e-05, + "loss": 1.5981, + "step": 33800 + }, + { + "epoch": 1.808, + "grad_norm": 5.65625, + "learning_rate": 4.414814814814815e-05, + "loss": 1.5606, + "step": 33900 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 4.90625, + "learning_rate": 4.3950617283950617e-05, + "loss": 1.5981, + 
"step": 34000 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 6.0, + "learning_rate": 4.375308641975309e-05, + "loss": 1.5976, + "step": 34100 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.5625, + "learning_rate": 4.355555555555556e-05, + "loss": 1.6783, + "step": 34200 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 6.96875, + "learning_rate": 4.335802469135803e-05, + "loss": 1.6716, + "step": 34300 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 4.6875, + "learning_rate": 4.31604938271605e-05, + "loss": 1.5989, + "step": 34400 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.71875, + "learning_rate": 4.296296296296296e-05, + "loss": 1.6317, + "step": 34500 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 5.78125, + "learning_rate": 4.2765432098765434e-05, + "loss": 1.6327, + "step": 34600 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 5.59375, + "learning_rate": 4.2567901234567905e-05, + "loss": 1.5324, + "step": 34700 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 7.65625, + "learning_rate": 4.237037037037037e-05, + "loss": 1.6141, + "step": 34800 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 9.4375, + "learning_rate": 4.217283950617284e-05, + "loss": 1.6398, + "step": 34900 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 8.875, + "learning_rate": 4.197530864197531e-05, + "loss": 1.5835, + "step": 35000 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 12.0625, + "learning_rate": 4.177777777777778e-05, + "loss": 1.633, + "step": 35100 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 8.375, + "learning_rate": 4.158024691358025e-05, + "loss": 1.6851, + "step": 35200 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 11.5625, + "learning_rate": 4.138271604938272e-05, + "loss": 1.6436, + "step": 35300 + }, + { + "epoch": 1.888, + "grad_norm": 7.78125, + "learning_rate": 4.1185185185185186e-05, + "loss": 1.6268, + "step": 35400 + }, + { + "epoch": 1.8933333333333333, + 
"grad_norm": 11.1875, + "learning_rate": 4.0987654320987657e-05, + "loss": 1.5537, + "step": 35500 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 10.8125, + "learning_rate": 4.079012345679013e-05, + "loss": 1.6954, + "step": 35600 + }, + { + "epoch": 1.904, + "grad_norm": 10.625, + "learning_rate": 4.059259259259259e-05, + "loss": 1.6122, + "step": 35700 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 4.4375, + "learning_rate": 4.039506172839506e-05, + "loss": 1.6308, + "step": 35800 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 5.4375, + "learning_rate": 4.019753086419753e-05, + "loss": 1.6331, + "step": 35900 + }, + { + "epoch": 1.92, + "grad_norm": 5.125, + "learning_rate": 4e-05, + "loss": 1.5898, + "step": 36000 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 13.5625, + "learning_rate": 3.9802469135802474e-05, + "loss": 1.6748, + "step": 36100 + }, + { + "epoch": 1.9306666666666668, + "grad_norm": 5.40625, + "learning_rate": 3.960493827160494e-05, + "loss": 1.6326, + "step": 36200 + }, + { + "epoch": 1.936, + "grad_norm": 8.0, + "learning_rate": 3.940740740740741e-05, + "loss": 1.6027, + "step": 36300 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 12.625, + "learning_rate": 3.920987654320988e-05, + "loss": 1.5298, + "step": 36400 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 5.875, + "learning_rate": 3.901234567901234e-05, + "loss": 1.6354, + "step": 36500 + }, + { + "epoch": 1.952, + "grad_norm": 5.40625, + "learning_rate": 3.8814814814814814e-05, + "loss": 1.6155, + "step": 36600 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 5.15625, + "learning_rate": 3.8617283950617285e-05, + "loss": 1.6524, + "step": 36700 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 8.0625, + "learning_rate": 3.8419753086419755e-05, + "loss": 1.6594, + "step": 36800 + }, + { + "epoch": 1.968, + "grad_norm": 11.0, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.6397, + "step": 36900 + }, + { + "epoch": 
1.9733333333333334, + "grad_norm": 6.96875, + "learning_rate": 3.80246913580247e-05, + "loss": 1.6208, + "step": 37000 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 9.125, + "learning_rate": 3.782716049382716e-05, + "loss": 1.5995, + "step": 37100 + }, + { + "epoch": 1.984, + "grad_norm": 8.8125, + "learning_rate": 3.762962962962963e-05, + "loss": 1.59, + "step": 37200 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 8.1875, + "learning_rate": 3.74320987654321e-05, + "loss": 1.6343, + "step": 37300 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 7.65625, + "learning_rate": 3.7234567901234566e-05, + "loss": 1.6007, + "step": 37400 + }, + { + "epoch": 2.0, + "grad_norm": 6.125, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.6295, + "step": 37500 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 6.65625, + "learning_rate": 3.683950617283951e-05, + "loss": 1.2317, + "step": 37600 + }, + { + "epoch": 2.010666666666667, + "grad_norm": 6.9375, + "learning_rate": 3.664197530864198e-05, + "loss": 1.3769, + "step": 37700 + }, + { + "epoch": 2.016, + "grad_norm": 5.46875, + "learning_rate": 3.644444444444445e-05, + "loss": 1.3206, + "step": 37800 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 7.3125, + "learning_rate": 3.624691358024692e-05, + "loss": 1.2903, + "step": 37900 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.604938271604938e-05, + "loss": 1.3443, + "step": 38000 + }, + { + "epoch": 2.032, + "grad_norm": 6.375, + "learning_rate": 3.5851851851851854e-05, + "loss": 1.291, + "step": 38100 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 7.84375, + "learning_rate": 3.5654320987654325e-05, + "loss": 1.2552, + "step": 38200 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 8.9375, + "learning_rate": 3.545679012345679e-05, + "loss": 1.2883, + "step": 38300 + }, + { + "epoch": 2.048, + "grad_norm": 6.09375, + "learning_rate": 3.525925925925926e-05, + "loss": 1.2755, + "step": 38400 + }, 
+ { + "epoch": 2.0533333333333332, + "grad_norm": 6.0625, + "learning_rate": 3.506172839506173e-05, + "loss": 1.3612, + "step": 38500 + }, + { + "epoch": 2.058666666666667, + "grad_norm": 8.625, + "learning_rate": 3.48641975308642e-05, + "loss": 1.2394, + "step": 38600 + }, + { + "epoch": 2.064, + "grad_norm": 8.25, + "learning_rate": 3.466666666666667e-05, + "loss": 1.3005, + "step": 38700 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 7.125, + "learning_rate": 3.4469135802469135e-05, + "loss": 1.3219, + "step": 38800 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.4271604938271606e-05, + "loss": 1.3388, + "step": 38900 + }, + { + "epoch": 2.08, + "grad_norm": 7.4375, + "learning_rate": 3.4074074074074077e-05, + "loss": 1.3317, + "step": 39000 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 5.15625, + "learning_rate": 3.387654320987654e-05, + "loss": 1.2546, + "step": 39100 + }, + { + "epoch": 2.0906666666666665, + "grad_norm": 6.71875, + "learning_rate": 3.367901234567901e-05, + "loss": 1.3502, + "step": 39200 + }, + { + "epoch": 2.096, + "grad_norm": 7.28125, + "learning_rate": 3.348148148148148e-05, + "loss": 1.2733, + "step": 39300 + }, + { + "epoch": 2.1013333333333333, + "grad_norm": 8.125, + "learning_rate": 3.328395061728395e-05, + "loss": 1.2879, + "step": 39400 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 6.5625, + "learning_rate": 3.308641975308642e-05, + "loss": 1.2175, + "step": 39500 + }, + { + "epoch": 2.112, + "grad_norm": 7.375, + "learning_rate": 3.2888888888888894e-05, + "loss": 1.3628, + "step": 39600 + }, + { + "epoch": 2.1173333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.269135802469136e-05, + "loss": 1.2937, + "step": 39700 + }, + { + "epoch": 2.1226666666666665, + "grad_norm": 5.9375, + "learning_rate": 3.249382716049383e-05, + "loss": 1.2451, + "step": 39800 + }, + { + "epoch": 2.128, + "grad_norm": 9.6875, + "learning_rate": 3.22962962962963e-05, + "loss": 1.3379, + 
"step": 39900 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 7.5, + "learning_rate": 3.209876543209876e-05, + "loss": 1.2934, + "step": 40000 + }, + { + "epoch": 2.1386666666666665, + "grad_norm": 5.84375, + "learning_rate": 3.1901234567901234e-05, + "loss": 1.2618, + "step": 40100 + }, + { + "epoch": 2.144, + "grad_norm": 6.875, + "learning_rate": 3.1703703703703705e-05, + "loss": 1.384, + "step": 40200 + }, + { + "epoch": 2.1493333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.1506172839506175e-05, + "loss": 1.2611, + "step": 40300 + }, + { + "epoch": 2.1546666666666665, + "grad_norm": 8.0, + "learning_rate": 3.1308641975308646e-05, + "loss": 1.2923, + "step": 40400 + }, + { + "epoch": 2.16, + "grad_norm": 7.53125, + "learning_rate": 3.111111111111111e-05, + "loss": 1.2947, + "step": 40500 + }, + { + "epoch": 2.1653333333333333, + "grad_norm": 8.125, + "learning_rate": 3.091358024691358e-05, + "loss": 1.283, + "step": 40600 + }, + { + "epoch": 2.1706666666666665, + "grad_norm": 7.625, + "learning_rate": 3.071604938271605e-05, + "loss": 1.3939, + "step": 40700 + }, + { + "epoch": 2.176, + "grad_norm": 8.0625, + "learning_rate": 3.0518518518518515e-05, + "loss": 1.3395, + "step": 40800 + }, + { + "epoch": 2.1813333333333333, + "grad_norm": 10.3125, + "learning_rate": 3.0320987654320986e-05, + "loss": 1.2382, + "step": 40900 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 5.8125, + "learning_rate": 3.012345679012346e-05, + "loss": 1.346, + "step": 41000 + }, + { + "epoch": 2.192, + "grad_norm": 12.0, + "learning_rate": 2.992592592592593e-05, + "loss": 1.348, + "step": 41100 + }, + { + "epoch": 2.1973333333333334, + "grad_norm": 7.0, + "learning_rate": 2.9728395061728398e-05, + "loss": 1.2885, + "step": 41200 + }, + { + "epoch": 2.2026666666666666, + "grad_norm": 10.9375, + "learning_rate": 2.9530864197530865e-05, + "loss": 1.2577, + "step": 41300 + }, + { + "epoch": 2.208, + "grad_norm": 7.0625, + "learning_rate": 2.9333333333333336e-05, + 
"loss": 1.3698, + "step": 41400 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 5.6875, + "learning_rate": 2.9135802469135803e-05, + "loss": 1.2787, + "step": 41500 + }, + { + "epoch": 2.2186666666666666, + "grad_norm": 11.0625, + "learning_rate": 2.893827160493827e-05, + "loss": 1.299, + "step": 41600 + }, + { + "epoch": 2.224, + "grad_norm": 16.5, + "learning_rate": 2.874074074074074e-05, + "loss": 1.3493, + "step": 41700 + }, + { + "epoch": 2.2293333333333334, + "grad_norm": 7.71875, + "learning_rate": 2.854320987654321e-05, + "loss": 1.232, + "step": 41800 + }, + { + "epoch": 2.2346666666666666, + "grad_norm": 7.3125, + "learning_rate": 2.8345679012345683e-05, + "loss": 1.2965, + "step": 41900 + }, + { + "epoch": 2.24, + "grad_norm": 4.875, + "learning_rate": 2.814814814814815e-05, + "loss": 1.2932, + "step": 42000 + }, + { + "epoch": 2.2453333333333334, + "grad_norm": 8.5625, + "learning_rate": 2.795061728395062e-05, + "loss": 1.2689, + "step": 42100 + }, + { + "epoch": 2.2506666666666666, + "grad_norm": 8.5625, + "learning_rate": 2.7753086419753088e-05, + "loss": 1.3437, + "step": 42200 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 11.375, + "learning_rate": 2.7555555555555555e-05, + "loss": 1.3957, + "step": 42300 + }, + { + "epoch": 2.2613333333333334, + "grad_norm": 7.125, + "learning_rate": 2.7358024691358026e-05, + "loss": 1.2948, + "step": 42400 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 6.90625, + "learning_rate": 2.7160493827160493e-05, + "loss": 1.2896, + "step": 42500 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 8.3125, + "learning_rate": 2.696296296296296e-05, + "loss": 1.2483, + "step": 42600 + }, + { + "epoch": 2.2773333333333334, + "grad_norm": 6.40625, + "learning_rate": 2.6765432098765435e-05, + "loss": 1.3159, + "step": 42700 + }, + { + "epoch": 2.2826666666666666, + "grad_norm": 6.59375, + "learning_rate": 2.6567901234567905e-05, + "loss": 1.2742, + "step": 42800 + }, + { + "epoch": 2.288, + "grad_norm": 
7.21875, + "learning_rate": 2.6370370370370373e-05, + "loss": 1.3353, + "step": 42900 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 5.46875, + "learning_rate": 2.617283950617284e-05, + "loss": 1.3093, + "step": 43000 + }, + { + "epoch": 2.2986666666666666, + "grad_norm": 8.3125, + "learning_rate": 2.597530864197531e-05, + "loss": 1.2342, + "step": 43100 + }, + { + "epoch": 2.304, + "grad_norm": 7.09375, + "learning_rate": 2.5777777777777778e-05, + "loss": 1.3586, + "step": 43200 + }, + { + "epoch": 2.3093333333333335, + "grad_norm": 11.625, + "learning_rate": 2.558024691358025e-05, + "loss": 1.2999, + "step": 43300 + }, + { + "epoch": 2.3146666666666667, + "grad_norm": 7.75, + "learning_rate": 2.5382716049382716e-05, + "loss": 1.2873, + "step": 43400 + }, + { + "epoch": 2.32, + "grad_norm": 8.0, + "learning_rate": 2.5185185185185183e-05, + "loss": 1.3057, + "step": 43500 + }, + { + "epoch": 2.3253333333333335, + "grad_norm": 9.1875, + "learning_rate": 2.4987654320987654e-05, + "loss": 1.3544, + "step": 43600 + }, + { + "epoch": 2.3306666666666667, + "grad_norm": 7.71875, + "learning_rate": 2.4790123456790125e-05, + "loss": 1.333, + "step": 43700 + }, + { + "epoch": 2.336, + "grad_norm": 6.21875, + "learning_rate": 2.4592592592592595e-05, + "loss": 1.2135, + "step": 43800 + }, + { + "epoch": 2.3413333333333335, + "grad_norm": 6.59375, + "learning_rate": 2.4395061728395063e-05, + "loss": 1.3494, + "step": 43900 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 8.1875, + "learning_rate": 2.4197530864197533e-05, + "loss": 1.3179, + "step": 44000 + }, + { + "epoch": 2.352, + "grad_norm": 5.9375, + "learning_rate": 2.4e-05, + "loss": 1.401, + "step": 44100 + }, + { + "epoch": 2.3573333333333335, + "grad_norm": 8.125, + "learning_rate": 2.380246913580247e-05, + "loss": 1.2905, + "step": 44200 + }, + { + "epoch": 2.3626666666666667, + "grad_norm": 6.5625, + "learning_rate": 2.360493827160494e-05, + "loss": 1.3236, + "step": 44300 + }, + { + "epoch": 2.368, + 
"grad_norm": 7.71875, + "learning_rate": 2.340740740740741e-05, + "loss": 1.2924, + "step": 44400 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 10.1875, + "learning_rate": 2.3209876543209877e-05, + "loss": 1.3823, + "step": 44500 + }, + { + "epoch": 2.3786666666666667, + "grad_norm": 9.0625, + "learning_rate": 2.3012345679012347e-05, + "loss": 1.2555, + "step": 44600 + }, + { + "epoch": 2.384, + "grad_norm": 6.53125, + "learning_rate": 2.2814814814814818e-05, + "loss": 1.319, + "step": 44700 + }, + { + "epoch": 2.389333333333333, + "grad_norm": 6.28125, + "learning_rate": 2.2617283950617285e-05, + "loss": 1.3722, + "step": 44800 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1428770808266752e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-44800/training_args.bin b/checkpoint-44800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-44800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-50400/config.json b/checkpoint-50400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-50400/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + 
"hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-50400/generation_config.json b/checkpoint-50400/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-50400/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-50400/model.safetensors b/checkpoint-50400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2fc27cb36bfa075caaa66cdd281e8dd120fd9c4 --- /dev/null +++ b/checkpoint-50400/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26fa1e92739d1bcdbf6aa5a2a9437eaf233f8f0d7fbb0fde30c3665ab5e6f20d +size 2471645608 diff --git a/checkpoint-50400/optimizer.pt b/checkpoint-50400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68c939705efd14b75f6fc37218170379ba00e74c --- /dev/null +++ b/checkpoint-50400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9e316ea25e611ed00ba956ff3b70698f50b32e1428c9562cacbb4524784afd +size 4943382114 diff --git a/checkpoint-50400/rng_state.pth b/checkpoint-50400/rng_state.pth new file mode 
100644 index 0000000000000000000000000000000000000000..95cad945c935ebaeeca5e461007867b6e155022e --- /dev/null +++ b/checkpoint-50400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adbebcf39742024134d36312cc62baede0eb396a36041797f643dcac19c1b2 +size 14244 diff --git a/checkpoint-50400/scheduler.pt b/checkpoint-50400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..611b6a764923588ae53c8f51ff0e9360c65a34c9 --- /dev/null +++ b/checkpoint-50400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71072ce046c61c3c042d77ac3986584284b48e57041415f0038fbc34c8eed747 +size 1064 diff --git a/checkpoint-50400/special_tokens_map.json b/checkpoint-50400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-50400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-50400/tokenizer.json b/checkpoint-50400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-50400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-50400/tokenizer_config.json b/checkpoint-50400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-50400/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { 
+ "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": 
"<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": 
"<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": 
"<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": 
"<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-50400/trainer_state.json b/checkpoint-50400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..327290d1b1faf7781dc8374bd1c85468d8cb36b5 --- /dev/null +++ b/checkpoint-50400/trainer_state.json @@ -0,0 +1,3561 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6879999999999997, + "eval_steps": 500, + "global_step": 50400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + 
"is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 
2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + 
"learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 
7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + 
"learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + 
"learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 
6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + 
"grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + 
"epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + 
"step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + 
"loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + 
"loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 
7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + 
"learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 
13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 
1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + 
}, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + "learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 
1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { + "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 
5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + 
"learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + "step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + "loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + 
"grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + "step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + "loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + "learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + 
"step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + "learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 
4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + }, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 7.96875, + "learning_rate": 4.454320987654321e-05, + "loss": 1.6492, + "step": 33700 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 10.625, + "learning_rate": 4.434567901234568e-05, + "loss": 1.5981, + "step": 33800 + }, + { + "epoch": 1.808, + "grad_norm": 5.65625, + "learning_rate": 4.414814814814815e-05, + "loss": 1.5606, + "step": 33900 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 4.90625, + "learning_rate": 4.3950617283950617e-05, + "loss": 1.5981, + "step": 34000 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 6.0, + "learning_rate": 4.375308641975309e-05, + "loss": 1.5976, + "step": 34100 + }, + { + "epoch": 1.8239999999999998, + 
"grad_norm": 5.5625, + "learning_rate": 4.355555555555556e-05, + "loss": 1.6783, + "step": 34200 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 6.96875, + "learning_rate": 4.335802469135803e-05, + "loss": 1.6716, + "step": 34300 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 4.6875, + "learning_rate": 4.31604938271605e-05, + "loss": 1.5989, + "step": 34400 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.71875, + "learning_rate": 4.296296296296296e-05, + "loss": 1.6317, + "step": 34500 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 5.78125, + "learning_rate": 4.2765432098765434e-05, + "loss": 1.6327, + "step": 34600 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 5.59375, + "learning_rate": 4.2567901234567905e-05, + "loss": 1.5324, + "step": 34700 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 7.65625, + "learning_rate": 4.237037037037037e-05, + "loss": 1.6141, + "step": 34800 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 9.4375, + "learning_rate": 4.217283950617284e-05, + "loss": 1.6398, + "step": 34900 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 8.875, + "learning_rate": 4.197530864197531e-05, + "loss": 1.5835, + "step": 35000 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 12.0625, + "learning_rate": 4.177777777777778e-05, + "loss": 1.633, + "step": 35100 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 8.375, + "learning_rate": 4.158024691358025e-05, + "loss": 1.6851, + "step": 35200 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 11.5625, + "learning_rate": 4.138271604938272e-05, + "loss": 1.6436, + "step": 35300 + }, + { + "epoch": 1.888, + "grad_norm": 7.78125, + "learning_rate": 4.1185185185185186e-05, + "loss": 1.6268, + "step": 35400 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 11.1875, + "learning_rate": 4.0987654320987657e-05, + "loss": 1.5537, + "step": 35500 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 10.8125, + "learning_rate": 
4.079012345679013e-05, + "loss": 1.6954, + "step": 35600 + }, + { + "epoch": 1.904, + "grad_norm": 10.625, + "learning_rate": 4.059259259259259e-05, + "loss": 1.6122, + "step": 35700 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 4.4375, + "learning_rate": 4.039506172839506e-05, + "loss": 1.6308, + "step": 35800 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 5.4375, + "learning_rate": 4.019753086419753e-05, + "loss": 1.6331, + "step": 35900 + }, + { + "epoch": 1.92, + "grad_norm": 5.125, + "learning_rate": 4e-05, + "loss": 1.5898, + "step": 36000 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 13.5625, + "learning_rate": 3.9802469135802474e-05, + "loss": 1.6748, + "step": 36100 + }, + { + "epoch": 1.9306666666666668, + "grad_norm": 5.40625, + "learning_rate": 3.960493827160494e-05, + "loss": 1.6326, + "step": 36200 + }, + { + "epoch": 1.936, + "grad_norm": 8.0, + "learning_rate": 3.940740740740741e-05, + "loss": 1.6027, + "step": 36300 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 12.625, + "learning_rate": 3.920987654320988e-05, + "loss": 1.5298, + "step": 36400 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 5.875, + "learning_rate": 3.901234567901234e-05, + "loss": 1.6354, + "step": 36500 + }, + { + "epoch": 1.952, + "grad_norm": 5.40625, + "learning_rate": 3.8814814814814814e-05, + "loss": 1.6155, + "step": 36600 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 5.15625, + "learning_rate": 3.8617283950617285e-05, + "loss": 1.6524, + "step": 36700 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 8.0625, + "learning_rate": 3.8419753086419755e-05, + "loss": 1.6594, + "step": 36800 + }, + { + "epoch": 1.968, + "grad_norm": 11.0, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.6397, + "step": 36900 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 6.96875, + "learning_rate": 3.80246913580247e-05, + "loss": 1.6208, + "step": 37000 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 9.125, + 
"learning_rate": 3.782716049382716e-05, + "loss": 1.5995, + "step": 37100 + }, + { + "epoch": 1.984, + "grad_norm": 8.8125, + "learning_rate": 3.762962962962963e-05, + "loss": 1.59, + "step": 37200 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 8.1875, + "learning_rate": 3.74320987654321e-05, + "loss": 1.6343, + "step": 37300 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 7.65625, + "learning_rate": 3.7234567901234566e-05, + "loss": 1.6007, + "step": 37400 + }, + { + "epoch": 2.0, + "grad_norm": 6.125, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.6295, + "step": 37500 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 6.65625, + "learning_rate": 3.683950617283951e-05, + "loss": 1.2317, + "step": 37600 + }, + { + "epoch": 2.010666666666667, + "grad_norm": 6.9375, + "learning_rate": 3.664197530864198e-05, + "loss": 1.3769, + "step": 37700 + }, + { + "epoch": 2.016, + "grad_norm": 5.46875, + "learning_rate": 3.644444444444445e-05, + "loss": 1.3206, + "step": 37800 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 7.3125, + "learning_rate": 3.624691358024692e-05, + "loss": 1.2903, + "step": 37900 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.604938271604938e-05, + "loss": 1.3443, + "step": 38000 + }, + { + "epoch": 2.032, + "grad_norm": 6.375, + "learning_rate": 3.5851851851851854e-05, + "loss": 1.291, + "step": 38100 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 7.84375, + "learning_rate": 3.5654320987654325e-05, + "loss": 1.2552, + "step": 38200 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 8.9375, + "learning_rate": 3.545679012345679e-05, + "loss": 1.2883, + "step": 38300 + }, + { + "epoch": 2.048, + "grad_norm": 6.09375, + "learning_rate": 3.525925925925926e-05, + "loss": 1.2755, + "step": 38400 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 6.0625, + "learning_rate": 3.506172839506173e-05, + "loss": 1.3612, + "step": 38500 + }, + { + "epoch": 2.058666666666667, + 
"grad_norm": 8.625, + "learning_rate": 3.48641975308642e-05, + "loss": 1.2394, + "step": 38600 + }, + { + "epoch": 2.064, + "grad_norm": 8.25, + "learning_rate": 3.466666666666667e-05, + "loss": 1.3005, + "step": 38700 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 7.125, + "learning_rate": 3.4469135802469135e-05, + "loss": 1.3219, + "step": 38800 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.4271604938271606e-05, + "loss": 1.3388, + "step": 38900 + }, + { + "epoch": 2.08, + "grad_norm": 7.4375, + "learning_rate": 3.4074074074074077e-05, + "loss": 1.3317, + "step": 39000 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 5.15625, + "learning_rate": 3.387654320987654e-05, + "loss": 1.2546, + "step": 39100 + }, + { + "epoch": 2.0906666666666665, + "grad_norm": 6.71875, + "learning_rate": 3.367901234567901e-05, + "loss": 1.3502, + "step": 39200 + }, + { + "epoch": 2.096, + "grad_norm": 7.28125, + "learning_rate": 3.348148148148148e-05, + "loss": 1.2733, + "step": 39300 + }, + { + "epoch": 2.1013333333333333, + "grad_norm": 8.125, + "learning_rate": 3.328395061728395e-05, + "loss": 1.2879, + "step": 39400 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 6.5625, + "learning_rate": 3.308641975308642e-05, + "loss": 1.2175, + "step": 39500 + }, + { + "epoch": 2.112, + "grad_norm": 7.375, + "learning_rate": 3.2888888888888894e-05, + "loss": 1.3628, + "step": 39600 + }, + { + "epoch": 2.1173333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.269135802469136e-05, + "loss": 1.2937, + "step": 39700 + }, + { + "epoch": 2.1226666666666665, + "grad_norm": 5.9375, + "learning_rate": 3.249382716049383e-05, + "loss": 1.2451, + "step": 39800 + }, + { + "epoch": 2.128, + "grad_norm": 9.6875, + "learning_rate": 3.22962962962963e-05, + "loss": 1.3379, + "step": 39900 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 7.5, + "learning_rate": 3.209876543209876e-05, + "loss": 1.2934, + "step": 40000 + }, + { + "epoch": 
2.1386666666666665, + "grad_norm": 5.84375, + "learning_rate": 3.1901234567901234e-05, + "loss": 1.2618, + "step": 40100 + }, + { + "epoch": 2.144, + "grad_norm": 6.875, + "learning_rate": 3.1703703703703705e-05, + "loss": 1.384, + "step": 40200 + }, + { + "epoch": 2.1493333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.1506172839506175e-05, + "loss": 1.2611, + "step": 40300 + }, + { + "epoch": 2.1546666666666665, + "grad_norm": 8.0, + "learning_rate": 3.1308641975308646e-05, + "loss": 1.2923, + "step": 40400 + }, + { + "epoch": 2.16, + "grad_norm": 7.53125, + "learning_rate": 3.111111111111111e-05, + "loss": 1.2947, + "step": 40500 + }, + { + "epoch": 2.1653333333333333, + "grad_norm": 8.125, + "learning_rate": 3.091358024691358e-05, + "loss": 1.283, + "step": 40600 + }, + { + "epoch": 2.1706666666666665, + "grad_norm": 7.625, + "learning_rate": 3.071604938271605e-05, + "loss": 1.3939, + "step": 40700 + }, + { + "epoch": 2.176, + "grad_norm": 8.0625, + "learning_rate": 3.0518518518518515e-05, + "loss": 1.3395, + "step": 40800 + }, + { + "epoch": 2.1813333333333333, + "grad_norm": 10.3125, + "learning_rate": 3.0320987654320986e-05, + "loss": 1.2382, + "step": 40900 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 5.8125, + "learning_rate": 3.012345679012346e-05, + "loss": 1.346, + "step": 41000 + }, + { + "epoch": 2.192, + "grad_norm": 12.0, + "learning_rate": 2.992592592592593e-05, + "loss": 1.348, + "step": 41100 + }, + { + "epoch": 2.1973333333333334, + "grad_norm": 7.0, + "learning_rate": 2.9728395061728398e-05, + "loss": 1.2885, + "step": 41200 + }, + { + "epoch": 2.2026666666666666, + "grad_norm": 10.9375, + "learning_rate": 2.9530864197530865e-05, + "loss": 1.2577, + "step": 41300 + }, + { + "epoch": 2.208, + "grad_norm": 7.0625, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.3698, + "step": 41400 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 5.6875, + "learning_rate": 2.9135802469135803e-05, + "loss": 1.2787, + "step": 41500 
+ }, + { + "epoch": 2.2186666666666666, + "grad_norm": 11.0625, + "learning_rate": 2.893827160493827e-05, + "loss": 1.299, + "step": 41600 + }, + { + "epoch": 2.224, + "grad_norm": 16.5, + "learning_rate": 2.874074074074074e-05, + "loss": 1.3493, + "step": 41700 + }, + { + "epoch": 2.2293333333333334, + "grad_norm": 7.71875, + "learning_rate": 2.854320987654321e-05, + "loss": 1.232, + "step": 41800 + }, + { + "epoch": 2.2346666666666666, + "grad_norm": 7.3125, + "learning_rate": 2.8345679012345683e-05, + "loss": 1.2965, + "step": 41900 + }, + { + "epoch": 2.24, + "grad_norm": 4.875, + "learning_rate": 2.814814814814815e-05, + "loss": 1.2932, + "step": 42000 + }, + { + "epoch": 2.2453333333333334, + "grad_norm": 8.5625, + "learning_rate": 2.795061728395062e-05, + "loss": 1.2689, + "step": 42100 + }, + { + "epoch": 2.2506666666666666, + "grad_norm": 8.5625, + "learning_rate": 2.7753086419753088e-05, + "loss": 1.3437, + "step": 42200 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 11.375, + "learning_rate": 2.7555555555555555e-05, + "loss": 1.3957, + "step": 42300 + }, + { + "epoch": 2.2613333333333334, + "grad_norm": 7.125, + "learning_rate": 2.7358024691358026e-05, + "loss": 1.2948, + "step": 42400 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 6.90625, + "learning_rate": 2.7160493827160493e-05, + "loss": 1.2896, + "step": 42500 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 8.3125, + "learning_rate": 2.696296296296296e-05, + "loss": 1.2483, + "step": 42600 + }, + { + "epoch": 2.2773333333333334, + "grad_norm": 6.40625, + "learning_rate": 2.6765432098765435e-05, + "loss": 1.3159, + "step": 42700 + }, + { + "epoch": 2.2826666666666666, + "grad_norm": 6.59375, + "learning_rate": 2.6567901234567905e-05, + "loss": 1.2742, + "step": 42800 + }, + { + "epoch": 2.288, + "grad_norm": 7.21875, + "learning_rate": 2.6370370370370373e-05, + "loss": 1.3353, + "step": 42900 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 5.46875, + "learning_rate": 
2.617283950617284e-05, + "loss": 1.3093, + "step": 43000 + }, + { + "epoch": 2.2986666666666666, + "grad_norm": 8.3125, + "learning_rate": 2.597530864197531e-05, + "loss": 1.2342, + "step": 43100 + }, + { + "epoch": 2.304, + "grad_norm": 7.09375, + "learning_rate": 2.5777777777777778e-05, + "loss": 1.3586, + "step": 43200 + }, + { + "epoch": 2.3093333333333335, + "grad_norm": 11.625, + "learning_rate": 2.558024691358025e-05, + "loss": 1.2999, + "step": 43300 + }, + { + "epoch": 2.3146666666666667, + "grad_norm": 7.75, + "learning_rate": 2.5382716049382716e-05, + "loss": 1.2873, + "step": 43400 + }, + { + "epoch": 2.32, + "grad_norm": 8.0, + "learning_rate": 2.5185185185185183e-05, + "loss": 1.3057, + "step": 43500 + }, + { + "epoch": 2.3253333333333335, + "grad_norm": 9.1875, + "learning_rate": 2.4987654320987654e-05, + "loss": 1.3544, + "step": 43600 + }, + { + "epoch": 2.3306666666666667, + "grad_norm": 7.71875, + "learning_rate": 2.4790123456790125e-05, + "loss": 1.333, + "step": 43700 + }, + { + "epoch": 2.336, + "grad_norm": 6.21875, + "learning_rate": 2.4592592592592595e-05, + "loss": 1.2135, + "step": 43800 + }, + { + "epoch": 2.3413333333333335, + "grad_norm": 6.59375, + "learning_rate": 2.4395061728395063e-05, + "loss": 1.3494, + "step": 43900 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 8.1875, + "learning_rate": 2.4197530864197533e-05, + "loss": 1.3179, + "step": 44000 + }, + { + "epoch": 2.352, + "grad_norm": 5.9375, + "learning_rate": 2.4e-05, + "loss": 1.401, + "step": 44100 + }, + { + "epoch": 2.3573333333333335, + "grad_norm": 8.125, + "learning_rate": 2.380246913580247e-05, + "loss": 1.2905, + "step": 44200 + }, + { + "epoch": 2.3626666666666667, + "grad_norm": 6.5625, + "learning_rate": 2.360493827160494e-05, + "loss": 1.3236, + "step": 44300 + }, + { + "epoch": 2.368, + "grad_norm": 7.71875, + "learning_rate": 2.340740740740741e-05, + "loss": 1.2924, + "step": 44400 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 10.1875, + 
"learning_rate": 2.3209876543209877e-05, + "loss": 1.3823, + "step": 44500 + }, + { + "epoch": 2.3786666666666667, + "grad_norm": 9.0625, + "learning_rate": 2.3012345679012347e-05, + "loss": 1.2555, + "step": 44600 + }, + { + "epoch": 2.384, + "grad_norm": 6.53125, + "learning_rate": 2.2814814814814818e-05, + "loss": 1.319, + "step": 44700 + }, + { + "epoch": 2.389333333333333, + "grad_norm": 6.28125, + "learning_rate": 2.2617283950617285e-05, + "loss": 1.3722, + "step": 44800 + }, + { + "epoch": 2.3946666666666667, + "grad_norm": 8.75, + "learning_rate": 2.2419753086419753e-05, + "loss": 1.2535, + "step": 44900 + }, + { + "epoch": 2.4, + "grad_norm": 5.75, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.3091, + "step": 45000 + }, + { + "epoch": 2.405333333333333, + "grad_norm": 7.09375, + "learning_rate": 2.2024691358024694e-05, + "loss": 1.3417, + "step": 45100 + }, + { + "epoch": 2.4106666666666667, + "grad_norm": 6.21875, + "learning_rate": 2.182716049382716e-05, + "loss": 1.318, + "step": 45200 + }, + { + "epoch": 2.416, + "grad_norm": 6.09375, + "learning_rate": 2.162962962962963e-05, + "loss": 1.2971, + "step": 45300 + }, + { + "epoch": 2.421333333333333, + "grad_norm": 6.875, + "learning_rate": 2.14320987654321e-05, + "loss": 1.3866, + "step": 45400 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 5.9375, + "learning_rate": 2.123456790123457e-05, + "loss": 1.2945, + "step": 45500 + }, + { + "epoch": 2.432, + "grad_norm": 7.4375, + "learning_rate": 2.1037037037037037e-05, + "loss": 1.2541, + "step": 45600 + }, + { + "epoch": 2.437333333333333, + "grad_norm": 5.71875, + "learning_rate": 2.0839506172839508e-05, + "loss": 1.3282, + "step": 45700 + }, + { + "epoch": 2.4426666666666668, + "grad_norm": 12.1875, + "learning_rate": 2.0641975308641975e-05, + "loss": 1.3743, + "step": 45800 + }, + { + "epoch": 2.448, + "grad_norm": 5.0, + "learning_rate": 2.0444444444444446e-05, + "loss": 1.2689, + "step": 45900 + }, + { + "epoch": 2.453333333333333, + 
"grad_norm": 8.5625, + "learning_rate": 2.0246913580246917e-05, + "loss": 1.347, + "step": 46000 + }, + { + "epoch": 2.458666666666667, + "grad_norm": 7.25, + "learning_rate": 2.0049382716049384e-05, + "loss": 1.3629, + "step": 46100 + }, + { + "epoch": 2.464, + "grad_norm": 12.5, + "learning_rate": 1.985185185185185e-05, + "loss": 1.2604, + "step": 46200 + }, + { + "epoch": 2.469333333333333, + "grad_norm": 7.03125, + "learning_rate": 1.9654320987654322e-05, + "loss": 1.3428, + "step": 46300 + }, + { + "epoch": 2.474666666666667, + "grad_norm": 7.8125, + "learning_rate": 1.9456790123456793e-05, + "loss": 1.2956, + "step": 46400 + }, + { + "epoch": 2.48, + "grad_norm": 7.21875, + "learning_rate": 1.925925925925926e-05, + "loss": 1.2986, + "step": 46500 + }, + { + "epoch": 2.485333333333333, + "grad_norm": 8.3125, + "learning_rate": 1.9061728395061727e-05, + "loss": 1.2794, + "step": 46600 + }, + { + "epoch": 2.490666666666667, + "grad_norm": 8.125, + "learning_rate": 1.8864197530864198e-05, + "loss": 1.3091, + "step": 46700 + }, + { + "epoch": 2.496, + "grad_norm": 6.59375, + "learning_rate": 1.866666666666667e-05, + "loss": 1.2405, + "step": 46800 + }, + { + "epoch": 2.501333333333333, + "grad_norm": 9.25, + "learning_rate": 1.8469135802469136e-05, + "loss": 1.2841, + "step": 46900 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 5.78125, + "learning_rate": 1.8271604938271607e-05, + "loss": 1.3305, + "step": 47000 + }, + { + "epoch": 2.512, + "grad_norm": 8.375, + "learning_rate": 1.8074074074074074e-05, + "loss": 1.3659, + "step": 47100 + }, + { + "epoch": 2.517333333333333, + "grad_norm": 7.0625, + "learning_rate": 1.7876543209876545e-05, + "loss": 1.2434, + "step": 47200 + }, + { + "epoch": 2.522666666666667, + "grad_norm": 9.8125, + "learning_rate": 1.7679012345679012e-05, + "loss": 1.2765, + "step": 47300 + }, + { + "epoch": 2.528, + "grad_norm": 7.71875, + "learning_rate": 1.7481481481481483e-05, + "loss": 1.3136, + "step": 47400 + }, + { + "epoch": 
2.533333333333333, + "grad_norm": 7.71875, + "learning_rate": 1.728395061728395e-05, + "loss": 1.3622, + "step": 47500 + }, + { + "epoch": 2.538666666666667, + "grad_norm": 11.0, + "learning_rate": 1.708641975308642e-05, + "loss": 1.2877, + "step": 47600 + }, + { + "epoch": 2.544, + "grad_norm": 6.25, + "learning_rate": 1.688888888888889e-05, + "loss": 1.3239, + "step": 47700 + }, + { + "epoch": 2.5493333333333332, + "grad_norm": 6.375, + "learning_rate": 1.669135802469136e-05, + "loss": 1.3512, + "step": 47800 + }, + { + "epoch": 2.554666666666667, + "grad_norm": 6.875, + "learning_rate": 1.6493827160493826e-05, + "loss": 1.3079, + "step": 47900 + }, + { + "epoch": 2.56, + "grad_norm": 7.90625, + "learning_rate": 1.62962962962963e-05, + "loss": 1.3031, + "step": 48000 + }, + { + "epoch": 2.5653333333333332, + "grad_norm": 8.0, + "learning_rate": 1.6098765432098767e-05, + "loss": 1.3062, + "step": 48100 + }, + { + "epoch": 2.570666666666667, + "grad_norm": 7.625, + "learning_rate": 1.5901234567901235e-05, + "loss": 1.3348, + "step": 48200 + }, + { + "epoch": 2.576, + "grad_norm": 9.6875, + "learning_rate": 1.5703703703703705e-05, + "loss": 1.3392, + "step": 48300 + }, + { + "epoch": 2.5813333333333333, + "grad_norm": 8.75, + "learning_rate": 1.5506172839506173e-05, + "loss": 1.3153, + "step": 48400 + }, + { + "epoch": 2.586666666666667, + "grad_norm": 6.75, + "learning_rate": 1.5308641975308643e-05, + "loss": 1.3348, + "step": 48500 + }, + { + "epoch": 2.592, + "grad_norm": 9.0, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.3008, + "step": 48600 + }, + { + "epoch": 2.5973333333333333, + "grad_norm": 8.1875, + "learning_rate": 1.4913580246913581e-05, + "loss": 1.3492, + "step": 48700 + }, + { + "epoch": 2.602666666666667, + "grad_norm": 6.5, + "learning_rate": 1.4716049382716049e-05, + "loss": 1.2897, + "step": 48800 + }, + { + "epoch": 2.608, + "grad_norm": 6.03125, + "learning_rate": 1.4518518518518521e-05, + "loss": 1.2443, + "step": 48900 + }, + { + 
"epoch": 2.6133333333333333, + "grad_norm": 8.125, + "learning_rate": 1.4320987654320988e-05, + "loss": 1.3395, + "step": 49000 + }, + { + "epoch": 2.618666666666667, + "grad_norm": 7.90625, + "learning_rate": 1.4123456790123457e-05, + "loss": 1.3716, + "step": 49100 + }, + { + "epoch": 2.624, + "grad_norm": 6.28125, + "learning_rate": 1.3925925925925926e-05, + "loss": 1.3066, + "step": 49200 + }, + { + "epoch": 2.6293333333333333, + "grad_norm": 5.34375, + "learning_rate": 1.3728395061728397e-05, + "loss": 1.2932, + "step": 49300 + }, + { + "epoch": 2.634666666666667, + "grad_norm": 7.21875, + "learning_rate": 1.3530864197530866e-05, + "loss": 1.2657, + "step": 49400 + }, + { + "epoch": 2.64, + "grad_norm": 12.6875, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.2547, + "step": 49500 + }, + { + "epoch": 2.6453333333333333, + "grad_norm": 7.53125, + "learning_rate": 1.3135802469135802e-05, + "loss": 1.2846, + "step": 49600 + }, + { + "epoch": 2.6506666666666665, + "grad_norm": 10.3125, + "learning_rate": 1.2938271604938273e-05, + "loss": 1.3046, + "step": 49700 + }, + { + "epoch": 2.656, + "grad_norm": 8.5625, + "learning_rate": 1.2740740740740742e-05, + "loss": 1.3353, + "step": 49800 + }, + { + "epoch": 2.6613333333333333, + "grad_norm": 10.6875, + "learning_rate": 1.2543209876543211e-05, + "loss": 1.3146, + "step": 49900 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 10.125, + "learning_rate": 1.2345679012345678e-05, + "loss": 1.279, + "step": 50000 + }, + { + "epoch": 2.672, + "grad_norm": 11.125, + "learning_rate": 1.2148148148148149e-05, + "loss": 1.2854, + "step": 50100 + }, + { + "epoch": 2.6773333333333333, + "grad_norm": 6.375, + "learning_rate": 1.1950617283950618e-05, + "loss": 1.3665, + "step": 50200 + }, + { + "epoch": 2.6826666666666665, + "grad_norm": 7.90625, + "learning_rate": 1.1753086419753087e-05, + "loss": 1.2908, + "step": 50300 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 6.03125, + "learning_rate": 
1.1555555555555556e-05, + "loss": 1.33, + "step": 50400 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4107367159300096e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50400/training_args.bin b/checkpoint-50400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-50400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-5600/config.json b/checkpoint-5600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-5600/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", 
+ "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-5600/generation_config.json b/checkpoint-5600/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-5600/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-5600/model.safetensors b/checkpoint-5600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cf86d797ff1e916e631572a1e8ca995c50b013f0 --- /dev/null +++ b/checkpoint-5600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50aef4e321dfc7bdfb09a25c3c74f54a5b5509c2cae858c424ca5c4ca85f9933 +size 2471645608 diff --git a/checkpoint-5600/optimizer.pt b/checkpoint-5600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6973fb441f2acebdf3eb6a24380b6857ac2c09a1 --- /dev/null +++ b/checkpoint-5600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e8a66ab8396087001388dea10253a36119e789683b2fc48db64632749a36494 +size 4943382114 diff --git a/checkpoint-5600/rng_state.pth b/checkpoint-5600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b90a80289e8b4a1364c141ec2bd718026fcf6df6 --- /dev/null +++ b/checkpoint-5600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b274076e93b274478bebe3dfb954b3391ae67cd9a9156c93fe95545bd14ca5c +size 14244 diff --git a/checkpoint-5600/scheduler.pt b/checkpoint-5600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e0f262021375af6cf4ff1a76c3befe243c73dba --- /dev/null +++ b/checkpoint-5600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:839df4df23c197e217a94a4b732250f48483d4dd1bf5b5466bbdfd12796cdd6c +size 1064 diff --git a/checkpoint-5600/special_tokens_map.json b/checkpoint-5600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-5600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-5600/tokenizer.json b/checkpoint-5600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-5600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-5600/tokenizer_config.json b/checkpoint-5600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-5600/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": 
"<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": 
{ + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true 
+ }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": 
"<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": 
"<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": 
"<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-5600/trainer_state.json b/checkpoint-5600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..61078633b4ae25f900443f51cd8ddc1f9f464360 --- /dev/null +++ b/checkpoint-5600/trainer_state.json @@ -0,0 +1,425 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2986666666666667, + "eval_steps": 500, + "global_step": 5600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + 
"learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + 
"grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + 
"grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + 
"grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.678596351033344e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5600/training_args.bin b/checkpoint-5600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-5600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-56000/config.json b/checkpoint-56000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-56000/config.json @@ -0,0 +1,36 
@@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-56000/generation_config.json b/checkpoint-56000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-56000/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-56000/model.safetensors b/checkpoint-56000/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b81e873f8f71acfda3247ac23b3b5b8dc777837 --- /dev/null +++ b/checkpoint-56000/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067d0d7e89d0cd509ac143609cc7c1f360381e7eda3e900bb4ff410e29e5b318 +size 2471645608 diff --git a/checkpoint-56000/optimizer.pt b/checkpoint-56000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..35e55e660311b5b17a89a9e7d968a63f37d4ce1c --- /dev/null +++ b/checkpoint-56000/optimizer.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:9358dc9789dffed7e8a202ed5fcde83ab6072ed91b8561ecb357a6a9778a57f7 +size 4943382114 diff --git a/checkpoint-56000/rng_state.pth b/checkpoint-56000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..95cad945c935ebaeeca5e461007867b6e155022e --- /dev/null +++ b/checkpoint-56000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adbebcf39742024134d36312cc62baede0eb396a36041797f643dcac19c1b2 +size 14244 diff --git a/checkpoint-56000/scheduler.pt b/checkpoint-56000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d6cc0449e6fdb16a604fc3c67b85a5a8445ec7b --- /dev/null +++ b/checkpoint-56000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cacefd2240c5556a9422f9373c916bc8311b77f1bedfb7933e42c39a5620caf3 +size 1064 diff --git a/checkpoint-56000/special_tokens_map.json b/checkpoint-56000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-56000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-56000/tokenizer.json b/checkpoint-56000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-56000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-56000/tokenizer_config.json b/checkpoint-56000/tokenizer_config.json new 
file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-56000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": 
"<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + 
"content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": 
"<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": 
"<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-56000/trainer_state.json b/checkpoint-56000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dfc35c12bd0c644bb59a6b3364151e358875d94e --- /dev/null +++ b/checkpoint-56000/trainer_state.json @@ -0,0 +1,3953 @@ +{ + 
"best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.986666666666667, + "eval_steps": 500, + "global_step": 56000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + "learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + 
"epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + "grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { 
+ "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + "grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + 
}, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + "grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 
0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + "grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + 
"epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, 
+ { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + "epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 
+ }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 + }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, 
+ "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 
8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + 
"learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + "learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + 
"learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + 
"grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 
1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + 
"epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + "step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 
1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 
6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + "learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 
6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { 
+ "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 
28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + "learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + "step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + 
"loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + "step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + "loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + 
"learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + "step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + "learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 
1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + }, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 7.96875, + "learning_rate": 4.454320987654321e-05, + "loss": 1.6492, + "step": 33700 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 10.625, + "learning_rate": 4.434567901234568e-05, + "loss": 1.5981, + "step": 33800 + }, + { + "epoch": 1.808, + "grad_norm": 5.65625, + "learning_rate": 4.414814814814815e-05, + "loss": 1.5606, + "step": 33900 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 4.90625, + "learning_rate": 4.3950617283950617e-05, + "loss": 1.5981, + 
"step": 34000 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 6.0, + "learning_rate": 4.375308641975309e-05, + "loss": 1.5976, + "step": 34100 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.5625, + "learning_rate": 4.355555555555556e-05, + "loss": 1.6783, + "step": 34200 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 6.96875, + "learning_rate": 4.335802469135803e-05, + "loss": 1.6716, + "step": 34300 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 4.6875, + "learning_rate": 4.31604938271605e-05, + "loss": 1.5989, + "step": 34400 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.71875, + "learning_rate": 4.296296296296296e-05, + "loss": 1.6317, + "step": 34500 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 5.78125, + "learning_rate": 4.2765432098765434e-05, + "loss": 1.6327, + "step": 34600 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 5.59375, + "learning_rate": 4.2567901234567905e-05, + "loss": 1.5324, + "step": 34700 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 7.65625, + "learning_rate": 4.237037037037037e-05, + "loss": 1.6141, + "step": 34800 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 9.4375, + "learning_rate": 4.217283950617284e-05, + "loss": 1.6398, + "step": 34900 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 8.875, + "learning_rate": 4.197530864197531e-05, + "loss": 1.5835, + "step": 35000 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 12.0625, + "learning_rate": 4.177777777777778e-05, + "loss": 1.633, + "step": 35100 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 8.375, + "learning_rate": 4.158024691358025e-05, + "loss": 1.6851, + "step": 35200 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 11.5625, + "learning_rate": 4.138271604938272e-05, + "loss": 1.6436, + "step": 35300 + }, + { + "epoch": 1.888, + "grad_norm": 7.78125, + "learning_rate": 4.1185185185185186e-05, + "loss": 1.6268, + "step": 35400 + }, + { + "epoch": 1.8933333333333333, + 
"grad_norm": 11.1875, + "learning_rate": 4.0987654320987657e-05, + "loss": 1.5537, + "step": 35500 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 10.8125, + "learning_rate": 4.079012345679013e-05, + "loss": 1.6954, + "step": 35600 + }, + { + "epoch": 1.904, + "grad_norm": 10.625, + "learning_rate": 4.059259259259259e-05, + "loss": 1.6122, + "step": 35700 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 4.4375, + "learning_rate": 4.039506172839506e-05, + "loss": 1.6308, + "step": 35800 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 5.4375, + "learning_rate": 4.019753086419753e-05, + "loss": 1.6331, + "step": 35900 + }, + { + "epoch": 1.92, + "grad_norm": 5.125, + "learning_rate": 4e-05, + "loss": 1.5898, + "step": 36000 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 13.5625, + "learning_rate": 3.9802469135802474e-05, + "loss": 1.6748, + "step": 36100 + }, + { + "epoch": 1.9306666666666668, + "grad_norm": 5.40625, + "learning_rate": 3.960493827160494e-05, + "loss": 1.6326, + "step": 36200 + }, + { + "epoch": 1.936, + "grad_norm": 8.0, + "learning_rate": 3.940740740740741e-05, + "loss": 1.6027, + "step": 36300 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 12.625, + "learning_rate": 3.920987654320988e-05, + "loss": 1.5298, + "step": 36400 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 5.875, + "learning_rate": 3.901234567901234e-05, + "loss": 1.6354, + "step": 36500 + }, + { + "epoch": 1.952, + "grad_norm": 5.40625, + "learning_rate": 3.8814814814814814e-05, + "loss": 1.6155, + "step": 36600 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 5.15625, + "learning_rate": 3.8617283950617285e-05, + "loss": 1.6524, + "step": 36700 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 8.0625, + "learning_rate": 3.8419753086419755e-05, + "loss": 1.6594, + "step": 36800 + }, + { + "epoch": 1.968, + "grad_norm": 11.0, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.6397, + "step": 36900 + }, + { + "epoch": 
1.9733333333333334, + "grad_norm": 6.96875, + "learning_rate": 3.80246913580247e-05, + "loss": 1.6208, + "step": 37000 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 9.125, + "learning_rate": 3.782716049382716e-05, + "loss": 1.5995, + "step": 37100 + }, + { + "epoch": 1.984, + "grad_norm": 8.8125, + "learning_rate": 3.762962962962963e-05, + "loss": 1.59, + "step": 37200 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 8.1875, + "learning_rate": 3.74320987654321e-05, + "loss": 1.6343, + "step": 37300 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 7.65625, + "learning_rate": 3.7234567901234566e-05, + "loss": 1.6007, + "step": 37400 + }, + { + "epoch": 2.0, + "grad_norm": 6.125, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.6295, + "step": 37500 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 6.65625, + "learning_rate": 3.683950617283951e-05, + "loss": 1.2317, + "step": 37600 + }, + { + "epoch": 2.010666666666667, + "grad_norm": 6.9375, + "learning_rate": 3.664197530864198e-05, + "loss": 1.3769, + "step": 37700 + }, + { + "epoch": 2.016, + "grad_norm": 5.46875, + "learning_rate": 3.644444444444445e-05, + "loss": 1.3206, + "step": 37800 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 7.3125, + "learning_rate": 3.624691358024692e-05, + "loss": 1.2903, + "step": 37900 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.604938271604938e-05, + "loss": 1.3443, + "step": 38000 + }, + { + "epoch": 2.032, + "grad_norm": 6.375, + "learning_rate": 3.5851851851851854e-05, + "loss": 1.291, + "step": 38100 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 7.84375, + "learning_rate": 3.5654320987654325e-05, + "loss": 1.2552, + "step": 38200 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 8.9375, + "learning_rate": 3.545679012345679e-05, + "loss": 1.2883, + "step": 38300 + }, + { + "epoch": 2.048, + "grad_norm": 6.09375, + "learning_rate": 3.525925925925926e-05, + "loss": 1.2755, + "step": 38400 + }, 
+ { + "epoch": 2.0533333333333332, + "grad_norm": 6.0625, + "learning_rate": 3.506172839506173e-05, + "loss": 1.3612, + "step": 38500 + }, + { + "epoch": 2.058666666666667, + "grad_norm": 8.625, + "learning_rate": 3.48641975308642e-05, + "loss": 1.2394, + "step": 38600 + }, + { + "epoch": 2.064, + "grad_norm": 8.25, + "learning_rate": 3.466666666666667e-05, + "loss": 1.3005, + "step": 38700 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 7.125, + "learning_rate": 3.4469135802469135e-05, + "loss": 1.3219, + "step": 38800 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.4271604938271606e-05, + "loss": 1.3388, + "step": 38900 + }, + { + "epoch": 2.08, + "grad_norm": 7.4375, + "learning_rate": 3.4074074074074077e-05, + "loss": 1.3317, + "step": 39000 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 5.15625, + "learning_rate": 3.387654320987654e-05, + "loss": 1.2546, + "step": 39100 + }, + { + "epoch": 2.0906666666666665, + "grad_norm": 6.71875, + "learning_rate": 3.367901234567901e-05, + "loss": 1.3502, + "step": 39200 + }, + { + "epoch": 2.096, + "grad_norm": 7.28125, + "learning_rate": 3.348148148148148e-05, + "loss": 1.2733, + "step": 39300 + }, + { + "epoch": 2.1013333333333333, + "grad_norm": 8.125, + "learning_rate": 3.328395061728395e-05, + "loss": 1.2879, + "step": 39400 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 6.5625, + "learning_rate": 3.308641975308642e-05, + "loss": 1.2175, + "step": 39500 + }, + { + "epoch": 2.112, + "grad_norm": 7.375, + "learning_rate": 3.2888888888888894e-05, + "loss": 1.3628, + "step": 39600 + }, + { + "epoch": 2.1173333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.269135802469136e-05, + "loss": 1.2937, + "step": 39700 + }, + { + "epoch": 2.1226666666666665, + "grad_norm": 5.9375, + "learning_rate": 3.249382716049383e-05, + "loss": 1.2451, + "step": 39800 + }, + { + "epoch": 2.128, + "grad_norm": 9.6875, + "learning_rate": 3.22962962962963e-05, + "loss": 1.3379, + 
"step": 39900 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 7.5, + "learning_rate": 3.209876543209876e-05, + "loss": 1.2934, + "step": 40000 + }, + { + "epoch": 2.1386666666666665, + "grad_norm": 5.84375, + "learning_rate": 3.1901234567901234e-05, + "loss": 1.2618, + "step": 40100 + }, + { + "epoch": 2.144, + "grad_norm": 6.875, + "learning_rate": 3.1703703703703705e-05, + "loss": 1.384, + "step": 40200 + }, + { + "epoch": 2.1493333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.1506172839506175e-05, + "loss": 1.2611, + "step": 40300 + }, + { + "epoch": 2.1546666666666665, + "grad_norm": 8.0, + "learning_rate": 3.1308641975308646e-05, + "loss": 1.2923, + "step": 40400 + }, + { + "epoch": 2.16, + "grad_norm": 7.53125, + "learning_rate": 3.111111111111111e-05, + "loss": 1.2947, + "step": 40500 + }, + { + "epoch": 2.1653333333333333, + "grad_norm": 8.125, + "learning_rate": 3.091358024691358e-05, + "loss": 1.283, + "step": 40600 + }, + { + "epoch": 2.1706666666666665, + "grad_norm": 7.625, + "learning_rate": 3.071604938271605e-05, + "loss": 1.3939, + "step": 40700 + }, + { + "epoch": 2.176, + "grad_norm": 8.0625, + "learning_rate": 3.0518518518518515e-05, + "loss": 1.3395, + "step": 40800 + }, + { + "epoch": 2.1813333333333333, + "grad_norm": 10.3125, + "learning_rate": 3.0320987654320986e-05, + "loss": 1.2382, + "step": 40900 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 5.8125, + "learning_rate": 3.012345679012346e-05, + "loss": 1.346, + "step": 41000 + }, + { + "epoch": 2.192, + "grad_norm": 12.0, + "learning_rate": 2.992592592592593e-05, + "loss": 1.348, + "step": 41100 + }, + { + "epoch": 2.1973333333333334, + "grad_norm": 7.0, + "learning_rate": 2.9728395061728398e-05, + "loss": 1.2885, + "step": 41200 + }, + { + "epoch": 2.2026666666666666, + "grad_norm": 10.9375, + "learning_rate": 2.9530864197530865e-05, + "loss": 1.2577, + "step": 41300 + }, + { + "epoch": 2.208, + "grad_norm": 7.0625, + "learning_rate": 2.9333333333333336e-05, + 
"loss": 1.3698, + "step": 41400 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 5.6875, + "learning_rate": 2.9135802469135803e-05, + "loss": 1.2787, + "step": 41500 + }, + { + "epoch": 2.2186666666666666, + "grad_norm": 11.0625, + "learning_rate": 2.893827160493827e-05, + "loss": 1.299, + "step": 41600 + }, + { + "epoch": 2.224, + "grad_norm": 16.5, + "learning_rate": 2.874074074074074e-05, + "loss": 1.3493, + "step": 41700 + }, + { + "epoch": 2.2293333333333334, + "grad_norm": 7.71875, + "learning_rate": 2.854320987654321e-05, + "loss": 1.232, + "step": 41800 + }, + { + "epoch": 2.2346666666666666, + "grad_norm": 7.3125, + "learning_rate": 2.8345679012345683e-05, + "loss": 1.2965, + "step": 41900 + }, + { + "epoch": 2.24, + "grad_norm": 4.875, + "learning_rate": 2.814814814814815e-05, + "loss": 1.2932, + "step": 42000 + }, + { + "epoch": 2.2453333333333334, + "grad_norm": 8.5625, + "learning_rate": 2.795061728395062e-05, + "loss": 1.2689, + "step": 42100 + }, + { + "epoch": 2.2506666666666666, + "grad_norm": 8.5625, + "learning_rate": 2.7753086419753088e-05, + "loss": 1.3437, + "step": 42200 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 11.375, + "learning_rate": 2.7555555555555555e-05, + "loss": 1.3957, + "step": 42300 + }, + { + "epoch": 2.2613333333333334, + "grad_norm": 7.125, + "learning_rate": 2.7358024691358026e-05, + "loss": 1.2948, + "step": 42400 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 6.90625, + "learning_rate": 2.7160493827160493e-05, + "loss": 1.2896, + "step": 42500 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 8.3125, + "learning_rate": 2.696296296296296e-05, + "loss": 1.2483, + "step": 42600 + }, + { + "epoch": 2.2773333333333334, + "grad_norm": 6.40625, + "learning_rate": 2.6765432098765435e-05, + "loss": 1.3159, + "step": 42700 + }, + { + "epoch": 2.2826666666666666, + "grad_norm": 6.59375, + "learning_rate": 2.6567901234567905e-05, + "loss": 1.2742, + "step": 42800 + }, + { + "epoch": 2.288, + "grad_norm": 
7.21875, + "learning_rate": 2.6370370370370373e-05, + "loss": 1.3353, + "step": 42900 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 5.46875, + "learning_rate": 2.617283950617284e-05, + "loss": 1.3093, + "step": 43000 + }, + { + "epoch": 2.2986666666666666, + "grad_norm": 8.3125, + "learning_rate": 2.597530864197531e-05, + "loss": 1.2342, + "step": 43100 + }, + { + "epoch": 2.304, + "grad_norm": 7.09375, + "learning_rate": 2.5777777777777778e-05, + "loss": 1.3586, + "step": 43200 + }, + { + "epoch": 2.3093333333333335, + "grad_norm": 11.625, + "learning_rate": 2.558024691358025e-05, + "loss": 1.2999, + "step": 43300 + }, + { + "epoch": 2.3146666666666667, + "grad_norm": 7.75, + "learning_rate": 2.5382716049382716e-05, + "loss": 1.2873, + "step": 43400 + }, + { + "epoch": 2.32, + "grad_norm": 8.0, + "learning_rate": 2.5185185185185183e-05, + "loss": 1.3057, + "step": 43500 + }, + { + "epoch": 2.3253333333333335, + "grad_norm": 9.1875, + "learning_rate": 2.4987654320987654e-05, + "loss": 1.3544, + "step": 43600 + }, + { + "epoch": 2.3306666666666667, + "grad_norm": 7.71875, + "learning_rate": 2.4790123456790125e-05, + "loss": 1.333, + "step": 43700 + }, + { + "epoch": 2.336, + "grad_norm": 6.21875, + "learning_rate": 2.4592592592592595e-05, + "loss": 1.2135, + "step": 43800 + }, + { + "epoch": 2.3413333333333335, + "grad_norm": 6.59375, + "learning_rate": 2.4395061728395063e-05, + "loss": 1.3494, + "step": 43900 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 8.1875, + "learning_rate": 2.4197530864197533e-05, + "loss": 1.3179, + "step": 44000 + }, + { + "epoch": 2.352, + "grad_norm": 5.9375, + "learning_rate": 2.4e-05, + "loss": 1.401, + "step": 44100 + }, + { + "epoch": 2.3573333333333335, + "grad_norm": 8.125, + "learning_rate": 2.380246913580247e-05, + "loss": 1.2905, + "step": 44200 + }, + { + "epoch": 2.3626666666666667, + "grad_norm": 6.5625, + "learning_rate": 2.360493827160494e-05, + "loss": 1.3236, + "step": 44300 + }, + { + "epoch": 2.368, + 
"grad_norm": 7.71875, + "learning_rate": 2.340740740740741e-05, + "loss": 1.2924, + "step": 44400 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 10.1875, + "learning_rate": 2.3209876543209877e-05, + "loss": 1.3823, + "step": 44500 + }, + { + "epoch": 2.3786666666666667, + "grad_norm": 9.0625, + "learning_rate": 2.3012345679012347e-05, + "loss": 1.2555, + "step": 44600 + }, + { + "epoch": 2.384, + "grad_norm": 6.53125, + "learning_rate": 2.2814814814814818e-05, + "loss": 1.319, + "step": 44700 + }, + { + "epoch": 2.389333333333333, + "grad_norm": 6.28125, + "learning_rate": 2.2617283950617285e-05, + "loss": 1.3722, + "step": 44800 + }, + { + "epoch": 2.3946666666666667, + "grad_norm": 8.75, + "learning_rate": 2.2419753086419753e-05, + "loss": 1.2535, + "step": 44900 + }, + { + "epoch": 2.4, + "grad_norm": 5.75, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.3091, + "step": 45000 + }, + { + "epoch": 2.405333333333333, + "grad_norm": 7.09375, + "learning_rate": 2.2024691358024694e-05, + "loss": 1.3417, + "step": 45100 + }, + { + "epoch": 2.4106666666666667, + "grad_norm": 6.21875, + "learning_rate": 2.182716049382716e-05, + "loss": 1.318, + "step": 45200 + }, + { + "epoch": 2.416, + "grad_norm": 6.09375, + "learning_rate": 2.162962962962963e-05, + "loss": 1.2971, + "step": 45300 + }, + { + "epoch": 2.421333333333333, + "grad_norm": 6.875, + "learning_rate": 2.14320987654321e-05, + "loss": 1.3866, + "step": 45400 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 5.9375, + "learning_rate": 2.123456790123457e-05, + "loss": 1.2945, + "step": 45500 + }, + { + "epoch": 2.432, + "grad_norm": 7.4375, + "learning_rate": 2.1037037037037037e-05, + "loss": 1.2541, + "step": 45600 + }, + { + "epoch": 2.437333333333333, + "grad_norm": 5.71875, + "learning_rate": 2.0839506172839508e-05, + "loss": 1.3282, + "step": 45700 + }, + { + "epoch": 2.4426666666666668, + "grad_norm": 12.1875, + "learning_rate": 2.0641975308641975e-05, + "loss": 1.3743, + "step": 45800 + 
}, + { + "epoch": 2.448, + "grad_norm": 5.0, + "learning_rate": 2.0444444444444446e-05, + "loss": 1.2689, + "step": 45900 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 8.5625, + "learning_rate": 2.0246913580246917e-05, + "loss": 1.347, + "step": 46000 + }, + { + "epoch": 2.458666666666667, + "grad_norm": 7.25, + "learning_rate": 2.0049382716049384e-05, + "loss": 1.3629, + "step": 46100 + }, + { + "epoch": 2.464, + "grad_norm": 12.5, + "learning_rate": 1.985185185185185e-05, + "loss": 1.2604, + "step": 46200 + }, + { + "epoch": 2.469333333333333, + "grad_norm": 7.03125, + "learning_rate": 1.9654320987654322e-05, + "loss": 1.3428, + "step": 46300 + }, + { + "epoch": 2.474666666666667, + "grad_norm": 7.8125, + "learning_rate": 1.9456790123456793e-05, + "loss": 1.2956, + "step": 46400 + }, + { + "epoch": 2.48, + "grad_norm": 7.21875, + "learning_rate": 1.925925925925926e-05, + "loss": 1.2986, + "step": 46500 + }, + { + "epoch": 2.485333333333333, + "grad_norm": 8.3125, + "learning_rate": 1.9061728395061727e-05, + "loss": 1.2794, + "step": 46600 + }, + { + "epoch": 2.490666666666667, + "grad_norm": 8.125, + "learning_rate": 1.8864197530864198e-05, + "loss": 1.3091, + "step": 46700 + }, + { + "epoch": 2.496, + "grad_norm": 6.59375, + "learning_rate": 1.866666666666667e-05, + "loss": 1.2405, + "step": 46800 + }, + { + "epoch": 2.501333333333333, + "grad_norm": 9.25, + "learning_rate": 1.8469135802469136e-05, + "loss": 1.2841, + "step": 46900 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 5.78125, + "learning_rate": 1.8271604938271607e-05, + "loss": 1.3305, + "step": 47000 + }, + { + "epoch": 2.512, + "grad_norm": 8.375, + "learning_rate": 1.8074074074074074e-05, + "loss": 1.3659, + "step": 47100 + }, + { + "epoch": 2.517333333333333, + "grad_norm": 7.0625, + "learning_rate": 1.7876543209876545e-05, + "loss": 1.2434, + "step": 47200 + }, + { + "epoch": 2.522666666666667, + "grad_norm": 9.8125, + "learning_rate": 1.7679012345679012e-05, + "loss": 1.2765, + 
"step": 47300 + }, + { + "epoch": 2.528, + "grad_norm": 7.71875, + "learning_rate": 1.7481481481481483e-05, + "loss": 1.3136, + "step": 47400 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 7.71875, + "learning_rate": 1.728395061728395e-05, + "loss": 1.3622, + "step": 47500 + }, + { + "epoch": 2.538666666666667, + "grad_norm": 11.0, + "learning_rate": 1.708641975308642e-05, + "loss": 1.2877, + "step": 47600 + }, + { + "epoch": 2.544, + "grad_norm": 6.25, + "learning_rate": 1.688888888888889e-05, + "loss": 1.3239, + "step": 47700 + }, + { + "epoch": 2.5493333333333332, + "grad_norm": 6.375, + "learning_rate": 1.669135802469136e-05, + "loss": 1.3512, + "step": 47800 + }, + { + "epoch": 2.554666666666667, + "grad_norm": 6.875, + "learning_rate": 1.6493827160493826e-05, + "loss": 1.3079, + "step": 47900 + }, + { + "epoch": 2.56, + "grad_norm": 7.90625, + "learning_rate": 1.62962962962963e-05, + "loss": 1.3031, + "step": 48000 + }, + { + "epoch": 2.5653333333333332, + "grad_norm": 8.0, + "learning_rate": 1.6098765432098767e-05, + "loss": 1.3062, + "step": 48100 + }, + { + "epoch": 2.570666666666667, + "grad_norm": 7.625, + "learning_rate": 1.5901234567901235e-05, + "loss": 1.3348, + "step": 48200 + }, + { + "epoch": 2.576, + "grad_norm": 9.6875, + "learning_rate": 1.5703703703703705e-05, + "loss": 1.3392, + "step": 48300 + }, + { + "epoch": 2.5813333333333333, + "grad_norm": 8.75, + "learning_rate": 1.5506172839506173e-05, + "loss": 1.3153, + "step": 48400 + }, + { + "epoch": 2.586666666666667, + "grad_norm": 6.75, + "learning_rate": 1.5308641975308643e-05, + "loss": 1.3348, + "step": 48500 + }, + { + "epoch": 2.592, + "grad_norm": 9.0, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.3008, + "step": 48600 + }, + { + "epoch": 2.5973333333333333, + "grad_norm": 8.1875, + "learning_rate": 1.4913580246913581e-05, + "loss": 1.3492, + "step": 48700 + }, + { + "epoch": 2.602666666666667, + "grad_norm": 6.5, + "learning_rate": 1.4716049382716049e-05, + "loss": 
1.2897, + "step": 48800 + }, + { + "epoch": 2.608, + "grad_norm": 6.03125, + "learning_rate": 1.4518518518518521e-05, + "loss": 1.2443, + "step": 48900 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 8.125, + "learning_rate": 1.4320987654320988e-05, + "loss": 1.3395, + "step": 49000 + }, + { + "epoch": 2.618666666666667, + "grad_norm": 7.90625, + "learning_rate": 1.4123456790123457e-05, + "loss": 1.3716, + "step": 49100 + }, + { + "epoch": 2.624, + "grad_norm": 6.28125, + "learning_rate": 1.3925925925925926e-05, + "loss": 1.3066, + "step": 49200 + }, + { + "epoch": 2.6293333333333333, + "grad_norm": 5.34375, + "learning_rate": 1.3728395061728397e-05, + "loss": 1.2932, + "step": 49300 + }, + { + "epoch": 2.634666666666667, + "grad_norm": 7.21875, + "learning_rate": 1.3530864197530866e-05, + "loss": 1.2657, + "step": 49400 + }, + { + "epoch": 2.64, + "grad_norm": 12.6875, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.2547, + "step": 49500 + }, + { + "epoch": 2.6453333333333333, + "grad_norm": 7.53125, + "learning_rate": 1.3135802469135802e-05, + "loss": 1.2846, + "step": 49600 + }, + { + "epoch": 2.6506666666666665, + "grad_norm": 10.3125, + "learning_rate": 1.2938271604938273e-05, + "loss": 1.3046, + "step": 49700 + }, + { + "epoch": 2.656, + "grad_norm": 8.5625, + "learning_rate": 1.2740740740740742e-05, + "loss": 1.3353, + "step": 49800 + }, + { + "epoch": 2.6613333333333333, + "grad_norm": 10.6875, + "learning_rate": 1.2543209876543211e-05, + "loss": 1.3146, + "step": 49900 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 10.125, + "learning_rate": 1.2345679012345678e-05, + "loss": 1.279, + "step": 50000 + }, + { + "epoch": 2.672, + "grad_norm": 11.125, + "learning_rate": 1.2148148148148149e-05, + "loss": 1.2854, + "step": 50100 + }, + { + "epoch": 2.6773333333333333, + "grad_norm": 6.375, + "learning_rate": 1.1950617283950618e-05, + "loss": 1.3665, + "step": 50200 + }, + { + "epoch": 2.6826666666666665, + "grad_norm": 7.90625, + 
"learning_rate": 1.1753086419753087e-05, + "loss": 1.2908, + "step": 50300 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 6.03125, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.33, + "step": 50400 + }, + { + "epoch": 2.6933333333333334, + "grad_norm": 9.1875, + "learning_rate": 1.1358024691358025e-05, + "loss": 1.3176, + "step": 50500 + }, + { + "epoch": 2.6986666666666665, + "grad_norm": 8.4375, + "learning_rate": 1.1160493827160494e-05, + "loss": 1.3215, + "step": 50600 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 6.90625, + "learning_rate": 1.0962962962962963e-05, + "loss": 1.3513, + "step": 50700 + }, + { + "epoch": 2.7093333333333334, + "grad_norm": 9.4375, + "learning_rate": 1.0765432098765432e-05, + "loss": 1.2539, + "step": 50800 + }, + { + "epoch": 2.7146666666666666, + "grad_norm": 7.125, + "learning_rate": 1.0567901234567903e-05, + "loss": 1.3037, + "step": 50900 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 7.75, + "learning_rate": 1.037037037037037e-05, + "loss": 1.3418, + "step": 51000 + }, + { + "epoch": 2.7253333333333334, + "grad_norm": 5.90625, + "learning_rate": 1.017283950617284e-05, + "loss": 1.3898, + "step": 51100 + }, + { + "epoch": 2.7306666666666666, + "grad_norm": 5.21875, + "learning_rate": 9.97530864197531e-06, + "loss": 1.2758, + "step": 51200 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 6.75, + "learning_rate": 9.777777777777779e-06, + "loss": 1.3215, + "step": 51300 + }, + { + "epoch": 2.7413333333333334, + "grad_norm": 10.9375, + "learning_rate": 9.580246913580248e-06, + "loss": 1.374, + "step": 51400 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 6.625, + "learning_rate": 9.382716049382717e-06, + "loss": 1.3613, + "step": 51500 + }, + { + "epoch": 2.752, + "grad_norm": 7.0625, + "learning_rate": 9.185185185185186e-06, + "loss": 1.3206, + "step": 51600 + }, + { + "epoch": 2.7573333333333334, + "grad_norm": 7.0, + "learning_rate": 8.987654320987655e-06, + "loss": 1.3731, + "step": 
51700 + }, + { + "epoch": 2.7626666666666666, + "grad_norm": 6.40625, + "learning_rate": 8.790123456790124e-06, + "loss": 1.2751, + "step": 51800 + }, + { + "epoch": 2.768, + "grad_norm": 6.125, + "learning_rate": 8.592592592592593e-06, + "loss": 1.3447, + "step": 51900 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 6.40625, + "learning_rate": 8.395061728395062e-06, + "loss": 1.2362, + "step": 52000 + }, + { + "epoch": 2.7786666666666666, + "grad_norm": 7.125, + "learning_rate": 8.19753086419753e-06, + "loss": 1.3439, + "step": 52100 + }, + { + "epoch": 2.784, + "grad_norm": 5.53125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3392, + "step": 52200 + }, + { + "epoch": 2.7893333333333334, + "grad_norm": 7.4375, + "learning_rate": 7.802469135802469e-06, + "loss": 1.3598, + "step": 52300 + }, + { + "epoch": 2.7946666666666666, + "grad_norm": 7.53125, + "learning_rate": 7.6049382716049385e-06, + "loss": 1.2485, + "step": 52400 + }, + { + "epoch": 2.8, + "grad_norm": 6.4375, + "learning_rate": 7.4074074074074075e-06, + "loss": 1.2279, + "step": 52500 + }, + { + "epoch": 2.8053333333333335, + "grad_norm": 12.3125, + "learning_rate": 7.209876543209877e-06, + "loss": 1.272, + "step": 52600 + }, + { + "epoch": 2.8106666666666666, + "grad_norm": 7.5, + "learning_rate": 7.0123456790123455e-06, + "loss": 1.2805, + "step": 52700 + }, + { + "epoch": 2.816, + "grad_norm": 6.125, + "learning_rate": 6.814814814814815e-06, + "loss": 1.3183, + "step": 52800 + }, + { + "epoch": 2.8213333333333335, + "grad_norm": 5.78125, + "learning_rate": 6.617283950617284e-06, + "loss": 1.2925, + "step": 52900 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 6.0625, + "learning_rate": 6.419753086419754e-06, + "loss": 1.2946, + "step": 53000 + }, + { + "epoch": 2.832, + "grad_norm": 10.5, + "learning_rate": 6.222222222222222e-06, + "loss": 1.329, + "step": 53100 + }, + { + "epoch": 2.8373333333333335, + "grad_norm": 9.75, + "learning_rate": 6.024691358024691e-06, + "loss": 
1.3318, + "step": 53200 + }, + { + "epoch": 2.8426666666666667, + "grad_norm": 7.0625, + "learning_rate": 5.82716049382716e-06, + "loss": 1.3648, + "step": 53300 + }, + { + "epoch": 2.848, + "grad_norm": 9.25, + "learning_rate": 5.62962962962963e-06, + "loss": 1.2775, + "step": 53400 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 9.1875, + "learning_rate": 5.432098765432099e-06, + "loss": 1.2806, + "step": 53500 + }, + { + "epoch": 2.8586666666666667, + "grad_norm": 6.4375, + "learning_rate": 5.234567901234568e-06, + "loss": 1.3544, + "step": 53600 + }, + { + "epoch": 2.864, + "grad_norm": 9.8125, + "learning_rate": 5.037037037037037e-06, + "loss": 1.2832, + "step": 53700 + }, + { + "epoch": 2.8693333333333335, + "grad_norm": 5.9375, + "learning_rate": 4.839506172839506e-06, + "loss": 1.3708, + "step": 53800 + }, + { + "epoch": 2.8746666666666667, + "grad_norm": 7.25, + "learning_rate": 4.641975308641976e-06, + "loss": 1.287, + "step": 53900 + }, + { + "epoch": 2.88, + "grad_norm": 10.375, + "learning_rate": 4.444444444444445e-06, + "loss": 1.2741, + "step": 54000 + }, + { + "epoch": 2.8853333333333335, + "grad_norm": 7.78125, + "learning_rate": 4.246913580246914e-06, + "loss": 1.3358, + "step": 54100 + }, + { + "epoch": 2.8906666666666667, + "grad_norm": 8.5, + "learning_rate": 4.049382716049383e-06, + "loss": 1.2651, + "step": 54200 + }, + { + "epoch": 2.896, + "grad_norm": 11.5, + "learning_rate": 3.851851851851852e-06, + "loss": 1.3113, + "step": 54300 + }, + { + "epoch": 2.9013333333333335, + "grad_norm": 6.4375, + "learning_rate": 3.6543209876543214e-06, + "loss": 1.2998, + "step": 54400 + }, + { + "epoch": 2.9066666666666667, + "grad_norm": 7.375, + "learning_rate": 3.45679012345679e-06, + "loss": 1.325, + "step": 54500 + }, + { + "epoch": 2.912, + "grad_norm": 7.75, + "learning_rate": 3.259259259259259e-06, + "loss": 1.2704, + "step": 54600 + }, + { + "epoch": 2.9173333333333336, + "grad_norm": 6.15625, + "learning_rate": 3.061728395061729e-06, + 
"loss": 1.3235, + "step": 54700 + }, + { + "epoch": 2.9226666666666667, + "grad_norm": 8.5625, + "learning_rate": 2.864197530864198e-06, + "loss": 1.3711, + "step": 54800 + }, + { + "epoch": 2.928, + "grad_norm": 13.6875, + "learning_rate": 2.666666666666667e-06, + "loss": 1.335, + "step": 54900 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 8.9375, + "learning_rate": 2.469135802469136e-06, + "loss": 1.3163, + "step": 55000 + }, + { + "epoch": 2.9386666666666668, + "grad_norm": 11.75, + "learning_rate": 2.271604938271605e-06, + "loss": 1.2763, + "step": 55100 + }, + { + "epoch": 2.944, + "grad_norm": 6.9375, + "learning_rate": 2.0740740740740742e-06, + "loss": 1.3573, + "step": 55200 + }, + { + "epoch": 2.9493333333333336, + "grad_norm": 9.375, + "learning_rate": 1.8765432098765432e-06, + "loss": 1.3565, + "step": 55300 + }, + { + "epoch": 2.9546666666666668, + "grad_norm": 6.28125, + "learning_rate": 1.6790123456790125e-06, + "loss": 1.3489, + "step": 55400 + }, + { + "epoch": 2.96, + "grad_norm": 6.6875, + "learning_rate": 1.4814814814814817e-06, + "loss": 1.2812, + "step": 55500 + }, + { + "epoch": 2.9653333333333336, + "grad_norm": 13.9375, + "learning_rate": 1.2839506172839507e-06, + "loss": 1.3521, + "step": 55600 + }, + { + "epoch": 2.970666666666667, + "grad_norm": 7.1875, + "learning_rate": 1.0864197530864197e-06, + "loss": 1.3804, + "step": 55700 + }, + { + "epoch": 2.976, + "grad_norm": 7.09375, + "learning_rate": 8.88888888888889e-07, + "loss": 1.2888, + "step": 55800 + }, + { + "epoch": 2.981333333333333, + "grad_norm": 8.125, + "learning_rate": 6.913580246913581e-07, + "loss": 1.2953, + "step": 55900 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 5.6875, + "learning_rate": 4.938271604938272e-07, + "loss": 1.2729, + "step": 56000 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + 
"should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.678596351033344e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-56000/training_args.bin b/checkpoint-56000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-56000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 diff --git a/checkpoint-56250/config.json b/checkpoint-56250/config.json new file mode 100644 index 0000000000000000000000000000000000000000..28aa1559a0893c7e50c6a67370092417fa5cc81f --- /dev/null +++ b/checkpoint-56250/config.json @@ -0,0 +1,36 @@ +{ + "_name_or_path": "meta-llama/Llama-3.2-1B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 2048, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/checkpoint-56250/generation_config.json b/checkpoint-56250/generation_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/checkpoint-56250/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/checkpoint-56250/model.safetensors b/checkpoint-56250/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..202cbada8919e88292f18aeedab7f312e2d59b04 --- /dev/null +++ b/checkpoint-56250/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b0779d345b0852334d6f314eb73355223b87d6965d3d4a740f955fe6d89a7ce +size 2471645608 diff --git a/checkpoint-56250/optimizer.pt b/checkpoint-56250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b582a8d4523bc5b21ab9c4a93c708a877e561b60 --- /dev/null +++ b/checkpoint-56250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964b70a140b02d44893260a6c632b9af0db9b6d6d8861c75589e5b4277e46473 +size 4943382114 diff --git a/checkpoint-56250/rng_state.pth b/checkpoint-56250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..95cad945c935ebaeeca5e461007867b6e155022e --- /dev/null +++ b/checkpoint-56250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3adbebcf39742024134d36312cc62baede0eb396a36041797f643dcac19c1b2 +size 14244 diff --git a/checkpoint-56250/scheduler.pt b/checkpoint-56250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f120f7681c793565249bb119bdfbe9e138989703 --- /dev/null +++ b/checkpoint-56250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6907d88d6b8d106afc71bc7ac229d2cb80057390ab0402a3d83cb16a6200ccc1 +size 1064 diff --git a/checkpoint-56250/special_tokens_map.json 
b/checkpoint-56250/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..04829afa78a2d2df203ac846968db37269b01f7f --- /dev/null +++ b/checkpoint-56250/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|end_of_text|>" +} diff --git a/checkpoint-56250/tokenizer.json b/checkpoint-56250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-56250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-56250/tokenizer_config.json b/checkpoint-56250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3026161bf4fc56964bad68d35d36b5b815a3f716 --- /dev/null +++ b/checkpoint-56250/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-56250/trainer_state.json b/checkpoint-56250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b9347e4a9ac0b0220ef2a5f335c8dc1287401ba8 --- /dev/null +++ b/checkpoint-56250/trainer_state.json @@ -0,0 +1,3967 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 56250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 12.125, + "learning_rate": 1.777777777777778e-06, + "loss": 2.4276, + "step": 100 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 23.25, + "learning_rate": 3.555555555555556e-06, + "loss": 2.3687, + "step": 200 + }, + { + "epoch": 0.016, + "grad_norm": 26.625, + "learning_rate": 5.333333333333334e-06, + "loss": 2.181, + "step": 300 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 8.1875, + "learning_rate": 7.111111111111112e-06, + "loss": 2.1432, + "step": 400 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 18.75, + "learning_rate": 8.88888888888889e-06, + "loss": 2.1741, + "step": 500 + }, + { + "epoch": 0.032, + "grad_norm": 11.6875, + 
"learning_rate": 1.0666666666666667e-05, + "loss": 2.1112, + "step": 600 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 9.75, + "learning_rate": 1.2444444444444445e-05, + "loss": 2.0589, + "step": 700 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 16.25, + "learning_rate": 1.4222222222222224e-05, + "loss": 2.0632, + "step": 800 + }, + { + "epoch": 0.048, + "grad_norm": 13.3125, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.0585, + "step": 900 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 6.40625, + "learning_rate": 1.777777777777778e-05, + "loss": 2.0095, + "step": 1000 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 7.625, + "learning_rate": 1.9555555555555557e-05, + "loss": 2.0589, + "step": 1100 + }, + { + "epoch": 0.064, + "grad_norm": 9.8125, + "learning_rate": 2.1333333333333335e-05, + "loss": 2.108, + "step": 1200 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 7.0, + "learning_rate": 2.3111111111111112e-05, + "loss": 2.101, + "step": 1300 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 11.9375, + "learning_rate": 2.488888888888889e-05, + "loss": 2.0621, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 6.5, + "learning_rate": 2.6666666666666667e-05, + "loss": 2.0563, + "step": 1500 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 6.625, + "learning_rate": 2.8444444444444447e-05, + "loss": 2.0864, + "step": 1600 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 8.3125, + "learning_rate": 3.0222222222222225e-05, + "loss": 2.1109, + "step": 1700 + }, + { + "epoch": 0.096, + "grad_norm": 8.6875, + "learning_rate": 3.2000000000000005e-05, + "loss": 2.1095, + "step": 1800 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 9.125, + "learning_rate": 3.377777777777778e-05, + "loss": 2.0741, + "step": 1900 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 7.3125, + "learning_rate": 3.555555555555556e-05, + "loss": 2.1343, + "step": 2000 + }, + { + "epoch": 0.112, + 
"grad_norm": 11.375, + "learning_rate": 3.733333333333334e-05, + "loss": 2.1033, + "step": 2100 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 8.125, + "learning_rate": 3.9111111111111115e-05, + "loss": 2.1148, + "step": 2200 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 8.375, + "learning_rate": 4.088888888888889e-05, + "loss": 2.1165, + "step": 2300 + }, + { + "epoch": 0.128, + "grad_norm": 11.5625, + "learning_rate": 4.266666666666667e-05, + "loss": 2.0901, + "step": 2400 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 8.625, + "learning_rate": 4.4444444444444447e-05, + "loss": 2.148, + "step": 2500 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 7.3125, + "learning_rate": 4.6222222222222224e-05, + "loss": 2.0988, + "step": 2600 + }, + { + "epoch": 0.144, + "grad_norm": 7.84375, + "learning_rate": 4.8e-05, + "loss": 2.1642, + "step": 2700 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 11.625, + "learning_rate": 4.977777777777778e-05, + "loss": 2.1439, + "step": 2800 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 6.9375, + "learning_rate": 5.1555555555555556e-05, + "loss": 2.1544, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 6.75, + "learning_rate": 5.333333333333333e-05, + "loss": 2.1989, + "step": 3000 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 14.5, + "learning_rate": 5.511111111111111e-05, + "loss": 2.1579, + "step": 3100 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 6.59375, + "learning_rate": 5.6888888888888895e-05, + "loss": 2.1803, + "step": 3200 + }, + { + "epoch": 0.176, + "grad_norm": 12.0625, + "learning_rate": 5.866666666666667e-05, + "loss": 2.1436, + "step": 3300 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 6.25, + "learning_rate": 6.044444444444445e-05, + "loss": 2.1903, + "step": 3400 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 7.09375, + "learning_rate": 6.222222222222222e-05, + "loss": 2.2214, + "step": 3500 + }, + { + "epoch": 0.192, + 
"grad_norm": 9.5, + "learning_rate": 6.400000000000001e-05, + "loss": 2.1987, + "step": 3600 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 12.1875, + "learning_rate": 6.577777777777779e-05, + "loss": 2.2713, + "step": 3700 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 8.125, + "learning_rate": 6.755555555555557e-05, + "loss": 2.2564, + "step": 3800 + }, + { + "epoch": 0.208, + "grad_norm": 11.1875, + "learning_rate": 6.933333333333334e-05, + "loss": 2.2224, + "step": 3900 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.375, + "learning_rate": 7.111111111111112e-05, + "loss": 2.2204, + "step": 4000 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 7.625, + "learning_rate": 7.28888888888889e-05, + "loss": 2.2343, + "step": 4100 + }, + { + "epoch": 0.224, + "grad_norm": 9.3125, + "learning_rate": 7.466666666666667e-05, + "loss": 2.2662, + "step": 4200 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 10.0, + "learning_rate": 7.644444444444445e-05, + "loss": 2.2438, + "step": 4300 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.822222222222223e-05, + "loss": 2.2702, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 7.5625, + "learning_rate": 8e-05, + "loss": 2.2973, + "step": 4500 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 8.6875, + "learning_rate": 8.177777777777778e-05, + "loss": 2.2938, + "step": 4600 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 8.4375, + "learning_rate": 8.355555555555556e-05, + "loss": 2.3142, + "step": 4700 + }, + { + "epoch": 0.256, + "grad_norm": 7.0, + "learning_rate": 8.533333333333334e-05, + "loss": 2.3118, + "step": 4800 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 5.90625, + "learning_rate": 8.711111111111112e-05, + "loss": 2.3032, + "step": 4900 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 11.25, + "learning_rate": 8.888888888888889e-05, + "loss": 2.3436, + "step": 5000 + }, + { + "epoch": 0.272, + 
"grad_norm": 6.59375, + "learning_rate": 9.066666666666667e-05, + "loss": 2.3547, + "step": 5100 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 5.0625, + "learning_rate": 9.244444444444445e-05, + "loss": 2.3153, + "step": 5200 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 7.53125, + "learning_rate": 9.422222222222223e-05, + "loss": 2.2985, + "step": 5300 + }, + { + "epoch": 0.288, + "grad_norm": 5.625, + "learning_rate": 9.6e-05, + "loss": 2.4309, + "step": 5400 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 5.625, + "learning_rate": 9.777777777777778e-05, + "loss": 2.3583, + "step": 5500 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 7.34375, + "learning_rate": 9.955555555555556e-05, + "loss": 2.3224, + "step": 5600 + }, + { + "epoch": 0.304, + "grad_norm": 4.875, + "learning_rate": 9.985185185185185e-05, + "loss": 2.4143, + "step": 5700 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 8.6875, + "learning_rate": 9.965432098765432e-05, + "loss": 2.2883, + "step": 5800 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 6.5, + "learning_rate": 9.94567901234568e-05, + "loss": 2.3951, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 9.925925925925926e-05, + "loss": 2.3833, + "step": 6000 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 6.78125, + "learning_rate": 9.906172839506173e-05, + "loss": 2.3717, + "step": 6100 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 8.75, + "learning_rate": 9.88641975308642e-05, + "loss": 2.3364, + "step": 6200 + }, + { + "epoch": 0.336, + "grad_norm": 10.0, + "learning_rate": 9.866666666666668e-05, + "loss": 2.3874, + "step": 6300 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 4.96875, + "learning_rate": 9.846913580246913e-05, + "loss": 2.3805, + "step": 6400 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 5.96875, + "learning_rate": 9.827160493827162e-05, + "loss": 2.418, + "step": 6500 + }, + { + "epoch": 0.352, + 
"grad_norm": 7.90625, + "learning_rate": 9.807407407407407e-05, + "loss": 2.3874, + "step": 6600 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 9.4375, + "learning_rate": 9.787654320987654e-05, + "loss": 2.3446, + "step": 6700 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 6.4375, + "learning_rate": 9.767901234567902e-05, + "loss": 2.3489, + "step": 6800 + }, + { + "epoch": 0.368, + "grad_norm": 9.3125, + "learning_rate": 9.748148148148149e-05, + "loss": 2.3538, + "step": 6900 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 7.0, + "learning_rate": 9.728395061728396e-05, + "loss": 2.3662, + "step": 7000 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 9.8125, + "learning_rate": 9.708641975308643e-05, + "loss": 2.3701, + "step": 7100 + }, + { + "epoch": 0.384, + "grad_norm": 6.46875, + "learning_rate": 9.68888888888889e-05, + "loss": 2.3644, + "step": 7200 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 5.9375, + "learning_rate": 9.669135802469136e-05, + "loss": 2.3989, + "step": 7300 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 6.0, + "learning_rate": 9.649382716049384e-05, + "loss": 2.353, + "step": 7400 + }, + { + "epoch": 0.4, + "grad_norm": 5.625, + "learning_rate": 9.62962962962963e-05, + "loss": 2.3273, + "step": 7500 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 7.21875, + "learning_rate": 9.609876543209877e-05, + "loss": 2.378, + "step": 7600 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 10.3125, + "learning_rate": 9.590123456790124e-05, + "loss": 2.3484, + "step": 7700 + }, + { + "epoch": 0.416, + "grad_norm": 7.90625, + "learning_rate": 9.570370370370371e-05, + "loss": 2.3315, + "step": 7800 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 5.65625, + "learning_rate": 9.550617283950618e-05, + "loss": 2.3279, + "step": 7900 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 7.28125, + "learning_rate": 9.530864197530865e-05, + "loss": 2.3943, + "step": 8000 + }, + { + "epoch": 
0.432, + "grad_norm": 8.75, + "learning_rate": 9.511111111111112e-05, + "loss": 2.3285, + "step": 8100 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 8.0625, + "learning_rate": 9.491358024691358e-05, + "loss": 2.3089, + "step": 8200 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 8.9375, + "learning_rate": 9.471604938271605e-05, + "loss": 2.2575, + "step": 8300 + }, + { + "epoch": 0.448, + "grad_norm": 10.4375, + "learning_rate": 9.451851851851853e-05, + "loss": 2.2872, + "step": 8400 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 5.875, + "learning_rate": 9.432098765432099e-05, + "loss": 2.3486, + "step": 8500 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 10.5625, + "learning_rate": 9.412345679012346e-05, + "loss": 2.3712, + "step": 8600 + }, + { + "epoch": 0.464, + "grad_norm": 4.53125, + "learning_rate": 9.392592592592593e-05, + "loss": 2.3074, + "step": 8700 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 14.1875, + "learning_rate": 9.37283950617284e-05, + "loss": 2.2984, + "step": 8800 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 6.875, + "learning_rate": 9.353086419753086e-05, + "loss": 2.2932, + "step": 8900 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 9.333333333333334e-05, + "loss": 2.2894, + "step": 9000 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 4.5625, + "learning_rate": 9.31358024691358e-05, + "loss": 2.261, + "step": 9100 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 5.71875, + "learning_rate": 9.293827160493827e-05, + "loss": 2.2841, + "step": 9200 + }, + { + "epoch": 0.496, + "grad_norm": 7.21875, + "learning_rate": 9.274074074074076e-05, + "loss": 2.3142, + "step": 9300 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 9.5, + "learning_rate": 9.254320987654321e-05, + "loss": 2.2716, + "step": 9400 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 11.75, + "learning_rate": 9.234567901234568e-05, + "loss": 2.3298, + "step": 9500 + }, + { + 
"epoch": 0.512, + "grad_norm": 4.71875, + "learning_rate": 9.214814814814815e-05, + "loss": 2.3203, + "step": 9600 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 7.34375, + "learning_rate": 9.195061728395062e-05, + "loss": 2.2616, + "step": 9700 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 8.3125, + "learning_rate": 9.175308641975308e-05, + "loss": 2.3006, + "step": 9800 + }, + { + "epoch": 0.528, + "grad_norm": 8.5625, + "learning_rate": 9.155555555555557e-05, + "loss": 2.2778, + "step": 9900 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 7.625, + "learning_rate": 9.135802469135802e-05, + "loss": 2.2826, + "step": 10000 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 6.25, + "learning_rate": 9.11604938271605e-05, + "loss": 2.3184, + "step": 10100 + }, + { + "epoch": 0.544, + "grad_norm": 5.96875, + "learning_rate": 9.096296296296298e-05, + "loss": 2.266, + "step": 10200 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 7.78125, + "learning_rate": 9.076543209876544e-05, + "loss": 2.2399, + "step": 10300 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 7.3125, + "learning_rate": 9.05679012345679e-05, + "loss": 2.2603, + "step": 10400 + }, + { + "epoch": 0.56, + "grad_norm": 6.46875, + "learning_rate": 9.037037037037038e-05, + "loss": 2.3063, + "step": 10500 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 7.375, + "learning_rate": 9.017283950617285e-05, + "loss": 2.2636, + "step": 10600 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 9.375, + "learning_rate": 8.99753086419753e-05, + "loss": 2.2504, + "step": 10700 + }, + { + "epoch": 0.576, + "grad_norm": 6.21875, + "learning_rate": 8.977777777777779e-05, + "loss": 2.2907, + "step": 10800 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 5.6875, + "learning_rate": 8.958024691358025e-05, + "loss": 2.2517, + "step": 10900 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 4.875, + "learning_rate": 8.938271604938272e-05, + "loss": 2.2441, + "step": 11000 
+ }, + { + "epoch": 0.592, + "grad_norm": 7.0625, + "learning_rate": 8.918518518518519e-05, + "loss": 2.2398, + "step": 11100 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 7.34375, + "learning_rate": 8.898765432098766e-05, + "loss": 2.233, + "step": 11200 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 8.1875, + "learning_rate": 8.879012345679013e-05, + "loss": 2.2189, + "step": 11300 + }, + { + "epoch": 0.608, + "grad_norm": 3.765625, + "learning_rate": 8.85925925925926e-05, + "loss": 2.2437, + "step": 11400 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 7.5, + "learning_rate": 8.839506172839507e-05, + "loss": 2.2625, + "step": 11500 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 6.03125, + "learning_rate": 8.819753086419753e-05, + "loss": 2.2111, + "step": 11600 + }, + { + "epoch": 0.624, + "grad_norm": 6.84375, + "learning_rate": 8.800000000000001e-05, + "loss": 2.1595, + "step": 11700 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 5.53125, + "learning_rate": 8.780246913580248e-05, + "loss": 2.195, + "step": 11800 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 6.8125, + "learning_rate": 8.760493827160494e-05, + "loss": 2.2475, + "step": 11900 + }, + { + "epoch": 0.64, + "grad_norm": 5.8125, + "learning_rate": 8.740740740740741e-05, + "loss": 2.2127, + "step": 12000 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 6.53125, + "learning_rate": 8.720987654320988e-05, + "loss": 2.252, + "step": 12100 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 12.8125, + "learning_rate": 8.701234567901235e-05, + "loss": 2.2172, + "step": 12200 + }, + { + "epoch": 0.656, + "grad_norm": 7.40625, + "learning_rate": 8.681481481481482e-05, + "loss": 2.2443, + "step": 12300 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 4.65625, + "learning_rate": 8.661728395061729e-05, + "loss": 2.2779, + "step": 12400 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.34375, + "learning_rate": 8.641975308641975e-05, + "loss": 
2.2281, + "step": 12500 + }, + { + "epoch": 0.672, + "grad_norm": 5.40625, + "learning_rate": 8.622222222222222e-05, + "loss": 2.2017, + "step": 12600 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 7.53125, + "learning_rate": 8.60246913580247e-05, + "loss": 2.2047, + "step": 12700 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 6.0625, + "learning_rate": 8.582716049382716e-05, + "loss": 2.1622, + "step": 12800 + }, + { + "epoch": 0.688, + "grad_norm": 6.3125, + "learning_rate": 8.562962962962963e-05, + "loss": 2.2128, + "step": 12900 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 7.71875, + "learning_rate": 8.54320987654321e-05, + "loss": 2.1793, + "step": 13000 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 5.96875, + "learning_rate": 8.523456790123457e-05, + "loss": 2.2025, + "step": 13100 + }, + { + "epoch": 0.704, + "grad_norm": 4.625, + "learning_rate": 8.503703703703703e-05, + "loss": 2.1922, + "step": 13200 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 7.0, + "learning_rate": 8.483950617283952e-05, + "loss": 2.1859, + "step": 13300 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 11.875, + "learning_rate": 8.464197530864197e-05, + "loss": 2.2153, + "step": 13400 + }, + { + "epoch": 0.72, + "grad_norm": 5.90625, + "learning_rate": 8.444444444444444e-05, + "loss": 2.245, + "step": 13500 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 4.78125, + "learning_rate": 8.424691358024693e-05, + "loss": 2.1703, + "step": 13600 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 5.84375, + "learning_rate": 8.404938271604938e-05, + "loss": 2.2208, + "step": 13700 + }, + { + "epoch": 0.736, + "grad_norm": 8.4375, + "learning_rate": 8.385185185185186e-05, + "loss": 2.0853, + "step": 13800 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 5.4375, + "learning_rate": 8.365432098765433e-05, + "loss": 2.2348, + "step": 13900 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 4.1875, + "learning_rate": 
8.34567901234568e-05, + "loss": 2.1849, + "step": 14000 + }, + { + "epoch": 0.752, + "grad_norm": 6.65625, + "learning_rate": 8.325925925925925e-05, + "loss": 2.118, + "step": 14100 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 6.5625, + "learning_rate": 8.306172839506174e-05, + "loss": 2.1696, + "step": 14200 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 8.5625, + "learning_rate": 8.28641975308642e-05, + "loss": 2.1653, + "step": 14300 + }, + { + "epoch": 0.768, + "grad_norm": 7.53125, + "learning_rate": 8.266666666666667e-05, + "loss": 2.1604, + "step": 14400 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 9.375, + "learning_rate": 8.246913580246915e-05, + "loss": 2.2172, + "step": 14500 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 5.5625, + "learning_rate": 8.227160493827161e-05, + "loss": 2.1547, + "step": 14600 + }, + { + "epoch": 0.784, + "grad_norm": 9.5625, + "learning_rate": 8.207407407407408e-05, + "loss": 2.1884, + "step": 14700 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 9.5, + "learning_rate": 8.187654320987655e-05, + "loss": 2.1089, + "step": 14800 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 6.25, + "learning_rate": 8.167901234567902e-05, + "loss": 2.137, + "step": 14900 + }, + { + "epoch": 0.8, + "grad_norm": 9.0, + "learning_rate": 8.148148148148148e-05, + "loss": 2.107, + "step": 15000 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 10.4375, + "learning_rate": 8.128395061728396e-05, + "loss": 2.2031, + "step": 15100 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 9.5, + "learning_rate": 8.108641975308643e-05, + "loss": 2.1229, + "step": 15200 + }, + { + "epoch": 0.816, + "grad_norm": 8.0625, + "learning_rate": 8.088888888888889e-05, + "loss": 2.2447, + "step": 15300 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 5.25, + "learning_rate": 8.069135802469136e-05, + "loss": 2.1696, + "step": 15400 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 5.8125, + 
"learning_rate": 8.049382716049383e-05, + "loss": 2.1187, + "step": 15500 + }, + { + "epoch": 0.832, + "grad_norm": 6.59375, + "learning_rate": 8.02962962962963e-05, + "loss": 2.1284, + "step": 15600 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 8.875, + "learning_rate": 8.009876543209877e-05, + "loss": 2.0855, + "step": 15700 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.990123456790124e-05, + "loss": 2.1295, + "step": 15800 + }, + { + "epoch": 0.848, + "grad_norm": 6.5, + "learning_rate": 7.97037037037037e-05, + "loss": 2.1085, + "step": 15900 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 6.8125, + "learning_rate": 7.950617283950618e-05, + "loss": 2.1066, + "step": 16000 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 12.0, + "learning_rate": 7.930864197530865e-05, + "loss": 2.1632, + "step": 16100 + }, + { + "epoch": 0.864, + "grad_norm": 6.6875, + "learning_rate": 7.911111111111111e-05, + "loss": 2.1311, + "step": 16200 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 5.875, + "learning_rate": 7.891358024691358e-05, + "loss": 2.09, + "step": 16300 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 6.5625, + "learning_rate": 7.871604938271605e-05, + "loss": 2.1668, + "step": 16400 + }, + { + "epoch": 0.88, + "grad_norm": 7.90625, + "learning_rate": 7.851851851851852e-05, + "loss": 2.086, + "step": 16500 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 6.0625, + "learning_rate": 7.8320987654321e-05, + "loss": 2.1314, + "step": 16600 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 8.8125, + "learning_rate": 7.812345679012346e-05, + "loss": 2.1197, + "step": 16700 + }, + { + "epoch": 0.896, + "grad_norm": 8.0625, + "learning_rate": 7.792592592592592e-05, + "loss": 2.1947, + "step": 16800 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 5.25, + "learning_rate": 7.772839506172839e-05, + "loss": 2.1226, + "step": 16900 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 
6.90625, + "learning_rate": 7.753086419753088e-05, + "loss": 2.1252, + "step": 17000 + }, + { + "epoch": 0.912, + "grad_norm": 5.46875, + "learning_rate": 7.733333333333333e-05, + "loss": 2.1168, + "step": 17100 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 6.65625, + "learning_rate": 7.71358024691358e-05, + "loss": 2.0991, + "step": 17200 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 5.0625, + "learning_rate": 7.693827160493828e-05, + "loss": 2.1109, + "step": 17300 + }, + { + "epoch": 0.928, + "grad_norm": 5.53125, + "learning_rate": 7.674074074074075e-05, + "loss": 2.1673, + "step": 17400 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 6.78125, + "learning_rate": 7.65432098765432e-05, + "loss": 2.1156, + "step": 17500 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 8.5, + "learning_rate": 7.634567901234569e-05, + "loss": 2.0908, + "step": 17600 + }, + { + "epoch": 0.944, + "grad_norm": 5.03125, + "learning_rate": 7.614814814814816e-05, + "loss": 2.11, + "step": 17700 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 7.90625, + "learning_rate": 7.595061728395062e-05, + "loss": 2.0758, + "step": 17800 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 6.3125, + "learning_rate": 7.57530864197531e-05, + "loss": 2.0879, + "step": 17900 + }, + { + "epoch": 0.96, + "grad_norm": 8.1875, + "learning_rate": 7.555555555555556e-05, + "loss": 2.1096, + "step": 18000 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 6.46875, + "learning_rate": 7.535802469135803e-05, + "loss": 2.0644, + "step": 18100 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 11.75, + "learning_rate": 7.51604938271605e-05, + "loss": 2.0952, + "step": 18200 + }, + { + "epoch": 0.976, + "grad_norm": 4.25, + "learning_rate": 7.496296296296297e-05, + "loss": 2.1121, + "step": 18300 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 5.8125, + "learning_rate": 7.476543209876543e-05, + "loss": 2.0889, + "step": 18400 + }, + { + "epoch": 0.9866666666666667, 
+ "grad_norm": 5.53125, + "learning_rate": 7.456790123456791e-05, + "loss": 2.0975, + "step": 18500 + }, + { + "epoch": 0.992, + "grad_norm": 8.6875, + "learning_rate": 7.437037037037038e-05, + "loss": 2.1112, + "step": 18600 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 6.375, + "learning_rate": 7.417283950617284e-05, + "loss": 2.1031, + "step": 18700 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 5.78125, + "learning_rate": 7.397530864197532e-05, + "loss": 1.9096, + "step": 18800 + }, + { + "epoch": 1.008, + "grad_norm": 9.0625, + "learning_rate": 7.377777777777778e-05, + "loss": 1.6546, + "step": 18900 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 8.25, + "learning_rate": 7.358024691358025e-05, + "loss": 1.734, + "step": 19000 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 6.28125, + "learning_rate": 7.338271604938272e-05, + "loss": 1.6961, + "step": 19100 + }, + { + "epoch": 1.024, + "grad_norm": 6.5625, + "learning_rate": 7.318518518518519e-05, + "loss": 1.647, + "step": 19200 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 6.9375, + "learning_rate": 7.298765432098765e-05, + "loss": 1.678, + "step": 19300 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 6.09375, + "learning_rate": 7.279012345679013e-05, + "loss": 1.6691, + "step": 19400 + }, + { + "epoch": 1.04, + "grad_norm": 7.9375, + "learning_rate": 7.25925925925926e-05, + "loss": 1.7127, + "step": 19500 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 8.1875, + "learning_rate": 7.239506172839506e-05, + "loss": 1.6539, + "step": 19600 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 4.09375, + "learning_rate": 7.219753086419753e-05, + "loss": 1.6652, + "step": 19700 + }, + { + "epoch": 1.056, + "grad_norm": 4.84375, + "learning_rate": 7.2e-05, + "loss": 1.7378, + "step": 19800 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 7.53125, + "learning_rate": 7.180246913580247e-05, + "loss": 1.6836, + "step": 19900 + }, + { + "epoch": 
1.0666666666666667, + "grad_norm": 7.21875, + "learning_rate": 7.160493827160494e-05, + "loss": 1.7519, + "step": 20000 + }, + { + "epoch": 1.072, + "grad_norm": 7.28125, + "learning_rate": 7.140740740740741e-05, + "loss": 1.6667, + "step": 20100 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 11.0625, + "learning_rate": 7.120987654320987e-05, + "loss": 1.6718, + "step": 20200 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 6.90625, + "learning_rate": 7.101234567901236e-05, + "loss": 1.7361, + "step": 20300 + }, + { + "epoch": 1.088, + "grad_norm": 7.34375, + "learning_rate": 7.081481481481483e-05, + "loss": 1.6885, + "step": 20400 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 9.5, + "learning_rate": 7.061728395061728e-05, + "loss": 1.7336, + "step": 20500 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 9.6875, + "learning_rate": 7.041975308641975e-05, + "loss": 1.6883, + "step": 20600 + }, + { + "epoch": 1.104, + "grad_norm": 8.8125, + "learning_rate": 7.022222222222222e-05, + "loss": 1.6396, + "step": 20700 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 6.21875, + "learning_rate": 7.00246913580247e-05, + "loss": 1.6886, + "step": 20800 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 13.625, + "learning_rate": 6.982716049382717e-05, + "loss": 1.6706, + "step": 20900 + }, + { + "epoch": 1.12, + "grad_norm": 4.53125, + "learning_rate": 6.962962962962964e-05, + "loss": 1.6766, + "step": 21000 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 7.46875, + "learning_rate": 6.943209876543211e-05, + "loss": 1.6789, + "step": 21100 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 6.1875, + "learning_rate": 6.923456790123456e-05, + "loss": 1.7217, + "step": 21200 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 9.75, + "learning_rate": 6.903703703703705e-05, + "loss": 1.6726, + "step": 21300 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 8.5625, + "learning_rate": 6.88395061728395e-05, + "loss": 1.7288, + 
"step": 21400 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 7.03125, + "learning_rate": 6.864197530864198e-05, + "loss": 1.6323, + "step": 21500 + }, + { + "epoch": 1.152, + "grad_norm": 11.8125, + "learning_rate": 6.844444444444445e-05, + "loss": 1.7222, + "step": 21600 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 5.28125, + "learning_rate": 6.824691358024692e-05, + "loss": 1.6429, + "step": 21700 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 6.5625, + "learning_rate": 6.804938271604938e-05, + "loss": 1.6679, + "step": 21800 + }, + { + "epoch": 1.168, + "grad_norm": 7.75, + "learning_rate": 6.785185185185186e-05, + "loss": 1.6387, + "step": 21900 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 7.0625, + "learning_rate": 6.765432098765433e-05, + "loss": 1.6457, + "step": 22000 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 6.59375, + "learning_rate": 6.745679012345679e-05, + "loss": 1.7333, + "step": 22100 + }, + { + "epoch": 1.184, + "grad_norm": 4.71875, + "learning_rate": 6.725925925925927e-05, + "loss": 1.7307, + "step": 22200 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 6.71875, + "learning_rate": 6.706172839506173e-05, + "loss": 1.7475, + "step": 22300 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 5.46875, + "learning_rate": 6.68641975308642e-05, + "loss": 1.6626, + "step": 22400 + }, + { + "epoch": 1.2, + "grad_norm": 5.71875, + "learning_rate": 6.666666666666667e-05, + "loss": 1.6603, + "step": 22500 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 5.90625, + "learning_rate": 6.646913580246914e-05, + "loss": 1.7291, + "step": 22600 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 7.40625, + "learning_rate": 6.62716049382716e-05, + "loss": 1.7231, + "step": 22700 + }, + { + "epoch": 1.216, + "grad_norm": 4.8125, + "learning_rate": 6.607407407407408e-05, + "loss": 1.6072, + "step": 22800 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 10.5, + "learning_rate": 
6.587654320987655e-05, + "loss": 1.7127, + "step": 22900 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 5.71875, + "learning_rate": 6.567901234567901e-05, + "loss": 1.7209, + "step": 23000 + }, + { + "epoch": 1.232, + "grad_norm": 6.0, + "learning_rate": 6.54814814814815e-05, + "loss": 1.7039, + "step": 23100 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 10.3125, + "learning_rate": 6.528395061728395e-05, + "loss": 1.7275, + "step": 23200 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 5.5625, + "learning_rate": 6.508641975308642e-05, + "loss": 1.7337, + "step": 23300 + }, + { + "epoch": 1.248, + "grad_norm": 5.90625, + "learning_rate": 6.488888888888889e-05, + "loss": 1.6821, + "step": 23400 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 5.875, + "learning_rate": 6.469135802469136e-05, + "loss": 1.7188, + "step": 23500 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 5.84375, + "learning_rate": 6.449382716049382e-05, + "loss": 1.7119, + "step": 23600 + }, + { + "epoch": 1.264, + "grad_norm": 8.125, + "learning_rate": 6.42962962962963e-05, + "loss": 1.6742, + "step": 23700 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 4.96875, + "learning_rate": 6.409876543209878e-05, + "loss": 1.6378, + "step": 23800 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 5.40625, + "learning_rate": 6.390123456790123e-05, + "loss": 1.6826, + "step": 23900 + }, + { + "epoch": 1.28, + "grad_norm": 5.96875, + "learning_rate": 6.37037037037037e-05, + "loss": 1.712, + "step": 24000 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 6.3125, + "learning_rate": 6.350617283950617e-05, + "loss": 1.7673, + "step": 24100 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 5.375, + "learning_rate": 6.330864197530864e-05, + "loss": 1.5944, + "step": 24200 + }, + { + "epoch": 1.296, + "grad_norm": 8.0, + "learning_rate": 6.311111111111112e-05, + "loss": 1.7515, + "step": 24300 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 5.53125, + 
"learning_rate": 6.291358024691359e-05, + "loss": 1.739, + "step": 24400 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 4.6875, + "learning_rate": 6.271604938271606e-05, + "loss": 1.744, + "step": 24500 + }, + { + "epoch": 1.312, + "grad_norm": 11.9375, + "learning_rate": 6.251851851851853e-05, + "loss": 1.6566, + "step": 24600 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 11.4375, + "learning_rate": 6.2320987654321e-05, + "loss": 1.6289, + "step": 24700 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 11.1875, + "learning_rate": 6.212345679012346e-05, + "loss": 1.686, + "step": 24800 + }, + { + "epoch": 1.328, + "grad_norm": 6.21875, + "learning_rate": 6.192592592592593e-05, + "loss": 1.66, + "step": 24900 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 5.5, + "learning_rate": 6.17283950617284e-05, + "loss": 1.6724, + "step": 25000 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 6.46875, + "learning_rate": 6.153086419753087e-05, + "loss": 1.7236, + "step": 25100 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 6.6875, + "learning_rate": 6.133333333333334e-05, + "loss": 1.6676, + "step": 25200 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 6.84375, + "learning_rate": 6.113580246913581e-05, + "loss": 1.6966, + "step": 25300 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 6.09375, + "learning_rate": 6.093827160493828e-05, + "loss": 1.6573, + "step": 25400 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 6.53125, + "learning_rate": 6.074074074074074e-05, + "loss": 1.7067, + "step": 25500 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 5.0, + "learning_rate": 6.0543209876543214e-05, + "loss": 1.6531, + "step": 25600 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 4.3125, + "learning_rate": 6.034567901234568e-05, + "loss": 1.6951, + "step": 25700 + }, + { + "epoch": 1.376, + "grad_norm": 6.84375, + "learning_rate": 6.0148148148148155e-05, + "loss": 1.6101, + "step": 25800 + }, + { + "epoch": 
1.3813333333333333, + "grad_norm": 5.8125, + "learning_rate": 5.995061728395062e-05, + "loss": 1.7114, + "step": 25900 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 6.375, + "learning_rate": 5.975308641975309e-05, + "loss": 1.6413, + "step": 26000 + }, + { + "epoch": 1.392, + "grad_norm": 5.5, + "learning_rate": 5.9555555555555554e-05, + "loss": 1.6189, + "step": 26100 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 6.28125, + "learning_rate": 5.9358024691358024e-05, + "loss": 1.6949, + "step": 26200 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 5.25, + "learning_rate": 5.91604938271605e-05, + "loss": 1.6616, + "step": 26300 + }, + { + "epoch": 1.408, + "grad_norm": 8.625, + "learning_rate": 5.8962962962962966e-05, + "loss": 1.6484, + "step": 26400 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 4.96875, + "learning_rate": 5.8765432098765437e-05, + "loss": 1.599, + "step": 26500 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 4.40625, + "learning_rate": 5.85679012345679e-05, + "loss": 1.6366, + "step": 26600 + }, + { + "epoch": 1.424, + "grad_norm": 9.8125, + "learning_rate": 5.837037037037038e-05, + "loss": 1.7065, + "step": 26700 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 5.46875, + "learning_rate": 5.8172839506172835e-05, + "loss": 1.6841, + "step": 26800 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 4.9375, + "learning_rate": 5.797530864197531e-05, + "loss": 1.66, + "step": 26900 + }, + { + "epoch": 1.44, + "grad_norm": 5.375, + "learning_rate": 5.7777777777777776e-05, + "loss": 1.6645, + "step": 27000 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 5.875, + "learning_rate": 5.758024691358025e-05, + "loss": 1.6354, + "step": 27100 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 6.90625, + "learning_rate": 5.7382716049382725e-05, + "loss": 1.626, + "step": 27200 + }, + { + "epoch": 1.456, + "grad_norm": 6.5, + "learning_rate": 5.718518518518519e-05, + "loss": 1.6265, + "step": 27300 + }, + 
{ + "epoch": 1.4613333333333334, + "grad_norm": 9.25, + "learning_rate": 5.698765432098766e-05, + "loss": 1.6879, + "step": 27400 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 6.1875, + "learning_rate": 5.679012345679012e-05, + "loss": 1.6756, + "step": 27500 + }, + { + "epoch": 1.472, + "grad_norm": 6.0625, + "learning_rate": 5.6592592592592594e-05, + "loss": 1.748, + "step": 27600 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 7.1875, + "learning_rate": 5.639506172839506e-05, + "loss": 1.668, + "step": 27700 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 11.375, + "learning_rate": 5.6197530864197535e-05, + "loss": 1.6842, + "step": 27800 + }, + { + "epoch": 1.488, + "grad_norm": 5.125, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.7157, + "step": 27900 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 5.5, + "learning_rate": 5.580246913580247e-05, + "loss": 1.6674, + "step": 28000 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 5.6875, + "learning_rate": 5.560493827160495e-05, + "loss": 1.6131, + "step": 28100 + }, + { + "epoch": 1.504, + "grad_norm": 4.5, + "learning_rate": 5.540740740740741e-05, + "loss": 1.7084, + "step": 28200 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 5.15625, + "learning_rate": 5.520987654320988e-05, + "loss": 1.5791, + "step": 28300 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 6.96875, + "learning_rate": 5.5012345679012346e-05, + "loss": 1.5846, + "step": 28400 + }, + { + "epoch": 1.52, + "grad_norm": 11.875, + "learning_rate": 5.4814814814814817e-05, + "loss": 1.6353, + "step": 28500 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 8.3125, + "learning_rate": 5.461728395061728e-05, + "loss": 1.6686, + "step": 28600 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 13.6875, + "learning_rate": 5.441975308641976e-05, + "loss": 1.6609, + "step": 28700 + }, + { + "epoch": 1.536, + "grad_norm": 7.6875, + "learning_rate": 5.422222222222223e-05, + "loss": 1.6264, + 
"step": 28800 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 8.125, + "learning_rate": 5.402469135802469e-05, + "loss": 1.6539, + "step": 28900 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 7.4375, + "learning_rate": 5.382716049382717e-05, + "loss": 1.6946, + "step": 29000 + }, + { + "epoch": 1.552, + "grad_norm": 7.09375, + "learning_rate": 5.362962962962963e-05, + "loss": 1.6258, + "step": 29100 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 4.53125, + "learning_rate": 5.3432098765432105e-05, + "loss": 1.6388, + "step": 29200 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 5.4375, + "learning_rate": 5.323456790123457e-05, + "loss": 1.6131, + "step": 29300 + }, + { + "epoch": 1.568, + "grad_norm": 7.15625, + "learning_rate": 5.303703703703704e-05, + "loss": 1.5935, + "step": 29400 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 9.8125, + "learning_rate": 5.28395061728395e-05, + "loss": 1.6357, + "step": 29500 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 6.625, + "learning_rate": 5.264197530864198e-05, + "loss": 1.6733, + "step": 29600 + }, + { + "epoch": 1.584, + "grad_norm": 5.0, + "learning_rate": 5.244444444444445e-05, + "loss": 1.7063, + "step": 29700 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 6.625, + "learning_rate": 5.2246913580246915e-05, + "loss": 1.6056, + "step": 29800 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 6.90625, + "learning_rate": 5.2049382716049386e-05, + "loss": 1.6357, + "step": 29900 + }, + { + "epoch": 1.6, + "grad_norm": 7.5, + "learning_rate": 5.185185185185185e-05, + "loss": 1.6332, + "step": 30000 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 7.84375, + "learning_rate": 5.165432098765433e-05, + "loss": 1.6458, + "step": 30100 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 15.375, + "learning_rate": 5.145679012345679e-05, + "loss": 1.5787, + "step": 30200 + }, + { + "epoch": 1.616, + "grad_norm": 8.5625, + "learning_rate": 5.125925925925926e-05, + 
"loss": 1.6441, + "step": 30300 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 5.9375, + "learning_rate": 5.1061728395061726e-05, + "loss": 1.6211, + "step": 30400 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 6.09375, + "learning_rate": 5.0864197530864197e-05, + "loss": 1.6304, + "step": 30500 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.40625, + "learning_rate": 5.0666666666666674e-05, + "loss": 1.6111, + "step": 30600 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 7.625, + "learning_rate": 5.046913580246914e-05, + "loss": 1.6387, + "step": 30700 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 4.875, + "learning_rate": 5.027160493827161e-05, + "loss": 1.6418, + "step": 30800 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.25, + "learning_rate": 5.007407407407407e-05, + "loss": 1.6082, + "step": 30900 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 5.125, + "learning_rate": 4.987654320987655e-05, + "loss": 1.5755, + "step": 31000 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 9.0625, + "learning_rate": 4.9679012345679014e-05, + "loss": 1.6432, + "step": 31100 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 6.0, + "learning_rate": 4.9481481481481485e-05, + "loss": 1.6333, + "step": 31200 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 6.65625, + "learning_rate": 4.9283950617283955e-05, + "loss": 1.6183, + "step": 31300 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 7.28125, + "learning_rate": 4.908641975308642e-05, + "loss": 1.5636, + "step": 31400 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.125, + "learning_rate": 4.888888888888889e-05, + "loss": 1.621, + "step": 31500 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 6.46875, + "learning_rate": 4.869135802469136e-05, + "loss": 1.7226, + "step": 31600 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 5.875, + "learning_rate": 4.849382716049383e-05, + "loss": 1.6311, + "step": 31700 + }, + { + "epoch": 
1.696, + "grad_norm": 5.875, + "learning_rate": 4.82962962962963e-05, + "loss": 1.6132, + "step": 31800 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 5.375, + "learning_rate": 4.8098765432098766e-05, + "loss": 1.5931, + "step": 31900 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 13.0625, + "learning_rate": 4.7901234567901237e-05, + "loss": 1.6958, + "step": 32000 + }, + { + "epoch": 1.712, + "grad_norm": 6.40625, + "learning_rate": 4.770370370370371e-05, + "loss": 1.6209, + "step": 32100 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 13.5625, + "learning_rate": 4.750617283950617e-05, + "loss": 1.6031, + "step": 32200 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 8.5, + "learning_rate": 4.730864197530864e-05, + "loss": 1.6279, + "step": 32300 + }, + { + "epoch": 1.728, + "grad_norm": 7.5, + "learning_rate": 4.711111111111111e-05, + "loss": 1.5793, + "step": 32400 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 6.75, + "learning_rate": 4.691358024691358e-05, + "loss": 1.6628, + "step": 32500 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 6.21875, + "learning_rate": 4.6716049382716054e-05, + "loss": 1.671, + "step": 32600 + }, + { + "epoch": 1.744, + "grad_norm": 5.875, + "learning_rate": 4.6518518518518525e-05, + "loss": 1.6886, + "step": 32700 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 5.15625, + "learning_rate": 4.632098765432099e-05, + "loss": 1.6598, + "step": 32800 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 9.6875, + "learning_rate": 4.612345679012346e-05, + "loss": 1.5773, + "step": 32900 + }, + { + "epoch": 1.76, + "grad_norm": 6.125, + "learning_rate": 4.592592592592593e-05, + "loss": 1.6603, + "step": 33000 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 5.8125, + "learning_rate": 4.5728395061728394e-05, + "loss": 1.6405, + "step": 33100 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 5.34375, + "learning_rate": 4.5530864197530865e-05, + "loss": 1.6776, + "step": 33200 + 
}, + { + "epoch": 1.776, + "grad_norm": 5.5625, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.5413, + "step": 33300 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 8.875, + "learning_rate": 4.5135802469135806e-05, + "loss": 1.6298, + "step": 33400 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 6.28125, + "learning_rate": 4.493827160493828e-05, + "loss": 1.5795, + "step": 33500 + }, + { + "epoch": 1.792, + "grad_norm": 6.65625, + "learning_rate": 4.474074074074075e-05, + "loss": 1.7145, + "step": 33600 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 7.96875, + "learning_rate": 4.454320987654321e-05, + "loss": 1.6492, + "step": 33700 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 10.625, + "learning_rate": 4.434567901234568e-05, + "loss": 1.5981, + "step": 33800 + }, + { + "epoch": 1.808, + "grad_norm": 5.65625, + "learning_rate": 4.414814814814815e-05, + "loss": 1.5606, + "step": 33900 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 4.90625, + "learning_rate": 4.3950617283950617e-05, + "loss": 1.5981, + "step": 34000 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 6.0, + "learning_rate": 4.375308641975309e-05, + "loss": 1.5976, + "step": 34100 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.5625, + "learning_rate": 4.355555555555556e-05, + "loss": 1.6783, + "step": 34200 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 6.96875, + "learning_rate": 4.335802469135803e-05, + "loss": 1.6716, + "step": 34300 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 4.6875, + "learning_rate": 4.31604938271605e-05, + "loss": 1.5989, + "step": 34400 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 7.71875, + "learning_rate": 4.296296296296296e-05, + "loss": 1.6317, + "step": 34500 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 5.78125, + "learning_rate": 4.2765432098765434e-05, + "loss": 1.6327, + "step": 34600 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 5.59375, + "learning_rate": 
4.2567901234567905e-05, + "loss": 1.5324, + "step": 34700 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 7.65625, + "learning_rate": 4.237037037037037e-05, + "loss": 1.6141, + "step": 34800 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 9.4375, + "learning_rate": 4.217283950617284e-05, + "loss": 1.6398, + "step": 34900 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 8.875, + "learning_rate": 4.197530864197531e-05, + "loss": 1.5835, + "step": 35000 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 12.0625, + "learning_rate": 4.177777777777778e-05, + "loss": 1.633, + "step": 35100 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 8.375, + "learning_rate": 4.158024691358025e-05, + "loss": 1.6851, + "step": 35200 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 11.5625, + "learning_rate": 4.138271604938272e-05, + "loss": 1.6436, + "step": 35300 + }, + { + "epoch": 1.888, + "grad_norm": 7.78125, + "learning_rate": 4.1185185185185186e-05, + "loss": 1.6268, + "step": 35400 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 11.1875, + "learning_rate": 4.0987654320987657e-05, + "loss": 1.5537, + "step": 35500 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 10.8125, + "learning_rate": 4.079012345679013e-05, + "loss": 1.6954, + "step": 35600 + }, + { + "epoch": 1.904, + "grad_norm": 10.625, + "learning_rate": 4.059259259259259e-05, + "loss": 1.6122, + "step": 35700 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 4.4375, + "learning_rate": 4.039506172839506e-05, + "loss": 1.6308, + "step": 35800 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 5.4375, + "learning_rate": 4.019753086419753e-05, + "loss": 1.6331, + "step": 35900 + }, + { + "epoch": 1.92, + "grad_norm": 5.125, + "learning_rate": 4e-05, + "loss": 1.5898, + "step": 36000 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 13.5625, + "learning_rate": 3.9802469135802474e-05, + "loss": 1.6748, + "step": 36100 + }, + { + "epoch": 1.9306666666666668, + 
"grad_norm": 5.40625, + "learning_rate": 3.960493827160494e-05, + "loss": 1.6326, + "step": 36200 + }, + { + "epoch": 1.936, + "grad_norm": 8.0, + "learning_rate": 3.940740740740741e-05, + "loss": 1.6027, + "step": 36300 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 12.625, + "learning_rate": 3.920987654320988e-05, + "loss": 1.5298, + "step": 36400 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 5.875, + "learning_rate": 3.901234567901234e-05, + "loss": 1.6354, + "step": 36500 + }, + { + "epoch": 1.952, + "grad_norm": 5.40625, + "learning_rate": 3.8814814814814814e-05, + "loss": 1.6155, + "step": 36600 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 5.15625, + "learning_rate": 3.8617283950617285e-05, + "loss": 1.6524, + "step": 36700 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 8.0625, + "learning_rate": 3.8419753086419755e-05, + "loss": 1.6594, + "step": 36800 + }, + { + "epoch": 1.968, + "grad_norm": 11.0, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.6397, + "step": 36900 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 6.96875, + "learning_rate": 3.80246913580247e-05, + "loss": 1.6208, + "step": 37000 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 9.125, + "learning_rate": 3.782716049382716e-05, + "loss": 1.5995, + "step": 37100 + }, + { + "epoch": 1.984, + "grad_norm": 8.8125, + "learning_rate": 3.762962962962963e-05, + "loss": 1.59, + "step": 37200 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 8.1875, + "learning_rate": 3.74320987654321e-05, + "loss": 1.6343, + "step": 37300 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 7.65625, + "learning_rate": 3.7234567901234566e-05, + "loss": 1.6007, + "step": 37400 + }, + { + "epoch": 2.0, + "grad_norm": 6.125, + "learning_rate": 3.7037037037037037e-05, + "loss": 1.6295, + "step": 37500 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 6.65625, + "learning_rate": 3.683950617283951e-05, + "loss": 1.2317, + "step": 37600 + }, + { + "epoch": 
2.010666666666667, + "grad_norm": 6.9375, + "learning_rate": 3.664197530864198e-05, + "loss": 1.3769, + "step": 37700 + }, + { + "epoch": 2.016, + "grad_norm": 5.46875, + "learning_rate": 3.644444444444445e-05, + "loss": 1.3206, + "step": 37800 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 7.3125, + "learning_rate": 3.624691358024692e-05, + "loss": 1.2903, + "step": 37900 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.604938271604938e-05, + "loss": 1.3443, + "step": 38000 + }, + { + "epoch": 2.032, + "grad_norm": 6.375, + "learning_rate": 3.5851851851851854e-05, + "loss": 1.291, + "step": 38100 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 7.84375, + "learning_rate": 3.5654320987654325e-05, + "loss": 1.2552, + "step": 38200 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 8.9375, + "learning_rate": 3.545679012345679e-05, + "loss": 1.2883, + "step": 38300 + }, + { + "epoch": 2.048, + "grad_norm": 6.09375, + "learning_rate": 3.525925925925926e-05, + "loss": 1.2755, + "step": 38400 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 6.0625, + "learning_rate": 3.506172839506173e-05, + "loss": 1.3612, + "step": 38500 + }, + { + "epoch": 2.058666666666667, + "grad_norm": 8.625, + "learning_rate": 3.48641975308642e-05, + "loss": 1.2394, + "step": 38600 + }, + { + "epoch": 2.064, + "grad_norm": 8.25, + "learning_rate": 3.466666666666667e-05, + "loss": 1.3005, + "step": 38700 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 7.125, + "learning_rate": 3.4469135802469135e-05, + "loss": 1.3219, + "step": 38800 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 6.6875, + "learning_rate": 3.4271604938271606e-05, + "loss": 1.3388, + "step": 38900 + }, + { + "epoch": 2.08, + "grad_norm": 7.4375, + "learning_rate": 3.4074074074074077e-05, + "loss": 1.3317, + "step": 39000 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 5.15625, + "learning_rate": 3.387654320987654e-05, + "loss": 1.2546, + "step": 39100 + }, 
+ { + "epoch": 2.0906666666666665, + "grad_norm": 6.71875, + "learning_rate": 3.367901234567901e-05, + "loss": 1.3502, + "step": 39200 + }, + { + "epoch": 2.096, + "grad_norm": 7.28125, + "learning_rate": 3.348148148148148e-05, + "loss": 1.2733, + "step": 39300 + }, + { + "epoch": 2.1013333333333333, + "grad_norm": 8.125, + "learning_rate": 3.328395061728395e-05, + "loss": 1.2879, + "step": 39400 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 6.5625, + "learning_rate": 3.308641975308642e-05, + "loss": 1.2175, + "step": 39500 + }, + { + "epoch": 2.112, + "grad_norm": 7.375, + "learning_rate": 3.2888888888888894e-05, + "loss": 1.3628, + "step": 39600 + }, + { + "epoch": 2.1173333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.269135802469136e-05, + "loss": 1.2937, + "step": 39700 + }, + { + "epoch": 2.1226666666666665, + "grad_norm": 5.9375, + "learning_rate": 3.249382716049383e-05, + "loss": 1.2451, + "step": 39800 + }, + { + "epoch": 2.128, + "grad_norm": 9.6875, + "learning_rate": 3.22962962962963e-05, + "loss": 1.3379, + "step": 39900 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 7.5, + "learning_rate": 3.209876543209876e-05, + "loss": 1.2934, + "step": 40000 + }, + { + "epoch": 2.1386666666666665, + "grad_norm": 5.84375, + "learning_rate": 3.1901234567901234e-05, + "loss": 1.2618, + "step": 40100 + }, + { + "epoch": 2.144, + "grad_norm": 6.875, + "learning_rate": 3.1703703703703705e-05, + "loss": 1.384, + "step": 40200 + }, + { + "epoch": 2.1493333333333333, + "grad_norm": 7.34375, + "learning_rate": 3.1506172839506175e-05, + "loss": 1.2611, + "step": 40300 + }, + { + "epoch": 2.1546666666666665, + "grad_norm": 8.0, + "learning_rate": 3.1308641975308646e-05, + "loss": 1.2923, + "step": 40400 + }, + { + "epoch": 2.16, + "grad_norm": 7.53125, + "learning_rate": 3.111111111111111e-05, + "loss": 1.2947, + "step": 40500 + }, + { + "epoch": 2.1653333333333333, + "grad_norm": 8.125, + "learning_rate": 3.091358024691358e-05, + "loss": 1.283, + 
"step": 40600 + }, + { + "epoch": 2.1706666666666665, + "grad_norm": 7.625, + "learning_rate": 3.071604938271605e-05, + "loss": 1.3939, + "step": 40700 + }, + { + "epoch": 2.176, + "grad_norm": 8.0625, + "learning_rate": 3.0518518518518515e-05, + "loss": 1.3395, + "step": 40800 + }, + { + "epoch": 2.1813333333333333, + "grad_norm": 10.3125, + "learning_rate": 3.0320987654320986e-05, + "loss": 1.2382, + "step": 40900 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 5.8125, + "learning_rate": 3.012345679012346e-05, + "loss": 1.346, + "step": 41000 + }, + { + "epoch": 2.192, + "grad_norm": 12.0, + "learning_rate": 2.992592592592593e-05, + "loss": 1.348, + "step": 41100 + }, + { + "epoch": 2.1973333333333334, + "grad_norm": 7.0, + "learning_rate": 2.9728395061728398e-05, + "loss": 1.2885, + "step": 41200 + }, + { + "epoch": 2.2026666666666666, + "grad_norm": 10.9375, + "learning_rate": 2.9530864197530865e-05, + "loss": 1.2577, + "step": 41300 + }, + { + "epoch": 2.208, + "grad_norm": 7.0625, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.3698, + "step": 41400 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 5.6875, + "learning_rate": 2.9135802469135803e-05, + "loss": 1.2787, + "step": 41500 + }, + { + "epoch": 2.2186666666666666, + "grad_norm": 11.0625, + "learning_rate": 2.893827160493827e-05, + "loss": 1.299, + "step": 41600 + }, + { + "epoch": 2.224, + "grad_norm": 16.5, + "learning_rate": 2.874074074074074e-05, + "loss": 1.3493, + "step": 41700 + }, + { + "epoch": 2.2293333333333334, + "grad_norm": 7.71875, + "learning_rate": 2.854320987654321e-05, + "loss": 1.232, + "step": 41800 + }, + { + "epoch": 2.2346666666666666, + "grad_norm": 7.3125, + "learning_rate": 2.8345679012345683e-05, + "loss": 1.2965, + "step": 41900 + }, + { + "epoch": 2.24, + "grad_norm": 4.875, + "learning_rate": 2.814814814814815e-05, + "loss": 1.2932, + "step": 42000 + }, + { + "epoch": 2.2453333333333334, + "grad_norm": 8.5625, + "learning_rate": 2.795061728395062e-05, + 
"loss": 1.2689, + "step": 42100 + }, + { + "epoch": 2.2506666666666666, + "grad_norm": 8.5625, + "learning_rate": 2.7753086419753088e-05, + "loss": 1.3437, + "step": 42200 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 11.375, + "learning_rate": 2.7555555555555555e-05, + "loss": 1.3957, + "step": 42300 + }, + { + "epoch": 2.2613333333333334, + "grad_norm": 7.125, + "learning_rate": 2.7358024691358026e-05, + "loss": 1.2948, + "step": 42400 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 6.90625, + "learning_rate": 2.7160493827160493e-05, + "loss": 1.2896, + "step": 42500 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 8.3125, + "learning_rate": 2.696296296296296e-05, + "loss": 1.2483, + "step": 42600 + }, + { + "epoch": 2.2773333333333334, + "grad_norm": 6.40625, + "learning_rate": 2.6765432098765435e-05, + "loss": 1.3159, + "step": 42700 + }, + { + "epoch": 2.2826666666666666, + "grad_norm": 6.59375, + "learning_rate": 2.6567901234567905e-05, + "loss": 1.2742, + "step": 42800 + }, + { + "epoch": 2.288, + "grad_norm": 7.21875, + "learning_rate": 2.6370370370370373e-05, + "loss": 1.3353, + "step": 42900 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 5.46875, + "learning_rate": 2.617283950617284e-05, + "loss": 1.3093, + "step": 43000 + }, + { + "epoch": 2.2986666666666666, + "grad_norm": 8.3125, + "learning_rate": 2.597530864197531e-05, + "loss": 1.2342, + "step": 43100 + }, + { + "epoch": 2.304, + "grad_norm": 7.09375, + "learning_rate": 2.5777777777777778e-05, + "loss": 1.3586, + "step": 43200 + }, + { + "epoch": 2.3093333333333335, + "grad_norm": 11.625, + "learning_rate": 2.558024691358025e-05, + "loss": 1.2999, + "step": 43300 + }, + { + "epoch": 2.3146666666666667, + "grad_norm": 7.75, + "learning_rate": 2.5382716049382716e-05, + "loss": 1.2873, + "step": 43400 + }, + { + "epoch": 2.32, + "grad_norm": 8.0, + "learning_rate": 2.5185185185185183e-05, + "loss": 1.3057, + "step": 43500 + }, + { + "epoch": 2.3253333333333335, + 
"grad_norm": 9.1875, + "learning_rate": 2.4987654320987654e-05, + "loss": 1.3544, + "step": 43600 + }, + { + "epoch": 2.3306666666666667, + "grad_norm": 7.71875, + "learning_rate": 2.4790123456790125e-05, + "loss": 1.333, + "step": 43700 + }, + { + "epoch": 2.336, + "grad_norm": 6.21875, + "learning_rate": 2.4592592592592595e-05, + "loss": 1.2135, + "step": 43800 + }, + { + "epoch": 2.3413333333333335, + "grad_norm": 6.59375, + "learning_rate": 2.4395061728395063e-05, + "loss": 1.3494, + "step": 43900 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 8.1875, + "learning_rate": 2.4197530864197533e-05, + "loss": 1.3179, + "step": 44000 + }, + { + "epoch": 2.352, + "grad_norm": 5.9375, + "learning_rate": 2.4e-05, + "loss": 1.401, + "step": 44100 + }, + { + "epoch": 2.3573333333333335, + "grad_norm": 8.125, + "learning_rate": 2.380246913580247e-05, + "loss": 1.2905, + "step": 44200 + }, + { + "epoch": 2.3626666666666667, + "grad_norm": 6.5625, + "learning_rate": 2.360493827160494e-05, + "loss": 1.3236, + "step": 44300 + }, + { + "epoch": 2.368, + "grad_norm": 7.71875, + "learning_rate": 2.340740740740741e-05, + "loss": 1.2924, + "step": 44400 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 10.1875, + "learning_rate": 2.3209876543209877e-05, + "loss": 1.3823, + "step": 44500 + }, + { + "epoch": 2.3786666666666667, + "grad_norm": 9.0625, + "learning_rate": 2.3012345679012347e-05, + "loss": 1.2555, + "step": 44600 + }, + { + "epoch": 2.384, + "grad_norm": 6.53125, + "learning_rate": 2.2814814814814818e-05, + "loss": 1.319, + "step": 44700 + }, + { + "epoch": 2.389333333333333, + "grad_norm": 6.28125, + "learning_rate": 2.2617283950617285e-05, + "loss": 1.3722, + "step": 44800 + }, + { + "epoch": 2.3946666666666667, + "grad_norm": 8.75, + "learning_rate": 2.2419753086419753e-05, + "loss": 1.2535, + "step": 44900 + }, + { + "epoch": 2.4, + "grad_norm": 5.75, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.3091, + "step": 45000 + }, + { + "epoch": 
2.405333333333333, + "grad_norm": 7.09375, + "learning_rate": 2.2024691358024694e-05, + "loss": 1.3417, + "step": 45100 + }, + { + "epoch": 2.4106666666666667, + "grad_norm": 6.21875, + "learning_rate": 2.182716049382716e-05, + "loss": 1.318, + "step": 45200 + }, + { + "epoch": 2.416, + "grad_norm": 6.09375, + "learning_rate": 2.162962962962963e-05, + "loss": 1.2971, + "step": 45300 + }, + { + "epoch": 2.421333333333333, + "grad_norm": 6.875, + "learning_rate": 2.14320987654321e-05, + "loss": 1.3866, + "step": 45400 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 5.9375, + "learning_rate": 2.123456790123457e-05, + "loss": 1.2945, + "step": 45500 + }, + { + "epoch": 2.432, + "grad_norm": 7.4375, + "learning_rate": 2.1037037037037037e-05, + "loss": 1.2541, + "step": 45600 + }, + { + "epoch": 2.437333333333333, + "grad_norm": 5.71875, + "learning_rate": 2.0839506172839508e-05, + "loss": 1.3282, + "step": 45700 + }, + { + "epoch": 2.4426666666666668, + "grad_norm": 12.1875, + "learning_rate": 2.0641975308641975e-05, + "loss": 1.3743, + "step": 45800 + }, + { + "epoch": 2.448, + "grad_norm": 5.0, + "learning_rate": 2.0444444444444446e-05, + "loss": 1.2689, + "step": 45900 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 8.5625, + "learning_rate": 2.0246913580246917e-05, + "loss": 1.347, + "step": 46000 + }, + { + "epoch": 2.458666666666667, + "grad_norm": 7.25, + "learning_rate": 2.0049382716049384e-05, + "loss": 1.3629, + "step": 46100 + }, + { + "epoch": 2.464, + "grad_norm": 12.5, + "learning_rate": 1.985185185185185e-05, + "loss": 1.2604, + "step": 46200 + }, + { + "epoch": 2.469333333333333, + "grad_norm": 7.03125, + "learning_rate": 1.9654320987654322e-05, + "loss": 1.3428, + "step": 46300 + }, + { + "epoch": 2.474666666666667, + "grad_norm": 7.8125, + "learning_rate": 1.9456790123456793e-05, + "loss": 1.2956, + "step": 46400 + }, + { + "epoch": 2.48, + "grad_norm": 7.21875, + "learning_rate": 1.925925925925926e-05, + "loss": 1.2986, + "step": 46500 + 
}, + { + "epoch": 2.485333333333333, + "grad_norm": 8.3125, + "learning_rate": 1.9061728395061727e-05, + "loss": 1.2794, + "step": 46600 + }, + { + "epoch": 2.490666666666667, + "grad_norm": 8.125, + "learning_rate": 1.8864197530864198e-05, + "loss": 1.3091, + "step": 46700 + }, + { + "epoch": 2.496, + "grad_norm": 6.59375, + "learning_rate": 1.866666666666667e-05, + "loss": 1.2405, + "step": 46800 + }, + { + "epoch": 2.501333333333333, + "grad_norm": 9.25, + "learning_rate": 1.8469135802469136e-05, + "loss": 1.2841, + "step": 46900 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 5.78125, + "learning_rate": 1.8271604938271607e-05, + "loss": 1.3305, + "step": 47000 + }, + { + "epoch": 2.512, + "grad_norm": 8.375, + "learning_rate": 1.8074074074074074e-05, + "loss": 1.3659, + "step": 47100 + }, + { + "epoch": 2.517333333333333, + "grad_norm": 7.0625, + "learning_rate": 1.7876543209876545e-05, + "loss": 1.2434, + "step": 47200 + }, + { + "epoch": 2.522666666666667, + "grad_norm": 9.8125, + "learning_rate": 1.7679012345679012e-05, + "loss": 1.2765, + "step": 47300 + }, + { + "epoch": 2.528, + "grad_norm": 7.71875, + "learning_rate": 1.7481481481481483e-05, + "loss": 1.3136, + "step": 47400 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 7.71875, + "learning_rate": 1.728395061728395e-05, + "loss": 1.3622, + "step": 47500 + }, + { + "epoch": 2.538666666666667, + "grad_norm": 11.0, + "learning_rate": 1.708641975308642e-05, + "loss": 1.2877, + "step": 47600 + }, + { + "epoch": 2.544, + "grad_norm": 6.25, + "learning_rate": 1.688888888888889e-05, + "loss": 1.3239, + "step": 47700 + }, + { + "epoch": 2.5493333333333332, + "grad_norm": 6.375, + "learning_rate": 1.669135802469136e-05, + "loss": 1.3512, + "step": 47800 + }, + { + "epoch": 2.554666666666667, + "grad_norm": 6.875, + "learning_rate": 1.6493827160493826e-05, + "loss": 1.3079, + "step": 47900 + }, + { + "epoch": 2.56, + "grad_norm": 7.90625, + "learning_rate": 1.62962962962963e-05, + "loss": 1.3031, + 
"step": 48000 + }, + { + "epoch": 2.5653333333333332, + "grad_norm": 8.0, + "learning_rate": 1.6098765432098767e-05, + "loss": 1.3062, + "step": 48100 + }, + { + "epoch": 2.570666666666667, + "grad_norm": 7.625, + "learning_rate": 1.5901234567901235e-05, + "loss": 1.3348, + "step": 48200 + }, + { + "epoch": 2.576, + "grad_norm": 9.6875, + "learning_rate": 1.5703703703703705e-05, + "loss": 1.3392, + "step": 48300 + }, + { + "epoch": 2.5813333333333333, + "grad_norm": 8.75, + "learning_rate": 1.5506172839506173e-05, + "loss": 1.3153, + "step": 48400 + }, + { + "epoch": 2.586666666666667, + "grad_norm": 6.75, + "learning_rate": 1.5308641975308643e-05, + "loss": 1.3348, + "step": 48500 + }, + { + "epoch": 2.592, + "grad_norm": 9.0, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.3008, + "step": 48600 + }, + { + "epoch": 2.5973333333333333, + "grad_norm": 8.1875, + "learning_rate": 1.4913580246913581e-05, + "loss": 1.3492, + "step": 48700 + }, + { + "epoch": 2.602666666666667, + "grad_norm": 6.5, + "learning_rate": 1.4716049382716049e-05, + "loss": 1.2897, + "step": 48800 + }, + { + "epoch": 2.608, + "grad_norm": 6.03125, + "learning_rate": 1.4518518518518521e-05, + "loss": 1.2443, + "step": 48900 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 8.125, + "learning_rate": 1.4320987654320988e-05, + "loss": 1.3395, + "step": 49000 + }, + { + "epoch": 2.618666666666667, + "grad_norm": 7.90625, + "learning_rate": 1.4123456790123457e-05, + "loss": 1.3716, + "step": 49100 + }, + { + "epoch": 2.624, + "grad_norm": 6.28125, + "learning_rate": 1.3925925925925926e-05, + "loss": 1.3066, + "step": 49200 + }, + { + "epoch": 2.6293333333333333, + "grad_norm": 5.34375, + "learning_rate": 1.3728395061728397e-05, + "loss": 1.2932, + "step": 49300 + }, + { + "epoch": 2.634666666666667, + "grad_norm": 7.21875, + "learning_rate": 1.3530864197530866e-05, + "loss": 1.2657, + "step": 49400 + }, + { + "epoch": 2.64, + "grad_norm": 12.6875, + "learning_rate": 1.3333333333333333e-05, 
+ "loss": 1.2547, + "step": 49500 + }, + { + "epoch": 2.6453333333333333, + "grad_norm": 7.53125, + "learning_rate": 1.3135802469135802e-05, + "loss": 1.2846, + "step": 49600 + }, + { + "epoch": 2.6506666666666665, + "grad_norm": 10.3125, + "learning_rate": 1.2938271604938273e-05, + "loss": 1.3046, + "step": 49700 + }, + { + "epoch": 2.656, + "grad_norm": 8.5625, + "learning_rate": 1.2740740740740742e-05, + "loss": 1.3353, + "step": 49800 + }, + { + "epoch": 2.6613333333333333, + "grad_norm": 10.6875, + "learning_rate": 1.2543209876543211e-05, + "loss": 1.3146, + "step": 49900 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 10.125, + "learning_rate": 1.2345679012345678e-05, + "loss": 1.279, + "step": 50000 + }, + { + "epoch": 2.672, + "grad_norm": 11.125, + "learning_rate": 1.2148148148148149e-05, + "loss": 1.2854, + "step": 50100 + }, + { + "epoch": 2.6773333333333333, + "grad_norm": 6.375, + "learning_rate": 1.1950617283950618e-05, + "loss": 1.3665, + "step": 50200 + }, + { + "epoch": 2.6826666666666665, + "grad_norm": 7.90625, + "learning_rate": 1.1753086419753087e-05, + "loss": 1.2908, + "step": 50300 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 6.03125, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.33, + "step": 50400 + }, + { + "epoch": 2.6933333333333334, + "grad_norm": 9.1875, + "learning_rate": 1.1358024691358025e-05, + "loss": 1.3176, + "step": 50500 + }, + { + "epoch": 2.6986666666666665, + "grad_norm": 8.4375, + "learning_rate": 1.1160493827160494e-05, + "loss": 1.3215, + "step": 50600 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 6.90625, + "learning_rate": 1.0962962962962963e-05, + "loss": 1.3513, + "step": 50700 + }, + { + "epoch": 2.7093333333333334, + "grad_norm": 9.4375, + "learning_rate": 1.0765432098765432e-05, + "loss": 1.2539, + "step": 50800 + }, + { + "epoch": 2.7146666666666666, + "grad_norm": 7.125, + "learning_rate": 1.0567901234567903e-05, + "loss": 1.3037, + "step": 50900 + }, + { + "epoch": 
2.7199999999999998, + "grad_norm": 7.75, + "learning_rate": 1.037037037037037e-05, + "loss": 1.3418, + "step": 51000 + }, + { + "epoch": 2.7253333333333334, + "grad_norm": 5.90625, + "learning_rate": 1.017283950617284e-05, + "loss": 1.3898, + "step": 51100 + }, + { + "epoch": 2.7306666666666666, + "grad_norm": 5.21875, + "learning_rate": 9.97530864197531e-06, + "loss": 1.2758, + "step": 51200 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 6.75, + "learning_rate": 9.777777777777779e-06, + "loss": 1.3215, + "step": 51300 + }, + { + "epoch": 2.7413333333333334, + "grad_norm": 10.9375, + "learning_rate": 9.580246913580248e-06, + "loss": 1.374, + "step": 51400 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 6.625, + "learning_rate": 9.382716049382717e-06, + "loss": 1.3613, + "step": 51500 + }, + { + "epoch": 2.752, + "grad_norm": 7.0625, + "learning_rate": 9.185185185185186e-06, + "loss": 1.3206, + "step": 51600 + }, + { + "epoch": 2.7573333333333334, + "grad_norm": 7.0, + "learning_rate": 8.987654320987655e-06, + "loss": 1.3731, + "step": 51700 + }, + { + "epoch": 2.7626666666666666, + "grad_norm": 6.40625, + "learning_rate": 8.790123456790124e-06, + "loss": 1.2751, + "step": 51800 + }, + { + "epoch": 2.768, + "grad_norm": 6.125, + "learning_rate": 8.592592592592593e-06, + "loss": 1.3447, + "step": 51900 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 6.40625, + "learning_rate": 8.395061728395062e-06, + "loss": 1.2362, + "step": 52000 + }, + { + "epoch": 2.7786666666666666, + "grad_norm": 7.125, + "learning_rate": 8.19753086419753e-06, + "loss": 1.3439, + "step": 52100 + }, + { + "epoch": 2.784, + "grad_norm": 5.53125, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3392, + "step": 52200 + }, + { + "epoch": 2.7893333333333334, + "grad_norm": 7.4375, + "learning_rate": 7.802469135802469e-06, + "loss": 1.3598, + "step": 52300 + }, + { + "epoch": 2.7946666666666666, + "grad_norm": 7.53125, + "learning_rate": 7.6049382716049385e-06, + "loss": 
1.2485, + "step": 52400 + }, + { + "epoch": 2.8, + "grad_norm": 6.4375, + "learning_rate": 7.4074074074074075e-06, + "loss": 1.2279, + "step": 52500 + }, + { + "epoch": 2.8053333333333335, + "grad_norm": 12.3125, + "learning_rate": 7.209876543209877e-06, + "loss": 1.272, + "step": 52600 + }, + { + "epoch": 2.8106666666666666, + "grad_norm": 7.5, + "learning_rate": 7.0123456790123455e-06, + "loss": 1.2805, + "step": 52700 + }, + { + "epoch": 2.816, + "grad_norm": 6.125, + "learning_rate": 6.814814814814815e-06, + "loss": 1.3183, + "step": 52800 + }, + { + "epoch": 2.8213333333333335, + "grad_norm": 5.78125, + "learning_rate": 6.617283950617284e-06, + "loss": 1.2925, + "step": 52900 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 6.0625, + "learning_rate": 6.419753086419754e-06, + "loss": 1.2946, + "step": 53000 + }, + { + "epoch": 2.832, + "grad_norm": 10.5, + "learning_rate": 6.222222222222222e-06, + "loss": 1.329, + "step": 53100 + }, + { + "epoch": 2.8373333333333335, + "grad_norm": 9.75, + "learning_rate": 6.024691358024691e-06, + "loss": 1.3318, + "step": 53200 + }, + { + "epoch": 2.8426666666666667, + "grad_norm": 7.0625, + "learning_rate": 5.82716049382716e-06, + "loss": 1.3648, + "step": 53300 + }, + { + "epoch": 2.848, + "grad_norm": 9.25, + "learning_rate": 5.62962962962963e-06, + "loss": 1.2775, + "step": 53400 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 9.1875, + "learning_rate": 5.432098765432099e-06, + "loss": 1.2806, + "step": 53500 + }, + { + "epoch": 2.8586666666666667, + "grad_norm": 6.4375, + "learning_rate": 5.234567901234568e-06, + "loss": 1.3544, + "step": 53600 + }, + { + "epoch": 2.864, + "grad_norm": 9.8125, + "learning_rate": 5.037037037037037e-06, + "loss": 1.2832, + "step": 53700 + }, + { + "epoch": 2.8693333333333335, + "grad_norm": 5.9375, + "learning_rate": 4.839506172839506e-06, + "loss": 1.3708, + "step": 53800 + }, + { + "epoch": 2.8746666666666667, + "grad_norm": 7.25, + "learning_rate": 4.641975308641976e-06, + 
"loss": 1.287, + "step": 53900 + }, + { + "epoch": 2.88, + "grad_norm": 10.375, + "learning_rate": 4.444444444444445e-06, + "loss": 1.2741, + "step": 54000 + }, + { + "epoch": 2.8853333333333335, + "grad_norm": 7.78125, + "learning_rate": 4.246913580246914e-06, + "loss": 1.3358, + "step": 54100 + }, + { + "epoch": 2.8906666666666667, + "grad_norm": 8.5, + "learning_rate": 4.049382716049383e-06, + "loss": 1.2651, + "step": 54200 + }, + { + "epoch": 2.896, + "grad_norm": 11.5, + "learning_rate": 3.851851851851852e-06, + "loss": 1.3113, + "step": 54300 + }, + { + "epoch": 2.9013333333333335, + "grad_norm": 6.4375, + "learning_rate": 3.6543209876543214e-06, + "loss": 1.2998, + "step": 54400 + }, + { + "epoch": 2.9066666666666667, + "grad_norm": 7.375, + "learning_rate": 3.45679012345679e-06, + "loss": 1.325, + "step": 54500 + }, + { + "epoch": 2.912, + "grad_norm": 7.75, + "learning_rate": 3.259259259259259e-06, + "loss": 1.2704, + "step": 54600 + }, + { + "epoch": 2.9173333333333336, + "grad_norm": 6.15625, + "learning_rate": 3.061728395061729e-06, + "loss": 1.3235, + "step": 54700 + }, + { + "epoch": 2.9226666666666667, + "grad_norm": 8.5625, + "learning_rate": 2.864197530864198e-06, + "loss": 1.3711, + "step": 54800 + }, + { + "epoch": 2.928, + "grad_norm": 13.6875, + "learning_rate": 2.666666666666667e-06, + "loss": 1.335, + "step": 54900 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 8.9375, + "learning_rate": 2.469135802469136e-06, + "loss": 1.3163, + "step": 55000 + }, + { + "epoch": 2.9386666666666668, + "grad_norm": 11.75, + "learning_rate": 2.271604938271605e-06, + "loss": 1.2763, + "step": 55100 + }, + { + "epoch": 2.944, + "grad_norm": 6.9375, + "learning_rate": 2.0740740740740742e-06, + "loss": 1.3573, + "step": 55200 + }, + { + "epoch": 2.9493333333333336, + "grad_norm": 9.375, + "learning_rate": 1.8765432098765432e-06, + "loss": 1.3565, + "step": 55300 + }, + { + "epoch": 2.9546666666666668, + "grad_norm": 6.28125, + "learning_rate": 
1.6790123456790125e-06, + "loss": 1.3489, + "step": 55400 + }, + { + "epoch": 2.96, + "grad_norm": 6.6875, + "learning_rate": 1.4814814814814817e-06, + "loss": 1.2812, + "step": 55500 + }, + { + "epoch": 2.9653333333333336, + "grad_norm": 13.9375, + "learning_rate": 1.2839506172839507e-06, + "loss": 1.3521, + "step": 55600 + }, + { + "epoch": 2.970666666666667, + "grad_norm": 7.1875, + "learning_rate": 1.0864197530864197e-06, + "loss": 1.3804, + "step": 55700 + }, + { + "epoch": 2.976, + "grad_norm": 7.09375, + "learning_rate": 8.88888888888889e-07, + "loss": 1.2888, + "step": 55800 + }, + { + "epoch": 2.981333333333333, + "grad_norm": 8.125, + "learning_rate": 6.913580246913581e-07, + "loss": 1.2953, + "step": 55900 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 5.6875, + "learning_rate": 4.938271604938272e-07, + "loss": 1.2729, + "step": 56000 + }, + { + "epoch": 2.992, + "grad_norm": 6.40625, + "learning_rate": 2.962962962962963e-07, + "loss": 1.3042, + "step": 56100 + }, + { + "epoch": 2.997333333333333, + "grad_norm": 8.375, + "learning_rate": 9.876543209876543e-08, + "loss": 1.2361, + "step": 56200 + } + ], + "logging_steps": 100, + "max_steps": 56250, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 5600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.6905543704576e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-56250/training_args.bin b/checkpoint-56250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/checkpoint-56250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304 
diff --git a/config.json b/config.json index 8ccd3b33181212cfbae75635a0f51dcbcf137999..28aa1559a0893c7e50c6a67370092417fa5cc81f 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { - "_name_or_path": "/content/drive/MyDrive/MLP2025/lora_vs_ft_multitask/outputs (1)/dissimilar_FullFT", + "_name_or_path": "meta-llama/Llama-3.2-1B", "architectures": [ - "LlamaForSequenceClassification" + "LlamaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, @@ -10,18 +10,8 @@ "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1", - "2": "LABEL_2" - }, "initializer_range": 0.02, "intermediate_size": 8192, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1, - "LABEL_2": 2 - }, "max_position_embeddings": 2048, "mlp_bias": false, "model_type": "llama", @@ -39,8 +29,8 @@ }, "rope_theta": 500000.0, "tie_word_embeddings": true, - "torch_dtype": "float32", - "transformers_version": "4.49.0", + "torch_dtype": "bfloat16", + "transformers_version": "4.48.3", "use_cache": false, "vocab_size": 128256 } diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..87bc067ffa726aa51659f4f0abbabfda2b0156f2 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128001, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.48.3" +} diff --git a/model.safetensors b/model.safetensors index 55cd05118fc0076647744ab55b4330b40bbe3376..202cbada8919e88292f18aeedab7f312e2d59b04 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51b4d90d74f22e954bfc244d54f460f91dda63d3871da2e73a37417485aa3318 -size 4943298992 +oid sha256:3b0779d345b0852334d6f314eb73355223b87d6965d3d4a740f955fe6d89a7ce +size 2471645608 diff --git a/special_tokens_map.json b/special_tokens_map.json index 
e5b39b6305d89284b04934011c68dbb26bf588ca..04829afa78a2d2df203ac846968db37269b01f7f 100644 --- a/special_tokens_map.json +++ b/special_tokens_map.json @@ -13,11 +13,5 @@ "rstrip": false, "single_word": false }, - "pad_token": { - "content": "<|end_of_text|>", - "lstrip": false, - "normalized": false, - "rstrip": false, - "single_word": false - } + "pad_token": "<|end_of_text|>" } diff --git a/tokenizer_config.json b/tokenizer_config.json index 81dd14db6632ad5b35b9d447732e37ac074873a5..3026161bf4fc56964bad68d35d36b5b815a3f716 100644 --- a/tokenizer_config.json +++ b/tokenizer_config.json @@ -2059,5 +2059,5 @@ ], "model_max_length": 131072, "pad_token": "<|end_of_text|>", - "tokenizer_class": "PreTrainedTokenizer" + "tokenizer_class": "PreTrainedTokenizerFast" } diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..42aa2b89bb66fd08b204023d734f84937c88b00f --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585ac7fde2fd224c05319c065c6917947771757f0a476d71d3fc6d777ef44f12 +size 5304