diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cf061e1a117d4f5474bf251f58ae163ae971dbe7 --- /dev/null +++ b/README.md @@ -0,0 +1,218 @@ +--- +base_model: alpindale/Mistral-7B-v0.2-hf +tags: +- axolotl +- generated_from_trainer +model-index: +- name: Einstein-v6-7B + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.0` +```yaml +base_model: alpindale/Mistral-7B-v0.2-hf +model_type: MistralForCausalLM +tokenizer_type: LlamaTokenizer +is_mistral_derived_model: true + +load_in_8bit: false +load_in_4bit: false +strict: false + +chat_template: chatml +datasets: + - path: data/merged_all.json + ds_type: json + type: alpaca + conversation: chatml + + - path: data/gpteacher-instruct-special-alpaca.json + ds_type: json + type: gpteacher + conversation: chatml + + - path: data/wizardlm_evol_instruct_70k_random_half.json + ds_type: json + type: alpaca + conversation: chatml + + - path: data/capybara_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/synthia-v1.3_sharegpt_12500.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/cot_alpaca_gpt4_extracted_openhermes_2.5_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/slimorca_dedup_filtered_95k_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/airoboros_3.2_without_contextual_slimorca_orca_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/allenai_wild_chat_gpt4_english_toxic_random_half_4k_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/pippa_bagel_repo_3k_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/gpt4_data_lmys_1m_sharegpt.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/sharegpt_gpt4_english.json + ds_type: json + type: sharegpt + conversation: chatml + + - path: data/no_robots_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/oasst_top1_from_fusechatmixture_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + + - path: data/everythinglm-data-v3_sharegpt.json + ds_type: json + type: sharegpt + strict: false + conversation: chatml + +dataset_prepared_path: last_run_prepared +# val_set_size: 0.005 +val_set_size: 0.0 + +do_bench_eval: true + +output_dir: ./Einstein-v6-7B-model + +sequence_len: 8192 +sample_packing: true +pad_to_sequence_len: true +eval_sample_packing: false + +wandb_project: Einstein +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: +hub_model_id: Weyaxi/Einstein-v6-7B + +save_safetensors: true + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 2 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.000005 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 3 # changed +eval_table_size: +eval_table_max_new_tokens: 128 +saves_per_epoch: 2 # changed +debug: + +deepspeed: zero3_bf16.json +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "<|im_end|>" + unk_token: "" +tokens: + - "<|im_start|>" + +``` + +

+ +# Einstein-v6-7B + +This model is a fine-tuned version of [alpindale/Mistral-7B-v0.2-hf](https://huggingface.co/alpindale/Mistral-7B-v0.2-hf) on the None dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 1 +- eval_batch_size: 1 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 9 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 36 +- total_eval_batch_size: 9 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 10 +- num_epochs: 2 + +### Training results + + + +### Framework versions + +- Transformers 4.38.2 +- Pytorch 2.1.2+cu118 +- Datasets 2.18.0 +- Tokenizers 0.15.0 diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..e36863df2bc13b20909d6711019409e777802fb5 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,4 @@ +{ + "<|im_end|>": 32000, + "<|im_start|>": 32001 +} diff --git a/checkpoint-1206/config.json b/checkpoint-1206/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76 --- /dev/null +++ b/checkpoint-1206/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "alpindale/Mistral-7B-v0.2-hf", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": false, + "vocab_size": 32002 +} diff --git a/checkpoint-1206/generation_config.json b/checkpoint-1206/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc --- /dev/null +++ b/checkpoint-1206/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "transformers_version": "4.38.2" +} diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5db306e9f37c4b097d13acbc2c79fb683244aaab --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:640cdfc27b9b4c7dc135ffc8be7e345ea4e78175fbfaf92f848f3305de92d913 +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..192d6cd5e2524869a59ad20e9d6d3a0e4ae31b51 --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f456b41894b294656b00f40899f37b016b0cbd3fc5f6cfece1ed66c2d1fa1c6 +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..beb3ad7e6f5dfa8e9d930c194906aa5191abf588 --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aee6c39cdf7c911a3c96cf9f19437fe89c97b88a018ec0ed510d422a92d97a20 +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f79f34d4580e557afd7a07e65d1c1e42b620d79f --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618e19b231e8d29988022a17a08623a3a47bd17ef0e366fef7e13be132a2dfba +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ce8abbd91ba039f58c55feb8828b4df987e714a --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c72752e3f1d839baa84bace95f0baa137721100f7abf9eb6f278ad3d91fd2c +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c21bb4fd1a539cac416bc2962bee8bb02b4be2b6 --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb23bce469bd6b6cf2c4700225f7be1b53f11f4ea466a3240ee4e8eb6dbb02f +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cce426895d3fdee311af30caead54979ffc3faa6 --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f48c713177be0b2a6284b27725b4bd58e3a071f12505c23d4044e9e4145384 +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6648c0d8f7a1ded89f6bf201fd2ae691dd90362d --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db0c6517392bd44bbb3117e07ef75a7f9163fd4bb1bd249a812ec159706e4ce +size 4831623435 diff --git a/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bc89bc50151290b748198b98b9690d2b718200ed --- /dev/null +++ b/checkpoint-1206/global_step1206/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc32e4c829baba0825f6c473240f85e37140a4a3d00bab68dce0a3d4ac83769d +size 4831623435 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a023057ba9193fae89248cd8ee4ae0f293858be9 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c9afb76d2016a6260a8effe707b6bccedce48834675b1eab9818a761595b352 +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..60d9f5144856eb42a9f84d6aa325247cd0a3ef4f --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a2d07e7e08f1d95551f765621032687695cdb304d85023fc9f72d3174a7b5e +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5c94358bea170387cd903d10ae4a4ec4999b830 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1781a0983c1da9b1dbe29b99cc54f29e5509633808ff406ce58c6dba525eb461 +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad2da2a9ef86c60c213fb81ab43bccbc2ce71f78 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a83e1db401ecb9437fbe9a1e0b6eac0ea109f94eef5ff21465b73decb906c72d +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..673b461dadbb336b1931ff5a733ddfd4f43319ff --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0cbd3c92ba583215c08495704bd146204e79775b020b7fea83ac7681c043d1f +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f00b141691c99df05bc5ffb1b52eceb3f301b29 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d840c4bd1e9c9875435f80bc1caf93a443c2c195c8e2e3ec0695aded1a1a8456 +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..96fcbdd43e803c80e9b54ac5e53841bb8b62a1f6 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043ae2dded1785e74eae756ce3606c5429fdee52fa6db54d8a3f227d081b3904 +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..db1fa7b54abadef25da18242868b2bc4bffbeaf9 --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e6898409ae39821f228d85e5315f2bdc3f01288c5201bccdf1ca9b2cc9cb984 +size 153829 diff --git a/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..af430f4bebe1aa35c715d84c5623c64f322feb7e --- /dev/null +++ b/checkpoint-1206/global_step1206/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b96194551e4c7d97f8be9a35968388ea20559c2e6b909bbbf382e9e4c21a5279 +size 153829 diff --git a/checkpoint-1206/latest b/checkpoint-1206/latest new file mode 100644 index 0000000000000000000000000000000000000000..34d4c1304a2f32052898ef011354ffe438bb60ad --- /dev/null +++ b/checkpoint-1206/latest @@ -0,0 +1 @@ +global_step1206 \ No newline at end of file diff --git a/checkpoint-1206/model-00001-of-00003.safetensors b/checkpoint-1206/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e18149c479d72bb1418c8afe6daf57768e27bada --- /dev/null +++ b/checkpoint-1206/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e1195ff62d6499fd137e1c9f051ecb5c8cc4ebd0936800ce10aa42250f5570a +size 4943178720 diff --git a/checkpoint-1206/model-00002-of-00003.safetensors b/checkpoint-1206/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5c6184aff12c7d63b7b50a1b95084fc9c3ca67ba --- /dev/null +++ b/checkpoint-1206/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa17f326c5f2c97b6575de75571734430eb97a441035542b71107ff6a9e094fc +size 4999819336 diff --git a/checkpoint-1206/model-00003-of-00003.safetensors b/checkpoint-1206/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..585273dc2f4b1bd3129ad3c2336f45777a401a6b --- /dev/null +++ b/checkpoint-1206/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd7d94ada855cea53fb537d5bc1a78219d57a094fe3d381501aee08b7a7d9ad4 +size 4540532728 diff --git a/checkpoint-1206/model.safetensors.index.json b/checkpoint-1206/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2 --- /dev/null +++ b/checkpoint-1206/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 14483496960 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/checkpoint-1206/rng_state_0.pth b/checkpoint-1206/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ae643fef71bb5468722e041971c4fd10143dcde --- /dev/null +++ b/checkpoint-1206/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78df38122b8b51b69a3cce1a8d8cb0f7d8684196dde8fb6d174ef0fd3440d89 +size 16240 diff --git a/checkpoint-1206/rng_state_1.pth b/checkpoint-1206/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dec857bd06d8263dc0d1f195ea4d4288bad4641 --- /dev/null +++ b/checkpoint-1206/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499f46e15237a5856de1a8f0582d02e4319721d83140e01c31e9e1db92da7108 +size 16240 diff --git a/checkpoint-1206/rng_state_2.pth b/checkpoint-1206/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d57f4b1f904b392ef605de094c7e5171fced622 --- /dev/null +++ b/checkpoint-1206/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b32ec8b414a3886bf179af827449dee557e95bfa64a7c20f26c186df2659c9f +size 16240 diff --git a/checkpoint-1206/rng_state_3.pth b/checkpoint-1206/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c8bebc9d459d1ed2d1ab4f27d7ec2da721d0445 --- /dev/null +++ b/checkpoint-1206/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82765e3b8fb57ca7779e75617b51182226eed278593e6441a31510115950353d +size 16240 diff --git a/checkpoint-1206/rng_state_4.pth b/checkpoint-1206/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..71f7ca7b0554bc7702f1e276ae0cd3924ffba0d2 --- /dev/null +++ b/checkpoint-1206/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2c24e041054b45b5bf8c50512ea8c4552e5f2e877fe798759dec7a7f3aae1 +size 16240 diff --git a/checkpoint-1206/rng_state_5.pth b/checkpoint-1206/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2393f7d616bfb4cf0ab81957f29d35b455685a54 --- /dev/null +++ b/checkpoint-1206/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b3e1210264272a2020cbcb79f6ade48528f5682dadcecb7a94805779548161 +size 16240 diff --git a/checkpoint-1206/rng_state_6.pth b/checkpoint-1206/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..46f8e8cc8551391d67e345af829445ad610b17a4 --- /dev/null +++ b/checkpoint-1206/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:556ec0b910e14a1a5ab8fb6a1a16d525b89e31c69dd9b6cd8d4a4cccad65b546 +size 16240 diff --git a/checkpoint-1206/rng_state_7.pth b/checkpoint-1206/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0723b7d69eb2d78f3ee4bdd7f838269f3f845d1 --- /dev/null +++ b/checkpoint-1206/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e830dc416886fe1aafeacfa75da6baacdbe9a61c66d2f1fbc11417753a516513 +size 16240 diff --git a/checkpoint-1206/rng_state_8.pth b/checkpoint-1206/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9da906954a171d52c0afc8baea75914a9bb9a62 --- /dev/null +++ b/checkpoint-1206/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d7cb0002af3e22c063c6751b91836d7e06c4267f7ba8e1912c42d6867e4885 +size 16240 diff --git a/checkpoint-1206/scheduler.pt b/checkpoint-1206/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..730aa0679b05c54594576e05c8b57359ad913b4d --- /dev/null +++ b/checkpoint-1206/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c74bca99465dbb777fb965aad2291c5beb95242415512d168559d65103eccb89 +size 1064 diff --git a/checkpoint-1206/trainer_state.json b/checkpoint-1206/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0aae078d47ea8ed8d31f200f728b870cd9b1094 --- /dev/null +++ b/checkpoint-1206/trainer_state.json @@ -0,0 +1,8463 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997927461139896, + "eval_steps": 500, + "global_step": 1206, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 27.81778461909011, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7993, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 28.63833175363421, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9056, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 25.646828828014854, + "learning_rate": 1.5e-06, + "loss": 0.8473, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.834124771941388, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8192, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 10.558095859980105, + "learning_rate": 2.5e-06, + "loss": 0.7943, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.905789045775758, + "learning_rate": 3e-06, + "loss": 0.7075, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.259519170268483, + "learning_rate": 3.5e-06, + "loss": 0.7537, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 6.639042051048664, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7471, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 8.515070932390074, + "learning_rate": 4.5e-06, + "loss": 0.7689, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 8.916410424632533, + "learning_rate": 5e-06, + "loss": 0.7194, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 4.835046497413255, + "learning_rate": 4.9999978617243506e-06, + "loss": 0.6949, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 10.065648500649479, + "learning_rate": 4.9999914469010585e-06, + "loss": 0.7039, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 5.299372887839679, + "learning_rate": 4.999980755541098e-06, + "loss": 0.7067, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 5.693110837094718, + "learning_rate": 4.999965787662758e-06, + "loss": 0.7126, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 2.983869635716314, + "learning_rate": 4.999946543291642e-06, + "loss": 0.6496, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 4.2561193962441175, + "learning_rate": 4.999923022460671e-06, + "loss": 0.7036, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 3.011772824968437, + "learning_rate": 4.999895225210079e-06, + "loss": 0.7009, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 3.386638415717137, + "learning_rate": 4.9998631515874165e-06, + "loss": 0.6624, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 3.764658092125165, + "learning_rate": 4.999826801647551e-06, + "loss": 0.6687, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 2.3982096117966614, + "learning_rate": 4.999786175452662e-06, + "loss": 0.706, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 2.8051633678260193, + "learning_rate": 4.999741273072246e-06, + "loss": 0.7031, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 3.1177784624332614, + "learning_rate": 4.999692094583114e-06, + "loss": 0.7525, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 2.2533819675617806, + "learning_rate": 4.9996386400693906e-06, + "loss": 0.6767, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 2.61893793162573, + "learning_rate": 4.999580909622518e-06, + "loss": 0.6432, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 2.76057623723569, + "learning_rate": 4.999518903341251e-06, + "loss": 0.6809, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 2.27983032069553, + "learning_rate": 4.999452621331657e-06, + "loss": 0.6798, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 2.501904568120582, + "learning_rate": 4.99938206370712e-06, + "loss": 0.6412, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 2.819229290729669, + "learning_rate": 4.999307230588338e-06, + "loss": 0.6188, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 2.1233212322022212, + "learning_rate": 4.9992281221033224e-06, + "loss": 0.6378, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 2.7806911906686755, + "learning_rate": 4.999144738387396e-06, + "loss": 0.6653, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 2.4045490257014563, + "learning_rate": 4.999057079583199e-06, + "loss": 0.6377, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 2.3803717769210446, + "learning_rate": 4.998965145840681e-06, + "loss": 0.6855, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 2.3976652879633473, + "learning_rate": 4.998868937317106e-06, + "loss": 0.6284, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 2.2958541157119727, + "learning_rate": 4.998768454177051e-06, + "loss": 0.6521, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 2.1925196833696154, + "learning_rate": 4.998663696592403e-06, + "loss": 0.6619, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 2.361006042901851, + "learning_rate": 4.998554664742362e-06, + "loss": 0.6155, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 2.1577758143653614, + "learning_rate": 4.998441358813443e-06, + "loss": 0.6398, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 2.219872074512664, + "learning_rate": 4.998323778999467e-06, + "loss": 0.6051, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 2.2907501521408546, + "learning_rate": 4.9982019255015705e-06, + "loss": 0.6337, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 2.1769862324666183, + "learning_rate": 4.9980757985281955e-06, + "loss": 0.6606, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 2.4252479779661607, + "learning_rate": 4.997945398295101e-06, + "loss": 0.6685, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 2.3929541982084657, + "learning_rate": 4.99781072502535e-06, + "loss": 0.6084, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 1.932539969840091, + "learning_rate": 4.997671778949318e-06, + "loss": 0.6123, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 2.191742541327873, + "learning_rate": 4.997528560304688e-06, + "loss": 0.6247, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 2.423376784566499, + "learning_rate": 4.997381069336455e-06, + "loss": 0.7024, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 2.0599055392481076, + "learning_rate": 4.997229306296918e-06, + "loss": 0.6612, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 2.16832922087532, + "learning_rate": 4.997073271445686e-06, + "loss": 0.5949, + "step": 47 + }, + { + "epoch": 0.04, + "grad_norm": 2.0483598654319453, + "learning_rate": 4.9969129650496775e-06, + "loss": 0.6406, + "step": 48 + }, + { + "epoch": 0.04, + "grad_norm": 1.963056609139284, + "learning_rate": 4.996748387383113e-06, + "loss": 0.6361, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 2.2094923844269307, + "learning_rate": 4.996579538727527e-06, + "loss": 0.5901, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 2.1088153449411857, + "learning_rate": 4.996406419371749e-06, + "loss": 0.6458, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 2.093448940617732, + "learning_rate": 4.996229029611926e-06, + "loss": 0.6509, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 2.075116207412987, + "learning_rate": 4.996047369751502e-06, + "loss": 0.6295, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 2.138141165277684, + "learning_rate": 4.995861440101229e-06, + "loss": 0.6088, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 2.186316382848445, + "learning_rate": 4.995671240979161e-06, + "loss": 0.6307, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 2.2513741083982195, + "learning_rate": 4.995476772710657e-06, + "loss": 0.6175, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 2.0827167336870596, + "learning_rate": 4.995278035628379e-06, + "loss": 0.5935, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 2.117977588574442, + "learning_rate": 4.995075030072291e-06, + "loss": 0.5998, + "step": 58 + }, + { + "epoch": 0.05, + "grad_norm": 2.0996940200235485, + "learning_rate": 4.994867756389658e-06, + "loss": 0.6159, + "step": 59 + }, + { + "epoch": 0.05, + "grad_norm": 2.141096165691323, + "learning_rate": 4.994656214935045e-06, + "loss": 0.6294, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 2.022748830058395, + "learning_rate": 4.994440406070323e-06, + "loss": 0.6315, + "step": 61 + }, + { + "epoch": 0.05, + "grad_norm": 2.209132168720991, + "learning_rate": 4.994220330164654e-06, + "loss": 0.5645, + "step": 62 + }, + { + "epoch": 0.05, + "grad_norm": 2.0994557317862674, + "learning_rate": 4.993995987594509e-06, + "loss": 0.6272, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 2.204220831053169, + "learning_rate": 4.99376737874365e-06, + "loss": 0.6379, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 2.127733932186697, + "learning_rate": 4.993534504003141e-06, + "loss": 0.622, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 2.1338506582034316, + "learning_rate": 4.993297363771342e-06, + "loss": 0.6259, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 2.104802764460729, + "learning_rate": 4.993055958453912e-06, + "loss": 0.6414, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 2.0889535347771675, + "learning_rate": 4.9928102884638004e-06, + "loss": 0.6466, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 2.252225316694296, + "learning_rate": 4.992560354221258e-06, + "loss": 0.6167, + "step": 69 + }, + { + "epoch": 0.06, + "grad_norm": 2.015392533516649, + "learning_rate": 4.992306156153827e-06, + "loss": 0.5958, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 2.151741408948778, + "learning_rate": 4.992047694696343e-06, + "loss": 0.5875, + "step": 71 + }, + { + "epoch": 0.06, + "grad_norm": 2.0351299117412696, + "learning_rate": 4.991784970290935e-06, + "loss": 0.5935, + "step": 72 + }, + { + "epoch": 0.06, + "grad_norm": 2.0000962363827983, + "learning_rate": 4.991517983387026e-06, + "loss": 0.6091, + "step": 73 + }, + { + "epoch": 0.06, + "grad_norm": 2.202881736102415, + "learning_rate": 4.99124673444133e-06, + "loss": 0.6122, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 2.015074773396151, + "learning_rate": 4.990971223917848e-06, + "loss": 0.6134, + "step": 75 + }, + { + "epoch": 0.06, + "grad_norm": 2.009305960567766, + "learning_rate": 4.990691452287877e-06, + "loss": 0.6308, + "step": 76 + }, + { + "epoch": 0.06, + "grad_norm": 1.9967884756310221, + "learning_rate": 4.990407420029999e-06, + "loss": 0.6098, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 2.0858738033925905, + "learning_rate": 4.990119127630085e-06, + "loss": 0.6344, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 1.9427707561903895, + "learning_rate": 4.989826575581295e-06, + "loss": 0.6049, + "step": 79 + }, + { + "epoch": 0.07, + "grad_norm": 2.157150584766853, + "learning_rate": 4.989529764384073e-06, + "loss": 0.5965, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 2.0303527419352583, + "learning_rate": 4.989228694546151e-06, + "loss": 0.6524, + "step": 81 + }, + { + "epoch": 0.07, + "grad_norm": 2.128799919475717, + "learning_rate": 4.988923366582546e-06, + "loss": 0.5524, + "step": 82 + }, + { + "epoch": 0.07, + "grad_norm": 2.0122786280510696, + "learning_rate": 4.988613781015557e-06, + "loss": 0.6268, + "step": 83 + }, + { + "epoch": 0.07, + "grad_norm": 2.104580177719229, + "learning_rate": 4.988299938374769e-06, + "loss": 0.6229, + "step": 84 + }, + { + "epoch": 0.07, + "grad_norm": 2.3894843860356834, + "learning_rate": 4.9879818391970455e-06, + "loss": 0.6194, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 1.9615211372441477, + "learning_rate": 4.9876594840265355e-06, + "loss": 0.6355, + "step": 86 + }, + { + "epoch": 0.07, + "grad_norm": 2.4509852093141937, + "learning_rate": 4.987332873414666e-06, + "loss": 0.6405, + "step": 87 + }, + { + "epoch": 0.07, + "grad_norm": 2.178942375285086, + "learning_rate": 4.987002007920142e-06, + "loss": 0.5593, + "step": 88 + }, + { + "epoch": 0.07, + "grad_norm": 2.2625634345900445, + "learning_rate": 4.9866668881089515e-06, + "loss": 0.6133, + "step": 89 + }, + { + "epoch": 0.07, + "grad_norm": 2.363092638811143, + "learning_rate": 4.986327514554356e-06, + "loss": 0.6298, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 2.0401982492138546, + "learning_rate": 4.985983887836894e-06, + "loss": 0.6276, + "step": 91 + }, + { + "epoch": 0.08, + "grad_norm": 2.276956647922478, + "learning_rate": 4.985636008544381e-06, + "loss": 0.5691, + "step": 92 + }, + { + "epoch": 0.08, + "grad_norm": 2.1072762844110233, + "learning_rate": 4.985283877271908e-06, + "loss": 0.6175, + "step": 93 + }, + { + "epoch": 0.08, + "grad_norm": 2.2931866879442637, + "learning_rate": 4.984927494621836e-06, + "loss": 0.6419, + "step": 94 + }, + { + "epoch": 0.08, + "grad_norm": 2.112474101166308, + "learning_rate": 4.984566861203801e-06, + "loss": 0.607, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 2.1816059679212634, + "learning_rate": 4.984201977634711e-06, + "loss": 0.6136, + "step": 96 + }, + { + "epoch": 0.08, + "grad_norm": 2.0620776369966554, + "learning_rate": 4.9838328445387415e-06, + "loss": 0.6372, + "step": 97 + }, + { + "epoch": 0.08, + "grad_norm": 2.147592836641578, + "learning_rate": 4.983459462547341e-06, + "loss": 0.606, + "step": 98 + }, + { + "epoch": 0.08, + "grad_norm": 2.1808001877062453, + "learning_rate": 4.983081832299224e-06, + "loss": 0.6019, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 2.3751999527114087, + "learning_rate": 4.98269995444037e-06, + "loss": 0.6021, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 1.8769470206406913, + "learning_rate": 4.98231382962403e-06, + "loss": 0.6082, + "step": 101 + }, + { + "epoch": 0.08, + "grad_norm": 2.3060925784921347, + "learning_rate": 4.981923458510717e-06, + "loss": 0.6174, + "step": 102 + }, + { + "epoch": 0.09, + "grad_norm": 2.1543176832473683, + "learning_rate": 4.981528841768206e-06, + "loss": 0.6092, + "step": 103 + }, + { + "epoch": 0.09, + "grad_norm": 2.1558689520522547, + "learning_rate": 4.981129980071538e-06, + "loss": 0.587, + "step": 104 + }, + { + "epoch": 0.09, + "grad_norm": 2.3830532005188383, + "learning_rate": 4.980726874103014e-06, + "loss": 0.6518, + "step": 105 + }, + { + "epoch": 0.09, + "grad_norm": 2.3333119576634767, + "learning_rate": 4.980319524552195e-06, + "loss": 0.6096, + "step": 106 + }, + { + "epoch": 0.09, + "grad_norm": 2.1135146855324214, + "learning_rate": 4.9799079321159e-06, + "loss": 0.5728, + "step": 107 + }, + { + "epoch": 0.09, + "grad_norm": 2.2300463384326394, + "learning_rate": 4.9794920974982095e-06, + "loss": 0.6563, + "step": 108 + }, + { + "epoch": 0.09, + "grad_norm": 2.1745234017525443, + "learning_rate": 4.979072021410458e-06, + "loss": 0.5968, + "step": 109 + }, + { + "epoch": 0.09, + "grad_norm": 2.1536586182562334, + "learning_rate": 4.978647704571237e-06, + "loss": 0.6189, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 2.193809374687326, + "learning_rate": 4.97821914770639e-06, + "loss": 0.5864, + "step": 111 + }, + { + "epoch": 0.09, + "grad_norm": 2.0525896373682047, + "learning_rate": 4.977786351549017e-06, + "loss": 0.6101, + "step": 112 + }, + { + "epoch": 0.09, + "grad_norm": 2.216099286618384, + "learning_rate": 4.977349316839467e-06, + "loss": 0.5984, + "step": 113 + }, + { + "epoch": 0.09, + "grad_norm": 2.155122255962579, + "learning_rate": 4.97690804432534e-06, + "loss": 0.6311, + "step": 114 + }, + { + "epoch": 0.1, + "grad_norm": 2.2972101190291374, + "learning_rate": 4.976462534761487e-06, + "loss": 0.5813, + "step": 115 + }, + { + "epoch": 0.1, + "grad_norm": 1.9925413745245948, + "learning_rate": 4.9760127889100044e-06, + "loss": 0.6157, + "step": 116 + }, + { + "epoch": 0.1, + "grad_norm": 2.2802548684036568, + "learning_rate": 4.975558807540238e-06, + "loss": 0.6079, + "step": 117 + }, + { + "epoch": 0.1, + "grad_norm": 2.048888007394621, + "learning_rate": 4.9751005914287775e-06, + "loss": 0.6467, + "step": 118 + }, + { + "epoch": 0.1, + "grad_norm": 2.28661640438254, + "learning_rate": 4.974638141359456e-06, + "loss": 0.6029, + "step": 119 + }, + { + "epoch": 0.1, + "grad_norm": 2.004056683755783, + "learning_rate": 4.974171458123351e-06, + "loss": 0.6289, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 2.1628470048067667, + "learning_rate": 4.97370054251878e-06, + "loss": 0.6139, + "step": 121 + }, + { + "epoch": 0.1, + "grad_norm": 2.056119895466544, + "learning_rate": 4.9732253953513e-06, + "loss": 0.5798, + "step": 122 + }, + { + "epoch": 0.1, + "grad_norm": 2.1716513163164275, + "learning_rate": 4.972746017433709e-06, + "loss": 0.6085, + "step": 123 + }, + { + "epoch": 0.1, + "grad_norm": 2.255856676525811, + "learning_rate": 4.97226240958604e-06, + "loss": 0.6342, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 2.1049280498075373, + "learning_rate": 4.971774572635563e-06, + "loss": 0.6197, + "step": 125 + }, + { + "epoch": 0.1, + "grad_norm": 2.133349390995361, + "learning_rate": 4.97128250741678e-06, + "loss": 0.5751, + "step": 126 + }, + { + "epoch": 0.11, + "grad_norm": 2.2044887467317578, + "learning_rate": 4.97078621477143e-06, + "loss": 0.6611, + "step": 127 + }, + { + "epoch": 0.11, + "grad_norm": 2.1413863795698145, + "learning_rate": 4.970285695548481e-06, + "loss": 0.625, + "step": 128 + }, + { + "epoch": 0.11, + "grad_norm": 2.0229587336296615, + "learning_rate": 4.969780950604132e-06, + "loss": 0.5989, + "step": 129 + }, + { + "epoch": 0.11, + "grad_norm": 2.0983599595244247, + "learning_rate": 4.969271980801808e-06, + "loss": 0.5747, + "step": 130 + }, + { + "epoch": 0.11, + "grad_norm": 2.1059041140010786, + "learning_rate": 4.9687587870121645e-06, + "loss": 0.5869, + "step": 131 + }, + { + "epoch": 0.11, + "grad_norm": 1.8967441614595046, + "learning_rate": 4.9682413701130815e-06, + "loss": 0.6272, + "step": 132 + }, + { + "epoch": 0.11, + "grad_norm": 1.9976164993621088, + "learning_rate": 4.967719730989663e-06, + "loss": 0.6282, + "step": 133 + }, + { + "epoch": 0.11, + "grad_norm": 1.8719131324952145, + "learning_rate": 4.967193870534235e-06, + "loss": 0.6052, + "step": 134 + }, + { + "epoch": 0.11, + "grad_norm": 2.071702997476533, + "learning_rate": 4.9666637896463455e-06, + "loss": 0.5785, + "step": 135 + }, + { + "epoch": 0.11, + "grad_norm": 1.9549455320048341, + "learning_rate": 4.966129489232762e-06, + "loss": 0.5739, + "step": 136 + }, + { + "epoch": 0.11, + "grad_norm": 2.0656898626759315, + "learning_rate": 4.9655909702074684e-06, + "loss": 0.6651, + "step": 137 + }, + { + "epoch": 0.11, + "grad_norm": 2.1185948604203038, + "learning_rate": 4.965048233491669e-06, + "loss": 0.5759, + "step": 138 + }, + { + "epoch": 0.12, + "grad_norm": 2.08566019272993, + "learning_rate": 4.964501280013777e-06, + "loss": 0.6271, + "step": 139 + }, + { + "epoch": 0.12, + "grad_norm": 2.117420903965419, + "learning_rate": 4.963950110709425e-06, + "loss": 0.5968, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 1.9784944143818486, + "learning_rate": 4.963394726521453e-06, + "loss": 0.6112, + "step": 141 + }, + { + "epoch": 0.12, + "grad_norm": 2.077292948039572, + "learning_rate": 4.9628351283999144e-06, + "loss": 0.5636, + "step": 142 + }, + { + "epoch": 0.12, + "grad_norm": 2.223803520245629, + "learning_rate": 4.962271317302068e-06, + "loss": 0.6658, + "step": 143 + }, + { + "epoch": 0.12, + "grad_norm": 2.039369072186367, + "learning_rate": 4.9617032941923796e-06, + "loss": 0.5853, + "step": 144 + }, + { + "epoch": 0.12, + "grad_norm": 2.071470113085907, + "learning_rate": 4.961131060042522e-06, + "loss": 0.601, + "step": 145 + }, + { + "epoch": 0.12, + "grad_norm": 2.437470272347474, + "learning_rate": 4.960554615831372e-06, + "loss": 0.6593, + "step": 146 + }, + { + "epoch": 0.12, + "grad_norm": 2.178684122927139, + "learning_rate": 4.959973962545005e-06, + "loss": 0.607, + "step": 147 + }, + { + "epoch": 0.12, + "grad_norm": 2.097006749956471, + "learning_rate": 4.9593891011767e-06, + "loss": 0.5873, + "step": 148 + }, + { + "epoch": 0.12, + "grad_norm": 1.9801202541822784, + "learning_rate": 4.958800032726931e-06, + "loss": 0.5877, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 2.30001951085656, + "learning_rate": 4.958206758203373e-06, + "loss": 0.6368, + "step": 150 + }, + { + "epoch": 0.13, + "grad_norm": 1.990094260131078, + "learning_rate": 4.957609278620891e-06, + "loss": 0.59, + "step": 151 + }, + { + "epoch": 0.13, + "grad_norm": 2.262163752076628, + "learning_rate": 4.957007595001548e-06, + "loss": 0.5779, + "step": 152 + }, + { + "epoch": 0.13, + "grad_norm": 2.1970152093220983, + "learning_rate": 4.956401708374595e-06, + "loss": 0.5894, + "step": 153 + }, + { + "epoch": 0.13, + "grad_norm": 2.220825872684071, + "learning_rate": 4.9557916197764745e-06, + "loss": 0.6528, + "step": 154 + }, + { + "epoch": 0.13, + "grad_norm": 2.099472677591387, + "learning_rate": 4.955177330250817e-06, + "loss": 0.5798, + "step": 155 + }, + { + "epoch": 0.13, + "grad_norm": 2.159203936881569, + "learning_rate": 4.954558840848437e-06, + "loss": 0.6206, + "step": 156 + }, + { + "epoch": 0.13, + "grad_norm": 2.185152414039555, + "learning_rate": 4.953936152627338e-06, + "loss": 0.5624, + "step": 157 + }, + { + "epoch": 0.13, + "grad_norm": 2.0679748168992624, + "learning_rate": 4.953309266652701e-06, + "loss": 0.5859, + "step": 158 + }, + { + "epoch": 0.13, + "grad_norm": 2.327237187255128, + "learning_rate": 4.952678183996891e-06, + "loss": 0.5632, + "step": 159 + }, + { + "epoch": 0.13, + "grad_norm": 2.2865519679977417, + "learning_rate": 4.952042905739451e-06, + "loss": 0.6965, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 2.523435408018699, + "learning_rate": 4.9514034329671e-06, + "loss": 0.6217, + "step": 161 + }, + { + "epoch": 0.13, + "grad_norm": 2.4992653226709636, + "learning_rate": 4.950759766773734e-06, + "loss": 0.6175, + "step": 162 + }, + { + "epoch": 0.14, + "grad_norm": 2.432752824777114, + "learning_rate": 4.950111908260423e-06, + "loss": 0.5862, + "step": 163 + }, + { + "epoch": 0.14, + "grad_norm": 2.137500912204061, + "learning_rate": 4.949459858535404e-06, + "loss": 0.6124, + "step": 164 + }, + { + "epoch": 0.14, + "grad_norm": 2.2226376224120474, + "learning_rate": 4.94880361871409e-06, + "loss": 0.5891, + "step": 165 + }, + { + "epoch": 0.14, + "grad_norm": 2.3821839805775165, + "learning_rate": 4.9481431899190544e-06, + "loss": 0.6008, + "step": 166 + }, + { + "epoch": 0.14, + "grad_norm": 2.306242834684614, + "learning_rate": 4.947478573280044e-06, + "loss": 0.6159, + "step": 167 + }, + { + "epoch": 0.14, + "grad_norm": 2.3298092236851518, + "learning_rate": 4.946809769933963e-06, + "loss": 0.5809, + "step": 168 + }, + { + "epoch": 0.14, + "grad_norm": 2.364296499621558, + "learning_rate": 4.946136781024883e-06, + "loss": 0.5895, + "step": 169 + }, + { + "epoch": 0.14, + "grad_norm": 2.237241095609228, + "learning_rate": 4.945459607704029e-06, + "loss": 0.6144, + "step": 170 + }, + { + "epoch": 0.14, + "grad_norm": 2.4027419761972264, + "learning_rate": 4.9447782511297905e-06, + "loss": 0.5985, + "step": 171 + }, + { + "epoch": 0.14, + "grad_norm": 2.1547059182244284, + "learning_rate": 4.944092712467709e-06, + "loss": 0.5763, + "step": 172 + }, + { + "epoch": 0.14, + "grad_norm": 2.1530221667047984, + "learning_rate": 4.9434029928904805e-06, + "loss": 0.5692, + "step": 173 + }, + { + "epoch": 0.14, + "grad_norm": 2.228588593294869, + "learning_rate": 4.942709093577954e-06, + "loss": 0.5896, + "step": 174 + }, + { + "epoch": 0.15, + "grad_norm": 2.1597295307130198, + "learning_rate": 4.942011015717129e-06, + "loss": 0.5864, + "step": 175 + }, + { + "epoch": 0.15, + "grad_norm": 2.321140955498194, + "learning_rate": 4.941308760502149e-06, + "loss": 0.6089, + "step": 176 + }, + { + "epoch": 0.15, + "grad_norm": 2.220124736460707, + "learning_rate": 4.940602329134309e-06, + "loss": 0.5786, + "step": 177 + }, + { + "epoch": 0.15, + "grad_norm": 2.1698038563080417, + "learning_rate": 4.939891722822043e-06, + "loss": 0.5749, + "step": 178 + }, + { + "epoch": 0.15, + "grad_norm": 2.244425969121411, + "learning_rate": 4.93917694278093e-06, + "loss": 0.5877, + "step": 179 + }, + { + "epoch": 0.15, + "grad_norm": 2.143920008069458, + "learning_rate": 4.938457990233687e-06, + "loss": 0.6024, + "step": 180 + }, + { + "epoch": 0.15, + "grad_norm": 2.1786040820345813, + "learning_rate": 4.937734866410169e-06, + "loss": 0.5845, + "step": 181 + }, + { + "epoch": 0.15, + "grad_norm": 2.301832824481007, + "learning_rate": 4.9370075725473665e-06, + "loss": 0.6182, + "step": 182 + }, + { + "epoch": 0.15, + "grad_norm": 2.3748033727083997, + "learning_rate": 4.936276109889403e-06, + "loss": 0.6073, + "step": 183 + }, + { + "epoch": 0.15, + "grad_norm": 2.476334487382023, + "learning_rate": 4.935540479687534e-06, + "loss": 0.5793, + "step": 184 + }, + { + "epoch": 0.15, + "grad_norm": 2.2509466352322494, + "learning_rate": 4.934800683200143e-06, + "loss": 0.6133, + "step": 185 + }, + { + "epoch": 0.15, + "grad_norm": 2.8391697547684873, + "learning_rate": 4.934056721692742e-06, + "loss": 0.5967, + "step": 186 + }, + { + "epoch": 0.16, + "grad_norm": 2.4492364225391765, + "learning_rate": 4.933308596437965e-06, + "loss": 0.5676, + "step": 187 + }, + { + "epoch": 0.16, + "grad_norm": 2.685548141821295, + "learning_rate": 4.932556308715573e-06, + "loss": 0.6069, + "step": 188 + }, + { + "epoch": 0.16, + "grad_norm": 2.261217637824808, + "learning_rate": 4.931799859812443e-06, + "loss": 0.6411, + "step": 189 + }, + { + "epoch": 0.16, + "grad_norm": 2.3838284395200966, + "learning_rate": 4.931039251022573e-06, + "loss": 0.5745, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 2.2550921344466164, + "learning_rate": 4.930274483647074e-06, + "loss": 0.5989, + "step": 191 + }, + { + "epoch": 0.16, + "grad_norm": 2.078406234527636, + "learning_rate": 4.929505558994175e-06, + "loss": 0.5998, + "step": 192 + }, + { + "epoch": 0.16, + "grad_norm": 2.592864566091496, + "learning_rate": 4.928732478379214e-06, + "loss": 0.5842, + "step": 193 + }, + { + "epoch": 0.16, + "grad_norm": 2.092752299259724, + "learning_rate": 4.927955243124638e-06, + "loss": 0.5789, + "step": 194 + }, + { + "epoch": 0.16, + "grad_norm": 2.3799311595696966, + "learning_rate": 4.927173854560002e-06, + "loss": 0.6265, + "step": 195 + }, + { + "epoch": 0.16, + "grad_norm": 2.246876688010602, + "learning_rate": 4.926388314021964e-06, + "loss": 0.6126, + "step": 196 + }, + { + "epoch": 0.16, + "grad_norm": 2.1409898276704578, + "learning_rate": 4.925598622854287e-06, + "loss": 0.6073, + "step": 197 + }, + { + "epoch": 0.16, + "grad_norm": 2.5946158421875385, + "learning_rate": 4.924804782407834e-06, + "loss": 0.6154, + "step": 198 + }, + { + "epoch": 0.16, + "grad_norm": 2.1225494320427982, + "learning_rate": 4.924006794040562e-06, + "loss": 0.583, + "step": 199 + }, + { + "epoch": 0.17, + "grad_norm": 2.1971323526291338, + "learning_rate": 4.923204659117528e-06, + "loss": 0.6078, + "step": 200 + }, + { + "epoch": 0.17, + "grad_norm": 2.289185506404785, + "learning_rate": 4.92239837901088e-06, + "loss": 0.6127, + "step": 201 + }, + { + "epoch": 0.17, + "grad_norm": 2.0071007751625354, + "learning_rate": 4.921587955099858e-06, + "loss": 0.5804, + "step": 202 + }, + { + "epoch": 0.17, + "grad_norm": 2.2981840149068247, + "learning_rate": 4.920773388770789e-06, + "loss": 0.6027, + "step": 203 + }, + { + "epoch": 0.17, + "grad_norm": 2.236179116886702, + "learning_rate": 4.919954681417087e-06, + "loss": 0.6179, + "step": 204 + }, + { + "epoch": 0.17, + "grad_norm": 2.007422589251611, + "learning_rate": 4.91913183443925e-06, + "loss": 0.5647, + "step": 205 + }, + { + "epoch": 0.17, + "grad_norm": 2.1402813555735483, + "learning_rate": 4.918304849244857e-06, + "loss": 0.5841, + "step": 206 + }, + { + "epoch": 0.17, + "grad_norm": 2.0456415785177104, + "learning_rate": 4.917473727248565e-06, + "loss": 0.5524, + "step": 207 + }, + { + "epoch": 0.17, + "grad_norm": 1.9673558126020942, + "learning_rate": 4.916638469872109e-06, + "loss": 0.5698, + "step": 208 + }, + { + "epoch": 0.17, + "grad_norm": 2.015111672496819, + "learning_rate": 4.9157990785442964e-06, + "loss": 0.5957, + "step": 209 + }, + { + "epoch": 0.17, + "grad_norm": 1.9502065547578398, + "learning_rate": 4.9149555547010086e-06, + "loss": 0.5592, + "step": 210 + }, + { + "epoch": 0.17, + "grad_norm": 2.167936522558899, + "learning_rate": 4.9141078997851945e-06, + "loss": 0.5705, + "step": 211 + }, + { + "epoch": 0.18, + "grad_norm": 2.2066587458997935, + "learning_rate": 4.91325611524687e-06, + "loss": 0.5526, + "step": 212 + }, + { + "epoch": 0.18, + "grad_norm": 1.9132995625903553, + "learning_rate": 4.9124002025431136e-06, + "loss": 0.5767, + "step": 213 + }, + { + "epoch": 0.18, + "grad_norm": 2.0097281107801277, + "learning_rate": 4.91154016313807e-06, + "loss": 0.6185, + "step": 214 + }, + { + "epoch": 0.18, + "grad_norm": 2.023532008241332, + "learning_rate": 4.910675998502938e-06, + "loss": 0.6005, + "step": 215 + }, + { + "epoch": 0.18, + "grad_norm": 1.9253831001776973, + "learning_rate": 4.909807710115977e-06, + "loss": 0.5769, + "step": 216 + }, + { + "epoch": 0.18, + "grad_norm": 2.066862408842564, + "learning_rate": 4.908935299462497e-06, + "loss": 0.5671, + "step": 217 + }, + { + "epoch": 0.18, + "grad_norm": 1.9412704290792853, + "learning_rate": 4.908058768034862e-06, + "loss": 0.5568, + "step": 218 + }, + { + "epoch": 0.18, + "grad_norm": 2.185994457097553, + "learning_rate": 4.907178117332487e-06, + "loss": 0.5621, + "step": 219 + }, + { + "epoch": 0.18, + "grad_norm": 2.021517127546353, + "learning_rate": 4.906293348861829e-06, + "loss": 0.5672, + "step": 220 + }, + { + "epoch": 0.18, + "grad_norm": 2.099703967072734, + "learning_rate": 4.905404464136391e-06, + "loss": 0.5366, + "step": 221 + }, + { + "epoch": 0.18, + "grad_norm": 2.030197056583618, + "learning_rate": 4.904511464676718e-06, + "loss": 0.6064, + "step": 222 + }, + { + "epoch": 0.18, + "grad_norm": 2.4170102988954896, + "learning_rate": 4.903614352010393e-06, + "loss": 0.5919, + "step": 223 + }, + { + "epoch": 0.19, + "grad_norm": 2.0819468873015476, + "learning_rate": 4.9027131276720355e-06, + "loss": 0.5366, + "step": 224 + }, + { + "epoch": 0.19, + "grad_norm": 2.148008018153629, + "learning_rate": 4.901807793203299e-06, + "loss": 0.597, + "step": 225 + }, + { + "epoch": 0.19, + "grad_norm": 2.0303725862017186, + "learning_rate": 4.900898350152866e-06, + "loss": 0.6394, + "step": 226 + }, + { + "epoch": 0.19, + "grad_norm": 2.1598989214704334, + "learning_rate": 4.899984800076449e-06, + "loss": 0.5932, + "step": 227 + }, + { + "epoch": 0.19, + "grad_norm": 2.0816312637185255, + "learning_rate": 4.899067144536786e-06, + "loss": 0.5909, + "step": 228 + }, + { + "epoch": 0.19, + "grad_norm": 1.9024067197329315, + "learning_rate": 4.8981453851036365e-06, + "loss": 0.5463, + "step": 229 + }, + { + "epoch": 0.19, + "grad_norm": 2.1830926868871043, + "learning_rate": 4.897219523353781e-06, + "loss": 0.5821, + "step": 230 + }, + { + "epoch": 0.19, + "grad_norm": 2.1156269612794016, + "learning_rate": 4.8962895608710195e-06, + "loss": 0.5993, + "step": 231 + }, + { + "epoch": 0.19, + "grad_norm": 1.9653407654210864, + "learning_rate": 4.895355499246162e-06, + "loss": 0.5525, + "step": 232 + }, + { + "epoch": 0.19, + "grad_norm": 2.367769051061897, + "learning_rate": 4.894417340077036e-06, + "loss": 0.5683, + "step": 233 + }, + { + "epoch": 0.19, + "grad_norm": 2.078327064466567, + "learning_rate": 4.893475084968474e-06, + "loss": 0.6184, + "step": 234 + }, + { + "epoch": 0.19, + "grad_norm": 2.1661882731589475, + "learning_rate": 4.8925287355323195e-06, + "loss": 0.6321, + "step": 235 + }, + { + "epoch": 0.2, + "grad_norm": 2.182760952002799, + "learning_rate": 4.891578293387413e-06, + "loss": 0.6254, + "step": 236 + }, + { + "epoch": 0.2, + "grad_norm": 1.998723579962691, + "learning_rate": 4.890623760159605e-06, + "loss": 0.5371, + "step": 237 + }, + { + "epoch": 0.2, + "grad_norm": 2.319922346931926, + "learning_rate": 4.8896651374817365e-06, + "loss": 0.5941, + "step": 238 + }, + { + "epoch": 0.2, + "grad_norm": 2.090735197217999, + "learning_rate": 4.888702426993648e-06, + "loss": 0.577, + "step": 239 + }, + { + "epoch": 0.2, + "grad_norm": 2.1247199987228558, + "learning_rate": 4.887735630342173e-06, + "loss": 0.5928, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 2.33151114429804, + "learning_rate": 4.8867647491811315e-06, + "loss": 0.5838, + "step": 241 + }, + { + "epoch": 0.2, + "grad_norm": 2.1570026356289147, + "learning_rate": 4.885789785171334e-06, + "loss": 0.5642, + "step": 242 + }, + { + "epoch": 0.2, + "grad_norm": 2.049571197047368, + "learning_rate": 4.884810739980575e-06, + "loss": 0.6684, + "step": 243 + }, + { + "epoch": 0.2, + "grad_norm": 1.9810062424466381, + "learning_rate": 4.883827615283626e-06, + "loss": 0.5942, + "step": 244 + }, + { + "epoch": 0.2, + "grad_norm": 2.145869663660159, + "learning_rate": 4.882840412762244e-06, + "loss": 0.6356, + "step": 245 + }, + { + "epoch": 0.2, + "grad_norm": 2.19290302186514, + "learning_rate": 4.881849134105156e-06, + "loss": 0.6189, + "step": 246 + }, + { + "epoch": 0.2, + "grad_norm": 2.0561043419872984, + "learning_rate": 4.880853781008062e-06, + "loss": 0.5563, + "step": 247 + }, + { + "epoch": 0.21, + "grad_norm": 1.8831183793224635, + "learning_rate": 4.879854355173638e-06, + "loss": 0.5522, + "step": 248 + }, + { + "epoch": 0.21, + "grad_norm": 2.020981606684741, + "learning_rate": 4.878850858311518e-06, + "loss": 0.5548, + "step": 249 + }, + { + "epoch": 0.21, + "grad_norm": 2.060242570493272, + "learning_rate": 4.877843292138307e-06, + "loss": 0.5715, + "step": 250 + }, + { + "epoch": 0.21, + "grad_norm": 2.082455778933014, + "learning_rate": 4.8768316583775665e-06, + "loss": 0.5959, + "step": 251 + }, + { + "epoch": 0.21, + "grad_norm": 1.9830929719438626, + "learning_rate": 4.875815958759819e-06, + "loss": 0.5813, + "step": 252 + }, + { + "epoch": 0.21, + "grad_norm": 1.9772267506828567, + "learning_rate": 4.8747961950225406e-06, + "loss": 0.539, + "step": 253 + }, + { + "epoch": 0.21, + "grad_norm": 2.1492561995002104, + "learning_rate": 4.873772368910161e-06, + "loss": 0.6059, + "step": 254 + }, + { + "epoch": 0.21, + "grad_norm": 2.253757247139787, + "learning_rate": 4.872744482174058e-06, + "loss": 0.5897, + "step": 255 + }, + { + "epoch": 0.21, + "grad_norm": 2.3282624851882496, + "learning_rate": 4.8717125365725545e-06, + "loss": 0.5675, + "step": 256 + }, + { + "epoch": 0.21, + "grad_norm": 2.15573581133063, + "learning_rate": 4.8706765338709185e-06, + "loss": 0.5958, + "step": 257 + }, + { + "epoch": 0.21, + "grad_norm": 2.073289220218241, + "learning_rate": 4.869636475841358e-06, + "loss": 0.6052, + "step": 258 + }, + { + "epoch": 0.21, + "grad_norm": 2.293714090249444, + "learning_rate": 4.8685923642630165e-06, + "loss": 0.5786, + "step": 259 + }, + { + "epoch": 0.22, + "grad_norm": 1.9496544276539172, + "learning_rate": 4.867544200921974e-06, + "loss": 0.6163, + "step": 260 + }, + { + "epoch": 0.22, + "grad_norm": 2.5267016753690132, + "learning_rate": 4.866491987611239e-06, + "loss": 0.6223, + "step": 261 + }, + { + "epoch": 0.22, + "grad_norm": 1.8731249445320794, + "learning_rate": 4.865435726130751e-06, + "loss": 0.5632, + "step": 262 + }, + { + "epoch": 0.22, + "grad_norm": 2.3586331105798863, + "learning_rate": 4.86437541828737e-06, + "loss": 0.5769, + "step": 263 + }, + { + "epoch": 0.22, + "grad_norm": 2.0258106914510585, + "learning_rate": 4.863311065894883e-06, + "loss": 0.6103, + "step": 264 + }, + { + "epoch": 0.22, + "grad_norm": 2.2543614390885955, + "learning_rate": 4.862242670773991e-06, + "loss": 0.5844, + "step": 265 + }, + { + "epoch": 0.22, + "grad_norm": 1.9440299381244668, + "learning_rate": 4.861170234752314e-06, + "loss": 0.5559, + "step": 266 + }, + { + "epoch": 0.22, + "grad_norm": 2.254538268495492, + "learning_rate": 4.8600937596643815e-06, + "loss": 0.5709, + "step": 267 + }, + { + "epoch": 0.22, + "grad_norm": 2.007651746385687, + "learning_rate": 4.8590132473516346e-06, + "loss": 0.573, + "step": 268 + }, + { + "epoch": 0.22, + "grad_norm": 2.0735253118288837, + "learning_rate": 4.857928699662421e-06, + "loss": 0.5954, + "step": 269 + }, + { + "epoch": 0.22, + "grad_norm": 2.024775417101569, + "learning_rate": 4.856840118451989e-06, + "loss": 0.5992, + "step": 270 + }, + { + "epoch": 0.22, + "grad_norm": 2.1043310699945814, + "learning_rate": 4.855747505582488e-06, + "loss": 0.6507, + "step": 271 + }, + { + "epoch": 0.23, + "grad_norm": 2.0386353328313214, + "learning_rate": 4.854650862922965e-06, + "loss": 0.5666, + "step": 272 + }, + { + "epoch": 0.23, + "grad_norm": 1.978698841367705, + "learning_rate": 4.853550192349358e-06, + "loss": 0.5593, + "step": 273 + }, + { + "epoch": 0.23, + "grad_norm": 1.9386534247633986, + "learning_rate": 4.852445495744497e-06, + "loss": 0.5735, + "step": 274 + }, + { + "epoch": 0.23, + "grad_norm": 2.049346245018599, + "learning_rate": 4.8513367749981e-06, + "loss": 0.5415, + "step": 275 + }, + { + "epoch": 0.23, + "grad_norm": 2.1051969521216605, + "learning_rate": 4.850224032006765e-06, + "loss": 0.5532, + "step": 276 + }, + { + "epoch": 0.23, + "grad_norm": 2.2006792558872315, + "learning_rate": 4.849107268673975e-06, + "loss": 0.5696, + "step": 277 + }, + { + "epoch": 0.23, + "grad_norm": 2.0460787736353647, + "learning_rate": 4.847986486910088e-06, + "loss": 0.5658, + "step": 278 + }, + { + "epoch": 0.23, + "grad_norm": 2.1161843259225406, + "learning_rate": 4.846861688632336e-06, + "loss": 0.583, + "step": 279 + }, + { + "epoch": 0.23, + "grad_norm": 1.8882198480393542, + "learning_rate": 4.8457328757648224e-06, + "loss": 0.5693, + "step": 280 + }, + { + "epoch": 0.23, + "grad_norm": 2.1578413701109596, + "learning_rate": 4.844600050238517e-06, + "loss": 0.5409, + "step": 281 + }, + { + "epoch": 0.23, + "grad_norm": 2.03912467778954, + "learning_rate": 4.843463213991255e-06, + "loss": 0.5908, + "step": 282 + }, + { + "epoch": 0.23, + "grad_norm": 2.2333462480826247, + "learning_rate": 4.842322368967731e-06, + "loss": 0.6088, + "step": 283 + }, + { + "epoch": 0.24, + "grad_norm": 2.06698702157327, + "learning_rate": 4.8411775171194986e-06, + "loss": 0.5953, + "step": 284 + }, + { + "epoch": 0.24, + "grad_norm": 2.1433923121572045, + "learning_rate": 4.840028660404964e-06, + "loss": 0.5851, + "step": 285 + }, + { + "epoch": 0.24, + "grad_norm": 2.214858780835041, + "learning_rate": 4.838875800789386e-06, + "loss": 0.5913, + "step": 286 + }, + { + "epoch": 0.24, + "grad_norm": 2.038128612492624, + "learning_rate": 4.837718940244871e-06, + "loss": 0.5827, + "step": 287 + }, + { + "epoch": 0.24, + "grad_norm": 1.9894065096959768, + "learning_rate": 4.836558080750365e-06, + "loss": 0.5769, + "step": 288 + }, + { + "epoch": 0.24, + "grad_norm": 2.1711590153285822, + "learning_rate": 4.835393224291662e-06, + "loss": 0.654, + "step": 289 + }, + { + "epoch": 0.24, + "grad_norm": 2.105004451988696, + "learning_rate": 4.834224372861386e-06, + "loss": 0.6158, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 1.9554568023729102, + "learning_rate": 4.833051528459001e-06, + "loss": 0.5807, + "step": 291 + }, + { + "epoch": 0.24, + "grad_norm": 2.2693917834500312, + "learning_rate": 4.831874693090797e-06, + "loss": 0.5557, + "step": 292 + }, + { + "epoch": 0.24, + "grad_norm": 1.9081391627126192, + "learning_rate": 4.830693868769892e-06, + "loss": 0.6057, + "step": 293 + }, + { + "epoch": 0.24, + "grad_norm": 2.2133664110768585, + "learning_rate": 4.82950905751623e-06, + "loss": 0.6103, + "step": 294 + }, + { + "epoch": 0.24, + "grad_norm": 2.015392814211589, + "learning_rate": 4.8283202613565735e-06, + "loss": 0.5578, + "step": 295 + }, + { + "epoch": 0.25, + "grad_norm": 2.142124020349717, + "learning_rate": 4.8271274823245e-06, + "loss": 0.5675, + "step": 296 + }, + { + "epoch": 0.25, + "grad_norm": 1.981611826462286, + "learning_rate": 4.825930722460405e-06, + "loss": 0.5696, + "step": 297 + }, + { + "epoch": 0.25, + "grad_norm": 1.966759748348117, + "learning_rate": 4.824729983811486e-06, + "loss": 0.58, + "step": 298 + }, + { + "epoch": 0.25, + "grad_norm": 2.0117040369769397, + "learning_rate": 4.823525268431754e-06, + "loss": 0.6005, + "step": 299 + }, + { + "epoch": 0.25, + "grad_norm": 1.9579664917991193, + "learning_rate": 4.822316578382019e-06, + "loss": 0.5472, + "step": 300 + }, + { + "epoch": 0.25, + "grad_norm": 1.9075723479635032, + "learning_rate": 4.821103915729892e-06, + "loss": 0.5834, + "step": 301 + }, + { + "epoch": 0.25, + "grad_norm": 2.289340229011896, + "learning_rate": 4.819887282549777e-06, + "loss": 0.6088, + "step": 302 + }, + { + "epoch": 0.25, + "grad_norm": 2.0410700553735235, + "learning_rate": 4.818666680922874e-06, + "loss": 0.5449, + "step": 303 + }, + { + "epoch": 0.25, + "grad_norm": 2.074434792511819, + "learning_rate": 4.8174421129371675e-06, + "loss": 0.5826, + "step": 304 + }, + { + "epoch": 0.25, + "grad_norm": 2.1377170527698865, + "learning_rate": 4.816213580687428e-06, + "loss": 0.6262, + "step": 305 + }, + { + "epoch": 0.25, + "grad_norm": 2.060340839248083, + "learning_rate": 4.814981086275209e-06, + "loss": 0.5479, + "step": 306 + }, + { + "epoch": 0.25, + "grad_norm": 2.007036467413588, + "learning_rate": 4.813744631808841e-06, + "loss": 0.5642, + "step": 307 + }, + { + "epoch": 0.26, + "grad_norm": 2.016779606220332, + "learning_rate": 4.8125042194034285e-06, + "loss": 0.5503, + "step": 308 + }, + { + "epoch": 0.26, + "grad_norm": 1.930004252757651, + "learning_rate": 4.811259851180845e-06, + "loss": 0.582, + "step": 309 + }, + { + "epoch": 0.26, + "grad_norm": 1.9179477992752856, + "learning_rate": 4.810011529269734e-06, + "loss": 0.5678, + "step": 310 + }, + { + "epoch": 0.26, + "grad_norm": 2.023430757276848, + "learning_rate": 4.808759255805498e-06, + "loss": 0.614, + "step": 311 + }, + { + "epoch": 0.26, + "grad_norm": 1.8334738409404936, + "learning_rate": 4.807503032930306e-06, + "loss": 0.5742, + "step": 312 + }, + { + "epoch": 0.26, + "grad_norm": 1.937332706274502, + "learning_rate": 4.806242862793075e-06, + "loss": 0.6257, + "step": 313 + }, + { + "epoch": 0.26, + "grad_norm": 2.0265383045700363, + "learning_rate": 4.8049787475494786e-06, + "loss": 0.5733, + "step": 314 + }, + { + "epoch": 0.26, + "grad_norm": 2.056444039073761, + "learning_rate": 4.803710689361939e-06, + "loss": 0.578, + "step": 315 + }, + { + "epoch": 0.26, + "grad_norm": 2.411132719183335, + "learning_rate": 4.802438690399622e-06, + "loss": 0.5778, + "step": 316 + }, + { + "epoch": 0.26, + "grad_norm": 2.0233969242222853, + "learning_rate": 4.801162752838436e-06, + "loss": 0.5649, + "step": 317 + }, + { + "epoch": 0.26, + "grad_norm": 2.2809121915132815, + "learning_rate": 4.799882878861025e-06, + "loss": 0.5589, + "step": 318 + }, + { + "epoch": 0.26, + "grad_norm": 1.9806834041020271, + "learning_rate": 4.798599070656768e-06, + "loss": 0.5753, + "step": 319 + }, + { + "epoch": 0.27, + "grad_norm": 2.095099671577702, + "learning_rate": 4.797311330421773e-06, + "loss": 0.5644, + "step": 320 + }, + { + "epoch": 0.27, + "grad_norm": 2.1697606190375764, + "learning_rate": 4.796019660358877e-06, + "loss": 0.6009, + "step": 321 + }, + { + "epoch": 0.27, + "grad_norm": 1.9549416103216173, + "learning_rate": 4.794724062677635e-06, + "loss": 0.5429, + "step": 322 + }, + { + "epoch": 0.27, + "grad_norm": 1.9986949357292838, + "learning_rate": 4.793424539594323e-06, + "loss": 0.5456, + "step": 323 + }, + { + "epoch": 0.27, + "grad_norm": 1.9414831957796765, + "learning_rate": 4.792121093331935e-06, + "loss": 0.5468, + "step": 324 + }, + { + "epoch": 0.27, + "grad_norm": 2.100702188933012, + "learning_rate": 4.7908137261201685e-06, + "loss": 0.5763, + "step": 325 + }, + { + "epoch": 0.27, + "grad_norm": 2.2747471285831025, + "learning_rate": 4.789502440195436e-06, + "loss": 0.5637, + "step": 326 + }, + { + "epoch": 0.27, + "grad_norm": 1.8996382919319124, + "learning_rate": 4.788187237800849e-06, + "loss": 0.5285, + "step": 327 + }, + { + "epoch": 0.27, + "grad_norm": 2.3451495174978847, + "learning_rate": 4.786868121186218e-06, + "loss": 0.5638, + "step": 328 + }, + { + "epoch": 0.27, + "grad_norm": 2.0437536068229565, + "learning_rate": 4.7855450926080535e-06, + "loss": 0.5282, + "step": 329 + }, + { + "epoch": 0.27, + "grad_norm": 2.1185488514745554, + "learning_rate": 4.784218154329555e-06, + "loss": 0.5689, + "step": 330 + }, + { + "epoch": 0.27, + "grad_norm": 2.08745956731504, + "learning_rate": 4.78288730862061e-06, + "loss": 0.5772, + "step": 331 + }, + { + "epoch": 0.28, + "grad_norm": 1.9479507156354359, + "learning_rate": 4.781552557757789e-06, + "loss": 0.5419, + "step": 332 + }, + { + "epoch": 0.28, + "grad_norm": 2.0211480847937255, + "learning_rate": 4.780213904024346e-06, + "loss": 0.5757, + "step": 333 + }, + { + "epoch": 0.28, + "grad_norm": 1.9075335749936069, + "learning_rate": 4.7788713497102094e-06, + "loss": 0.5693, + "step": 334 + }, + { + "epoch": 0.28, + "grad_norm": 1.9590727137410602, + "learning_rate": 4.777524897111979e-06, + "loss": 0.5501, + "step": 335 + }, + { + "epoch": 0.28, + "grad_norm": 2.0328480247612752, + "learning_rate": 4.776174548532926e-06, + "loss": 0.587, + "step": 336 + }, + { + "epoch": 0.28, + "grad_norm": 2.062540517496736, + "learning_rate": 4.774820306282982e-06, + "loss": 0.5819, + "step": 337 + }, + { + "epoch": 0.28, + "grad_norm": 2.0054452800156195, + "learning_rate": 4.773462172678744e-06, + "loss": 0.5529, + "step": 338 + }, + { + "epoch": 0.28, + "grad_norm": 1.9641125644599562, + "learning_rate": 4.772100150043462e-06, + "loss": 0.5895, + "step": 339 + }, + { + "epoch": 0.28, + "grad_norm": 1.9196744569285298, + "learning_rate": 4.77073424070704e-06, + "loss": 0.5504, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 2.0002752186146484, + "learning_rate": 4.76936444700603e-06, + "loss": 0.5307, + "step": 341 + }, + { + "epoch": 0.28, + "grad_norm": 2.1068919823054344, + "learning_rate": 4.76799077128363e-06, + "loss": 0.5908, + "step": 342 + }, + { + "epoch": 0.28, + "grad_norm": 1.919597745459612, + "learning_rate": 4.766613215889678e-06, + "loss": 0.5423, + "step": 343 + }, + { + "epoch": 0.29, + "grad_norm": 2.0670928578728716, + "learning_rate": 4.765231783180648e-06, + "loss": 0.5901, + "step": 344 + }, + { + "epoch": 0.29, + "grad_norm": 1.906116148793229, + "learning_rate": 4.763846475519648e-06, + "loss": 0.5919, + "step": 345 + }, + { + "epoch": 0.29, + "grad_norm": 1.9133575268702454, + "learning_rate": 4.762457295276413e-06, + "loss": 0.585, + "step": 346 + }, + { + "epoch": 0.29, + "grad_norm": 2.133902651855379, + "learning_rate": 4.7610642448273025e-06, + "loss": 0.5444, + "step": 347 + }, + { + "epoch": 0.29, + "grad_norm": 1.95222194640397, + "learning_rate": 4.7596673265552985e-06, + "loss": 0.5941, + "step": 348 + }, + { + "epoch": 0.29, + "grad_norm": 2.095010268380277, + "learning_rate": 4.758266542849997e-06, + "loss": 0.6045, + "step": 349 + }, + { + "epoch": 0.29, + "grad_norm": 2.0493864712059655, + "learning_rate": 4.756861896107609e-06, + "loss": 0.6011, + "step": 350 + }, + { + "epoch": 0.29, + "grad_norm": 1.9222198823064967, + "learning_rate": 4.755453388730949e-06, + "loss": 0.5521, + "step": 351 + }, + { + "epoch": 0.29, + "grad_norm": 2.368147154955994, + "learning_rate": 4.754041023129442e-06, + "loss": 0.6117, + "step": 352 + }, + { + "epoch": 0.29, + "grad_norm": 1.9734596786106697, + "learning_rate": 4.752624801719108e-06, + "loss": 0.5727, + "step": 353 + }, + { + "epoch": 0.29, + "grad_norm": 2.151510566977991, + "learning_rate": 4.751204726922564e-06, + "loss": 0.6085, + "step": 354 + }, + { + "epoch": 0.29, + "grad_norm": 1.9291219072892685, + "learning_rate": 4.74978080116902e-06, + "loss": 0.5655, + "step": 355 + }, + { + "epoch": 0.3, + "grad_norm": 1.838592559018919, + "learning_rate": 4.748353026894273e-06, + "loss": 0.5508, + "step": 356 + }, + { + "epoch": 0.3, + "grad_norm": 2.069156589116884, + "learning_rate": 4.7469214065407e-06, + "loss": 0.5942, + "step": 357 + }, + { + "epoch": 0.3, + "grad_norm": 1.8960817746615841, + "learning_rate": 4.745485942557264e-06, + "loss": 0.5902, + "step": 358 + }, + { + "epoch": 0.3, + "grad_norm": 2.0606557307859634, + "learning_rate": 4.744046637399497e-06, + "loss": 0.556, + "step": 359 + }, + { + "epoch": 0.3, + "grad_norm": 1.9660065879130573, + "learning_rate": 4.742603493529505e-06, + "loss": 0.5364, + "step": 360 + }, + { + "epoch": 0.3, + "grad_norm": 1.9647921383638112, + "learning_rate": 4.741156513415958e-06, + "loss": 0.5601, + "step": 361 + }, + { + "epoch": 0.3, + "grad_norm": 2.049074688423064, + "learning_rate": 4.739705699534092e-06, + "loss": 0.556, + "step": 362 + }, + { + "epoch": 0.3, + "grad_norm": 1.962593945802751, + "learning_rate": 4.738251054365697e-06, + "loss": 0.5609, + "step": 363 + }, + { + "epoch": 0.3, + "grad_norm": 2.059675349950347, + "learning_rate": 4.736792580399119e-06, + "loss": 0.5499, + "step": 364 + }, + { + "epoch": 0.3, + "grad_norm": 1.8479566025134508, + "learning_rate": 4.7353302801292555e-06, + "loss": 0.5621, + "step": 365 + }, + { + "epoch": 0.3, + "grad_norm": 1.9405450724813613, + "learning_rate": 4.733864156057545e-06, + "loss": 0.5437, + "step": 366 + }, + { + "epoch": 0.3, + "grad_norm": 2.122487864033456, + "learning_rate": 4.7323942106919715e-06, + "loss": 0.5984, + "step": 367 + }, + { + "epoch": 0.31, + "grad_norm": 2.6822841144123046, + "learning_rate": 4.730920446547052e-06, + "loss": 0.5951, + "step": 368 + }, + { + "epoch": 0.31, + "grad_norm": 2.001405394086718, + "learning_rate": 4.729442866143838e-06, + "loss": 0.5552, + "step": 369 + }, + { + "epoch": 0.31, + "grad_norm": 2.081154186949651, + "learning_rate": 4.72796147200991e-06, + "loss": 0.587, + "step": 370 + }, + { + "epoch": 0.31, + "grad_norm": 2.1196544292473236, + "learning_rate": 4.72647626667937e-06, + "loss": 0.5882, + "step": 371 + }, + { + "epoch": 0.31, + "grad_norm": 2.107445583509131, + "learning_rate": 4.724987252692841e-06, + "loss": 0.5389, + "step": 372 + }, + { + "epoch": 0.31, + "grad_norm": 1.9529785007256542, + "learning_rate": 4.723494432597462e-06, + "loss": 0.6439, + "step": 373 + }, + { + "epoch": 0.31, + "grad_norm": 2.11513441515607, + "learning_rate": 4.72199780894688e-06, + "loss": 0.6089, + "step": 374 + }, + { + "epoch": 0.31, + "grad_norm": 1.9769899713721226, + "learning_rate": 4.7204973843012504e-06, + "loss": 0.5393, + "step": 375 + }, + { + "epoch": 0.31, + "grad_norm": 2.063749623036316, + "learning_rate": 4.718993161227231e-06, + "loss": 0.5987, + "step": 376 + }, + { + "epoch": 0.31, + "grad_norm": 2.0515862288253883, + "learning_rate": 4.717485142297977e-06, + "loss": 0.5772, + "step": 377 + }, + { + "epoch": 0.31, + "grad_norm": 1.8962297741946081, + "learning_rate": 4.715973330093135e-06, + "loss": 0.5424, + "step": 378 + }, + { + "epoch": 0.31, + "grad_norm": 2.2210958340400087, + "learning_rate": 4.7144577271988435e-06, + "loss": 0.6072, + "step": 379 + }, + { + "epoch": 0.32, + "grad_norm": 2.067113337475314, + "learning_rate": 4.712938336207724e-06, + "loss": 0.5482, + "step": 380 + }, + { + "epoch": 0.32, + "grad_norm": 1.8985489253954526, + "learning_rate": 4.711415159718876e-06, + "loss": 0.5593, + "step": 381 + }, + { + "epoch": 0.32, + "grad_norm": 2.085236381118245, + "learning_rate": 4.709888200337879e-06, + "loss": 0.5704, + "step": 382 + }, + { + "epoch": 0.32, + "grad_norm": 2.0967664183909784, + "learning_rate": 4.708357460676779e-06, + "loss": 0.5997, + "step": 383 + }, + { + "epoch": 0.32, + "grad_norm": 2.0454278026009645, + "learning_rate": 4.706822943354092e-06, + "loss": 0.5669, + "step": 384 + }, + { + "epoch": 0.32, + "grad_norm": 1.9171673309342674, + "learning_rate": 4.705284650994793e-06, + "loss": 0.517, + "step": 385 + }, + { + "epoch": 0.32, + "grad_norm": 2.2003223432761287, + "learning_rate": 4.70374258623032e-06, + "loss": 0.5957, + "step": 386 + }, + { + "epoch": 0.32, + "grad_norm": 1.936392519491186, + "learning_rate": 4.702196751698557e-06, + "loss": 0.5767, + "step": 387 + }, + { + "epoch": 0.32, + "grad_norm": 2.354272003403086, + "learning_rate": 4.700647150043841e-06, + "loss": 0.6515, + "step": 388 + }, + { + "epoch": 0.32, + "grad_norm": 1.9115059027323418, + "learning_rate": 4.699093783916955e-06, + "loss": 0.5579, + "step": 389 + }, + { + "epoch": 0.32, + "grad_norm": 1.9878827587010002, + "learning_rate": 4.697536655975115e-06, + "loss": 0.572, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 1.9729552535473858, + "learning_rate": 4.69597576888198e-06, + "loss": 0.5665, + "step": 391 + }, + { + "epoch": 0.32, + "grad_norm": 2.177634366499155, + "learning_rate": 4.694411125307632e-06, + "loss": 0.6363, + "step": 392 + }, + { + "epoch": 0.33, + "grad_norm": 1.8955146664976508, + "learning_rate": 4.692842727928584e-06, + "loss": 0.5682, + "step": 393 + }, + { + "epoch": 0.33, + "grad_norm": 2.175305874476245, + "learning_rate": 4.691270579427769e-06, + "loss": 0.5943, + "step": 394 + }, + { + "epoch": 0.33, + "grad_norm": 2.068140527232831, + "learning_rate": 4.689694682494537e-06, + "loss": 0.5659, + "step": 395 + }, + { + "epoch": 0.33, + "grad_norm": 1.9112960694448755, + "learning_rate": 4.688115039824648e-06, + "loss": 0.6048, + "step": 396 + }, + { + "epoch": 0.33, + "grad_norm": 1.9778305624626604, + "learning_rate": 4.686531654120272e-06, + "loss": 0.5695, + "step": 397 + }, + { + "epoch": 0.33, + "grad_norm": 2.096904163204813, + "learning_rate": 4.684944528089981e-06, + "loss": 0.6113, + "step": 398 + }, + { + "epoch": 0.33, + "grad_norm": 2.0011934144948516, + "learning_rate": 4.683353664448745e-06, + "loss": 0.5568, + "step": 399 + }, + { + "epoch": 0.33, + "grad_norm": 1.8562851971757464, + "learning_rate": 4.681759065917929e-06, + "loss": 0.5474, + "step": 400 + }, + { + "epoch": 0.33, + "grad_norm": 1.8190547574166316, + "learning_rate": 4.680160735225285e-06, + "loss": 0.5315, + "step": 401 + }, + { + "epoch": 0.33, + "grad_norm": 1.9247862956929132, + "learning_rate": 4.6785586751049505e-06, + "loss": 0.5568, + "step": 402 + }, + { + "epoch": 0.33, + "grad_norm": 1.8469793674077621, + "learning_rate": 4.676952888297442e-06, + "loss": 0.5811, + "step": 403 + }, + { + "epoch": 0.33, + "grad_norm": 1.946943145198674, + "learning_rate": 4.675343377549653e-06, + "loss": 0.5475, + "step": 404 + }, + { + "epoch": 0.34, + "grad_norm": 1.991304422730463, + "learning_rate": 4.6737301456148445e-06, + "loss": 0.5856, + "step": 405 + }, + { + "epoch": 0.34, + "grad_norm": 1.9168241989446437, + "learning_rate": 4.672113195252644e-06, + "loss": 0.6069, + "step": 406 + }, + { + "epoch": 0.34, + "grad_norm": 1.9305433665377905, + "learning_rate": 4.670492529229039e-06, + "loss": 0.5536, + "step": 407 + }, + { + "epoch": 0.34, + "grad_norm": 1.8441008898830742, + "learning_rate": 4.668868150316377e-06, + "loss": 0.5859, + "step": 408 + }, + { + "epoch": 0.34, + "grad_norm": 1.8879301596961315, + "learning_rate": 4.667240061293351e-06, + "loss": 0.5483, + "step": 409 + }, + { + "epoch": 0.34, + "grad_norm": 2.024767417636281, + "learning_rate": 4.665608264945004e-06, + "loss": 0.5414, + "step": 410 + }, + { + "epoch": 0.34, + "grad_norm": 2.1331610141797395, + "learning_rate": 4.663972764062722e-06, + "loss": 0.5811, + "step": 411 + }, + { + "epoch": 0.34, + "grad_norm": 1.8132480265817386, + "learning_rate": 4.662333561444226e-06, + "loss": 0.5573, + "step": 412 + }, + { + "epoch": 0.34, + "grad_norm": 1.9795813972027145, + "learning_rate": 4.6606906598935675e-06, + "loss": 0.5814, + "step": 413 + }, + { + "epoch": 0.34, + "grad_norm": 1.8782931074297053, + "learning_rate": 4.6590440622211295e-06, + "loss": 0.569, + "step": 414 + }, + { + "epoch": 0.34, + "grad_norm": 1.8219945335518706, + "learning_rate": 4.657393771243614e-06, + "loss": 0.5669, + "step": 415 + }, + { + "epoch": 0.34, + "grad_norm": 2.4047268604371306, + "learning_rate": 4.6557397897840454e-06, + "loss": 0.5602, + "step": 416 + }, + { + "epoch": 0.35, + "grad_norm": 2.064501780523946, + "learning_rate": 4.654082120671757e-06, + "loss": 0.5699, + "step": 417 + }, + { + "epoch": 0.35, + "grad_norm": 1.9183128854940252, + "learning_rate": 4.65242076674239e-06, + "loss": 0.6112, + "step": 418 + }, + { + "epoch": 0.35, + "grad_norm": 1.9315698971629633, + "learning_rate": 4.650755730837894e-06, + "loss": 0.5537, + "step": 419 + }, + { + "epoch": 0.35, + "grad_norm": 1.9527809333659218, + "learning_rate": 4.649087015806509e-06, + "loss": 0.5423, + "step": 420 + }, + { + "epoch": 0.35, + "grad_norm": 1.8940523915995442, + "learning_rate": 4.647414624502777e-06, + "loss": 0.5708, + "step": 421 + }, + { + "epoch": 0.35, + "grad_norm": 1.9976964785548623, + "learning_rate": 4.645738559787524e-06, + "loss": 0.6006, + "step": 422 + }, + { + "epoch": 0.35, + "grad_norm": 1.9098681403283917, + "learning_rate": 4.64405882452786e-06, + "loss": 0.5591, + "step": 423 + }, + { + "epoch": 0.35, + "grad_norm": 1.8695612182804557, + "learning_rate": 4.642375421597175e-06, + "loss": 0.5219, + "step": 424 + }, + { + "epoch": 0.35, + "grad_norm": 1.8912077704810082, + "learning_rate": 4.6406883538751315e-06, + "loss": 0.5224, + "step": 425 + }, + { + "epoch": 0.35, + "grad_norm": 1.9390714726978922, + "learning_rate": 4.638997624247664e-06, + "loss": 0.5359, + "step": 426 + }, + { + "epoch": 0.35, + "grad_norm": 2.051545992296337, + "learning_rate": 4.637303235606968e-06, + "loss": 0.544, + "step": 427 + }, + { + "epoch": 0.35, + "grad_norm": 2.0657109136265914, + "learning_rate": 4.6356051908515e-06, + "loss": 0.5429, + "step": 428 + }, + { + "epoch": 0.36, + "grad_norm": 2.0301022307984793, + "learning_rate": 4.63390349288597e-06, + "loss": 0.5787, + "step": 429 + }, + { + "epoch": 0.36, + "grad_norm": 2.052515756169346, + "learning_rate": 4.632198144621338e-06, + "loss": 0.5778, + "step": 430 + }, + { + "epoch": 0.36, + "grad_norm": 1.9741370495474897, + "learning_rate": 4.630489148974807e-06, + "loss": 0.5142, + "step": 431 + }, + { + "epoch": 0.36, + "grad_norm": 1.9713229498863698, + "learning_rate": 4.62877650886982e-06, + "loss": 0.6127, + "step": 432 + }, + { + "epoch": 0.36, + "grad_norm": 2.1609440121306007, + "learning_rate": 4.627060227236055e-06, + "loss": 0.5886, + "step": 433 + }, + { + "epoch": 0.36, + "grad_norm": 1.944966445355139, + "learning_rate": 4.625340307009418e-06, + "loss": 0.5657, + "step": 434 + }, + { + "epoch": 0.36, + "grad_norm": 2.031003925680835, + "learning_rate": 4.623616751132041e-06, + "loss": 0.5628, + "step": 435 + }, + { + "epoch": 0.36, + "grad_norm": 1.8774113373137704, + "learning_rate": 4.621889562552272e-06, + "loss": 0.6068, + "step": 436 + }, + { + "epoch": 0.36, + "grad_norm": 2.0385201543401785, + "learning_rate": 4.620158744224677e-06, + "loss": 0.5511, + "step": 437 + }, + { + "epoch": 0.36, + "grad_norm": 1.8440750841938207, + "learning_rate": 4.618424299110028e-06, + "loss": 0.5261, + "step": 438 + }, + { + "epoch": 0.36, + "grad_norm": 1.8978691755923442, + "learning_rate": 4.616686230175303e-06, + "loss": 0.5862, + "step": 439 + }, + { + "epoch": 0.36, + "grad_norm": 1.8120850246861446, + "learning_rate": 4.614944540393679e-06, + "loss": 0.5652, + "step": 440 + }, + { + "epoch": 0.37, + "grad_norm": 2.1821084695714914, + "learning_rate": 4.613199232744525e-06, + "loss": 0.5598, + "step": 441 + }, + { + "epoch": 0.37, + "grad_norm": 1.9626422737625222, + "learning_rate": 4.611450310213401e-06, + "loss": 0.5267, + "step": 442 + }, + { + "epoch": 0.37, + "grad_norm": 1.9714913234889215, + "learning_rate": 4.6096977757920505e-06, + "loss": 0.5658, + "step": 443 + }, + { + "epoch": 0.37, + "grad_norm": 2.0179324078198233, + "learning_rate": 4.607941632478393e-06, + "loss": 0.582, + "step": 444 + }, + { + "epoch": 0.37, + "grad_norm": 1.8565193856331161, + "learning_rate": 4.6061818832765246e-06, + "loss": 0.5715, + "step": 445 + }, + { + "epoch": 0.37, + "grad_norm": 1.9798501479599246, + "learning_rate": 4.604418531196708e-06, + "loss": 0.6007, + "step": 446 + }, + { + "epoch": 0.37, + "grad_norm": 2.0095846956468257, + "learning_rate": 4.602651579255369e-06, + "loss": 0.5947, + "step": 447 + }, + { + "epoch": 0.37, + "grad_norm": 1.9316541079988245, + "learning_rate": 4.600881030475093e-06, + "loss": 0.5501, + "step": 448 + }, + { + "epoch": 0.37, + "grad_norm": 2.080069353365406, + "learning_rate": 4.599106887884616e-06, + "loss": 0.5631, + "step": 449 + }, + { + "epoch": 0.37, + "grad_norm": 1.965973137652201, + "learning_rate": 4.5973291545188235e-06, + "loss": 0.5267, + "step": 450 + }, + { + "epoch": 0.37, + "grad_norm": 2.1082225966704087, + "learning_rate": 4.595547833418741e-06, + "loss": 0.6418, + "step": 451 + }, + { + "epoch": 0.37, + "grad_norm": 2.0359312594194083, + "learning_rate": 4.593762927631536e-06, + "loss": 0.5644, + "step": 452 + }, + { + "epoch": 0.38, + "grad_norm": 2.1254892914109433, + "learning_rate": 4.591974440210502e-06, + "loss": 0.5693, + "step": 453 + }, + { + "epoch": 0.38, + "grad_norm": 1.9121188587334927, + "learning_rate": 4.590182374215064e-06, + "loss": 0.5572, + "step": 454 + }, + { + "epoch": 0.38, + "grad_norm": 1.9348642624953207, + "learning_rate": 4.588386732710765e-06, + "loss": 0.5446, + "step": 455 + }, + { + "epoch": 0.38, + "grad_norm": 1.8667846547370581, + "learning_rate": 4.5865875187692695e-06, + "loss": 0.5681, + "step": 456 + }, + { + "epoch": 0.38, + "grad_norm": 1.9219061327454674, + "learning_rate": 4.5847847354683465e-06, + "loss": 0.5508, + "step": 457 + }, + { + "epoch": 0.38, + "grad_norm": 1.8106132369123122, + "learning_rate": 4.5829783858918756e-06, + "loss": 0.5626, + "step": 458 + }, + { + "epoch": 0.38, + "grad_norm": 1.7827483964442634, + "learning_rate": 4.5811684731298355e-06, + "loss": 0.5575, + "step": 459 + }, + { + "epoch": 0.38, + "grad_norm": 1.9284196979863513, + "learning_rate": 4.5793550002783e-06, + "loss": 0.5363, + "step": 460 + }, + { + "epoch": 0.38, + "grad_norm": 2.029647468705457, + "learning_rate": 4.577537970439433e-06, + "loss": 0.5415, + "step": 461 + }, + { + "epoch": 0.38, + "grad_norm": 2.0997127029950087, + "learning_rate": 4.575717386721482e-06, + "loss": 0.5814, + "step": 462 + }, + { + "epoch": 0.38, + "grad_norm": 1.9589290300656341, + "learning_rate": 4.573893252238777e-06, + "loss": 0.5156, + "step": 463 + }, + { + "epoch": 0.38, + "grad_norm": 1.905237143908251, + "learning_rate": 4.572065570111717e-06, + "loss": 0.5536, + "step": 464 + }, + { + "epoch": 0.39, + "grad_norm": 1.929519794935609, + "learning_rate": 4.570234343466775e-06, + "loss": 0.5879, + "step": 465 + }, + { + "epoch": 0.39, + "grad_norm": 2.096095808886982, + "learning_rate": 4.568399575436484e-06, + "loss": 0.6241, + "step": 466 + }, + { + "epoch": 0.39, + "grad_norm": 1.9486118894048778, + "learning_rate": 4.566561269159437e-06, + "loss": 0.6307, + "step": 467 + }, + { + "epoch": 0.39, + "grad_norm": 2.0839490306744586, + "learning_rate": 4.564719427780276e-06, + "loss": 0.5655, + "step": 468 + }, + { + "epoch": 0.39, + "grad_norm": 1.9439525665822102, + "learning_rate": 4.562874054449694e-06, + "loss": 0.5437, + "step": 469 + }, + { + "epoch": 0.39, + "grad_norm": 1.9409142791465297, + "learning_rate": 4.5610251523244244e-06, + "loss": 0.6429, + "step": 470 + }, + { + "epoch": 0.39, + "grad_norm": 1.8664574493795525, + "learning_rate": 4.559172724567238e-06, + "loss": 0.5826, + "step": 471 + }, + { + "epoch": 0.39, + "grad_norm": 1.80819349503324, + "learning_rate": 4.557316774346934e-06, + "loss": 0.5372, + "step": 472 + }, + { + "epoch": 0.39, + "grad_norm": 1.8680097526865296, + "learning_rate": 4.555457304838341e-06, + "loss": 0.5503, + "step": 473 + }, + { + "epoch": 0.39, + "grad_norm": 1.7466938790815696, + "learning_rate": 4.553594319222303e-06, + "loss": 0.5425, + "step": 474 + }, + { + "epoch": 0.39, + "grad_norm": 1.9610557658505607, + "learning_rate": 4.551727820685684e-06, + "loss": 0.5755, + "step": 475 + }, + { + "epoch": 0.39, + "grad_norm": 1.9414839604282412, + "learning_rate": 4.549857812421353e-06, + "loss": 0.5915, + "step": 476 + }, + { + "epoch": 0.4, + "grad_norm": 1.8484957644576423, + "learning_rate": 4.547984297628186e-06, + "loss": 0.5676, + "step": 477 + }, + { + "epoch": 0.4, + "grad_norm": 2.074524028551078, + "learning_rate": 4.546107279511055e-06, + "loss": 0.6084, + "step": 478 + }, + { + "epoch": 0.4, + "grad_norm": 2.069692704122282, + "learning_rate": 4.544226761280826e-06, + "loss": 0.5676, + "step": 479 + }, + { + "epoch": 0.4, + "grad_norm": 1.8975472248317244, + "learning_rate": 4.54234274615435e-06, + "loss": 0.5904, + "step": 480 + }, + { + "epoch": 0.4, + "grad_norm": 2.0118868982719897, + "learning_rate": 4.540455237354466e-06, + "loss": 0.5722, + "step": 481 + }, + { + "epoch": 0.4, + "grad_norm": 1.9733105429381828, + "learning_rate": 4.5385642381099814e-06, + "loss": 0.6112, + "step": 482 + }, + { + "epoch": 0.4, + "grad_norm": 1.862156914026863, + "learning_rate": 4.53666975165568e-06, + "loss": 0.5951, + "step": 483 + }, + { + "epoch": 0.4, + "grad_norm": 1.9512940035297868, + "learning_rate": 4.53477178123231e-06, + "loss": 0.5223, + "step": 484 + }, + { + "epoch": 0.4, + "grad_norm": 1.9202464191558823, + "learning_rate": 4.532870330086577e-06, + "loss": 0.5638, + "step": 485 + }, + { + "epoch": 0.4, + "grad_norm": 1.9015767656854419, + "learning_rate": 4.530965401471143e-06, + "loss": 0.5911, + "step": 486 + }, + { + "epoch": 0.4, + "grad_norm": 1.95190921973106, + "learning_rate": 4.529056998644619e-06, + "loss": 0.6053, + "step": 487 + }, + { + "epoch": 0.4, + "grad_norm": 2.0058459596081644, + "learning_rate": 4.527145124871556e-06, + "loss": 0.5466, + "step": 488 + }, + { + "epoch": 0.41, + "grad_norm": 1.8902620959998047, + "learning_rate": 4.5252297834224454e-06, + "loss": 0.5526, + "step": 489 + }, + { + "epoch": 0.41, + "grad_norm": 1.985466416169018, + "learning_rate": 4.523310977573711e-06, + "loss": 0.5958, + "step": 490 + }, + { + "epoch": 0.41, + "grad_norm": 2.1140148957176415, + "learning_rate": 4.521388710607699e-06, + "loss": 0.613, + "step": 491 + }, + { + "epoch": 0.41, + "grad_norm": 1.9470601192089525, + "learning_rate": 4.51946298581268e-06, + "loss": 0.5847, + "step": 492 + }, + { + "epoch": 0.41, + "grad_norm": 2.0227057176069603, + "learning_rate": 4.51753380648284e-06, + "loss": 0.5784, + "step": 493 + }, + { + "epoch": 0.41, + "grad_norm": 2.05501863673554, + "learning_rate": 4.515601175918269e-06, + "loss": 0.5501, + "step": 494 + }, + { + "epoch": 0.41, + "grad_norm": 2.0129325402811715, + "learning_rate": 4.513665097424967e-06, + "loss": 0.5641, + "step": 495 + }, + { + "epoch": 0.41, + "grad_norm": 2.0322333044110468, + "learning_rate": 4.51172557431483e-06, + "loss": 0.5422, + "step": 496 + }, + { + "epoch": 0.41, + "grad_norm": 1.9573055659958774, + "learning_rate": 4.509782609905644e-06, + "loss": 0.516, + "step": 497 + }, + { + "epoch": 0.41, + "grad_norm": 1.8223127451485421, + "learning_rate": 4.507836207521085e-06, + "loss": 0.5714, + "step": 498 + }, + { + "epoch": 0.41, + "grad_norm": 1.9343089861079434, + "learning_rate": 4.50588637049071e-06, + "loss": 0.5424, + "step": 499 + }, + { + "epoch": 0.41, + "grad_norm": 1.8940990649350729, + "learning_rate": 4.503933102149948e-06, + "loss": 0.5832, + "step": 500 + }, + { + "epoch": 0.42, + "grad_norm": 1.908617301933682, + "learning_rate": 4.501976405840101e-06, + "loss": 0.5399, + "step": 501 + }, + { + "epoch": 0.42, + "grad_norm": 1.8290259512093785, + "learning_rate": 4.500016284908334e-06, + "loss": 0.5561, + "step": 502 + }, + { + "epoch": 0.42, + "grad_norm": 1.9840280991844164, + "learning_rate": 4.49805274270767e-06, + "loss": 0.5645, + "step": 503 + }, + { + "epoch": 0.42, + "grad_norm": 1.9864953051636856, + "learning_rate": 4.496085782596984e-06, + "loss": 0.5369, + "step": 504 + }, + { + "epoch": 0.42, + "grad_norm": 1.979387839103732, + "learning_rate": 4.494115407940999e-06, + "loss": 0.6196, + "step": 505 + }, + { + "epoch": 0.42, + "grad_norm": 1.9266869362165981, + "learning_rate": 4.492141622110279e-06, + "loss": 0.5687, + "step": 506 + }, + { + "epoch": 0.42, + "grad_norm": 1.9887461782376619, + "learning_rate": 4.4901644284812205e-06, + "loss": 0.5264, + "step": 507 + }, + { + "epoch": 0.42, + "grad_norm": 1.8717867803152208, + "learning_rate": 4.488183830436052e-06, + "loss": 0.5612, + "step": 508 + }, + { + "epoch": 0.42, + "grad_norm": 2.0044226171493, + "learning_rate": 4.486199831362828e-06, + "loss": 0.5571, + "step": 509 + }, + { + "epoch": 0.42, + "grad_norm": 2.1075571016617958, + "learning_rate": 4.484212434655414e-06, + "loss": 0.5642, + "step": 510 + }, + { + "epoch": 0.42, + "grad_norm": 1.8031612547539957, + "learning_rate": 4.482221643713494e-06, + "loss": 0.5805, + "step": 511 + }, + { + "epoch": 0.42, + "grad_norm": 1.8782516337672304, + "learning_rate": 4.480227461942556e-06, + "loss": 0.5596, + "step": 512 + }, + { + "epoch": 0.43, + "grad_norm": 2.075073901596185, + "learning_rate": 4.478229892753886e-06, + "loss": 0.6124, + "step": 513 + }, + { + "epoch": 0.43, + "grad_norm": 2.0588983460568304, + "learning_rate": 4.47622893956457e-06, + "loss": 0.5589, + "step": 514 + }, + { + "epoch": 0.43, + "grad_norm": 1.850248236464706, + "learning_rate": 4.474224605797476e-06, + "loss": 0.5603, + "step": 515 + }, + { + "epoch": 0.43, + "grad_norm": 1.932844310652863, + "learning_rate": 4.472216894881261e-06, + "loss": 0.5571, + "step": 516 + }, + { + "epoch": 0.43, + "grad_norm": 2.09975454805468, + "learning_rate": 4.470205810250357e-06, + "loss": 0.5975, + "step": 517 + }, + { + "epoch": 0.43, + "grad_norm": 1.9694087093010304, + "learning_rate": 4.468191355344965e-06, + "loss": 0.5698, + "step": 518 + }, + { + "epoch": 0.43, + "grad_norm": 1.8794788153917539, + "learning_rate": 4.466173533611053e-06, + "loss": 0.5559, + "step": 519 + }, + { + "epoch": 0.43, + "grad_norm": 2.0650455557855434, + "learning_rate": 4.46415234850035e-06, + "loss": 0.5644, + "step": 520 + }, + { + "epoch": 0.43, + "grad_norm": 2.0062649027982022, + "learning_rate": 4.462127803470334e-06, + "loss": 0.608, + "step": 521 + }, + { + "epoch": 0.43, + "grad_norm": 2.043267877462657, + "learning_rate": 4.460099901984235e-06, + "loss": 0.573, + "step": 522 + }, + { + "epoch": 0.43, + "grad_norm": 2.056372436619027, + "learning_rate": 4.4580686475110235e-06, + "loss": 0.5748, + "step": 523 + }, + { + "epoch": 0.43, + "grad_norm": 1.8871033520138176, + "learning_rate": 4.456034043525404e-06, + "loss": 0.5339, + "step": 524 + }, + { + "epoch": 0.44, + "grad_norm": 1.889474616209236, + "learning_rate": 4.45399609350781e-06, + "loss": 0.5185, + "step": 525 + }, + { + "epoch": 0.44, + "grad_norm": 1.9767406217632912, + "learning_rate": 4.451954800944405e-06, + "loss": 0.5758, + "step": 526 + }, + { + "epoch": 0.44, + "grad_norm": 1.9588695861513832, + "learning_rate": 4.449910169327062e-06, + "loss": 0.5472, + "step": 527 + }, + { + "epoch": 0.44, + "grad_norm": 1.8852210889000718, + "learning_rate": 4.447862202153372e-06, + "loss": 0.5917, + "step": 528 + }, + { + "epoch": 0.44, + "grad_norm": 2.0103638871993077, + "learning_rate": 4.445810902926629e-06, + "loss": 0.5761, + "step": 529 + }, + { + "epoch": 0.44, + "grad_norm": 2.201836945389513, + "learning_rate": 4.443756275155827e-06, + "loss": 0.5614, + "step": 530 + }, + { + "epoch": 0.44, + "grad_norm": 1.900702305836831, + "learning_rate": 4.441698322355656e-06, + "loss": 0.5254, + "step": 531 + }, + { + "epoch": 0.44, + "grad_norm": 2.134694583439314, + "learning_rate": 4.4396370480464915e-06, + "loss": 0.5607, + "step": 532 + }, + { + "epoch": 0.44, + "grad_norm": 1.8073751630381198, + "learning_rate": 4.437572455754391e-06, + "loss": 0.536, + "step": 533 + }, + { + "epoch": 0.44, + "grad_norm": 1.9607338020142653, + "learning_rate": 4.435504549011088e-06, + "loss": 0.59, + "step": 534 + }, + { + "epoch": 0.44, + "grad_norm": 2.0756430867435274, + "learning_rate": 4.433433331353988e-06, + "loss": 0.5538, + "step": 535 + }, + { + "epoch": 0.44, + "grad_norm": 1.8280570853718465, + "learning_rate": 4.431358806326158e-06, + "loss": 0.5789, + "step": 536 + }, + { + "epoch": 0.45, + "grad_norm": 2.2005143967434977, + "learning_rate": 4.429280977476321e-06, + "loss": 0.545, + "step": 537 + }, + { + "epoch": 0.45, + "grad_norm": 1.896479397543979, + "learning_rate": 4.4271998483588565e-06, + "loss": 0.5791, + "step": 538 + }, + { + "epoch": 0.45, + "grad_norm": 2.117773381781195, + "learning_rate": 4.425115422533785e-06, + "loss": 0.5234, + "step": 539 + }, + { + "epoch": 0.45, + "grad_norm": 2.4438942429566617, + "learning_rate": 4.423027703566769e-06, + "loss": 0.5692, + "step": 540 + }, + { + "epoch": 0.45, + "grad_norm": 1.873481152225171, + "learning_rate": 4.4209366950291025e-06, + "loss": 0.5739, + "step": 541 + }, + { + "epoch": 0.45, + "grad_norm": 1.8655199147974673, + "learning_rate": 4.4188424004977085e-06, + "loss": 0.5795, + "step": 542 + }, + { + "epoch": 0.45, + "grad_norm": 1.948840412241188, + "learning_rate": 4.416744823555129e-06, + "loss": 0.5304, + "step": 543 + }, + { + "epoch": 0.45, + "grad_norm": 1.8389034133315045, + "learning_rate": 4.414643967789523e-06, + "loss": 0.5076, + "step": 544 + }, + { + "epoch": 0.45, + "grad_norm": 1.8269235720085213, + "learning_rate": 4.412539836794657e-06, + "loss": 0.5837, + "step": 545 + }, + { + "epoch": 0.45, + "grad_norm": 2.1298715969759505, + "learning_rate": 4.410432434169902e-06, + "loss": 0.5694, + "step": 546 + }, + { + "epoch": 0.45, + "grad_norm": 2.0057741366005746, + "learning_rate": 4.408321763520223e-06, + "loss": 0.557, + "step": 547 + }, + { + "epoch": 0.45, + "grad_norm": 1.7901331374893255, + "learning_rate": 4.406207828456177e-06, + "loss": 0.5746, + "step": 548 + }, + { + "epoch": 0.46, + "grad_norm": 2.1994839889416187, + "learning_rate": 4.404090632593904e-06, + "loss": 0.5407, + "step": 549 + }, + { + "epoch": 0.46, + "grad_norm": 1.9664921082690268, + "learning_rate": 4.401970179555123e-06, + "loss": 0.5322, + "step": 550 + }, + { + "epoch": 0.46, + "grad_norm": 1.9933486180243851, + "learning_rate": 4.399846472967124e-06, + "loss": 0.5798, + "step": 551 + }, + { + "epoch": 0.46, + "grad_norm": 1.986612256562151, + "learning_rate": 4.397719516462765e-06, + "loss": 0.5213, + "step": 552 + }, + { + "epoch": 0.46, + "grad_norm": 2.046550123292336, + "learning_rate": 4.395589313680459e-06, + "loss": 0.5857, + "step": 553 + }, + { + "epoch": 0.46, + "grad_norm": 1.7902327250340486, + "learning_rate": 4.393455868264176e-06, + "loss": 0.555, + "step": 554 + }, + { + "epoch": 0.46, + "grad_norm": 2.0203627138517146, + "learning_rate": 4.391319183863432e-06, + "loss": 0.6329, + "step": 555 + }, + { + "epoch": 0.46, + "grad_norm": 1.9373549045181289, + "learning_rate": 4.389179264133281e-06, + "loss": 0.566, + "step": 556 + }, + { + "epoch": 0.46, + "grad_norm": 1.8936753353678124, + "learning_rate": 4.387036112734316e-06, + "loss": 0.5555, + "step": 557 + }, + { + "epoch": 0.46, + "grad_norm": 1.8493817575820743, + "learning_rate": 4.3848897333326545e-06, + "loss": 0.5427, + "step": 558 + }, + { + "epoch": 0.46, + "grad_norm": 1.9119588677783816, + "learning_rate": 4.382740129599937e-06, + "loss": 0.5157, + "step": 559 + }, + { + "epoch": 0.46, + "grad_norm": 1.8190137094200924, + "learning_rate": 4.380587305213321e-06, + "loss": 0.503, + "step": 560 + }, + { + "epoch": 0.47, + "grad_norm": 1.9891332712764953, + "learning_rate": 4.37843126385547e-06, + "loss": 0.5761, + "step": 561 + }, + { + "epoch": 0.47, + "grad_norm": 1.8620896547461154, + "learning_rate": 4.376272009214555e-06, + "loss": 0.5259, + "step": 562 + }, + { + "epoch": 0.47, + "grad_norm": 1.8896721756477406, + "learning_rate": 4.37410954498424e-06, + "loss": 0.5632, + "step": 563 + }, + { + "epoch": 0.47, + "grad_norm": 1.8302281976781984, + "learning_rate": 4.37194387486368e-06, + "loss": 0.5612, + "step": 564 + }, + { + "epoch": 0.47, + "grad_norm": 2.0721820586440165, + "learning_rate": 4.369775002557516e-06, + "loss": 0.533, + "step": 565 + }, + { + "epoch": 0.47, + "grad_norm": 1.8259926551813157, + "learning_rate": 4.367602931775865e-06, + "loss": 0.526, + "step": 566 + }, + { + "epoch": 0.47, + "grad_norm": 1.8096334574000785, + "learning_rate": 4.3654276662343155e-06, + "loss": 0.5306, + "step": 567 + }, + { + "epoch": 0.47, + "grad_norm": 1.9675637591445598, + "learning_rate": 4.363249209653922e-06, + "loss": 0.5577, + "step": 568 + }, + { + "epoch": 0.47, + "grad_norm": 1.8800389115841605, + "learning_rate": 4.361067565761197e-06, + "loss": 0.5553, + "step": 569 + }, + { + "epoch": 0.47, + "grad_norm": 1.827485496395265, + "learning_rate": 4.358882738288105e-06, + "loss": 0.5587, + "step": 570 + }, + { + "epoch": 0.47, + "grad_norm": 1.820954908943235, + "learning_rate": 4.356694730972056e-06, + "loss": 0.6186, + "step": 571 + }, + { + "epoch": 0.47, + "grad_norm": 1.952072431699686, + "learning_rate": 4.3545035475559025e-06, + "loss": 0.5488, + "step": 572 + }, + { + "epoch": 0.48, + "grad_norm": 1.8292648968688423, + "learning_rate": 4.352309191787924e-06, + "loss": 0.5534, + "step": 573 + }, + { + "epoch": 0.48, + "grad_norm": 1.826293122529813, + "learning_rate": 4.350111667421835e-06, + "loss": 0.5872, + "step": 574 + }, + { + "epoch": 0.48, + "grad_norm": 1.9251425791166785, + "learning_rate": 4.347910978216763e-06, + "loss": 0.5298, + "step": 575 + }, + { + "epoch": 0.48, + "grad_norm": 1.8330818196811385, + "learning_rate": 4.345707127937253e-06, + "loss": 0.5871, + "step": 576 + }, + { + "epoch": 0.48, + "grad_norm": 1.7842986545873851, + "learning_rate": 4.3435001203532555e-06, + "loss": 0.4898, + "step": 577 + }, + { + "epoch": 0.48, + "grad_norm": 1.8778666245156521, + "learning_rate": 4.341289959240124e-06, + "loss": 0.5385, + "step": 578 + }, + { + "epoch": 0.48, + "grad_norm": 1.9300679499181266, + "learning_rate": 4.339076648378605e-06, + "loss": 0.5698, + "step": 579 + }, + { + "epoch": 0.48, + "grad_norm": 1.9440861965960357, + "learning_rate": 4.336860191554833e-06, + "loss": 0.5984, + "step": 580 + }, + { + "epoch": 0.48, + "grad_norm": 1.929951096053947, + "learning_rate": 4.3346405925603265e-06, + "loss": 0.6222, + "step": 581 + }, + { + "epoch": 0.48, + "grad_norm": 1.9138258400335695, + "learning_rate": 4.332417855191974e-06, + "loss": 0.5498, + "step": 582 + }, + { + "epoch": 0.48, + "grad_norm": 2.058548455869675, + "learning_rate": 4.330191983252039e-06, + "loss": 0.5218, + "step": 583 + }, + { + "epoch": 0.48, + "grad_norm": 2.243429045583125, + "learning_rate": 4.327962980548142e-06, + "loss": 0.5768, + "step": 584 + }, + { + "epoch": 0.48, + "grad_norm": 1.9213537104634244, + "learning_rate": 4.32573085089326e-06, + "loss": 0.5784, + "step": 585 + }, + { + "epoch": 0.49, + "grad_norm": 1.9165291289119128, + "learning_rate": 4.32349559810572e-06, + "loss": 0.5697, + "step": 586 + }, + { + "epoch": 0.49, + "grad_norm": 1.9674279518735756, + "learning_rate": 4.321257226009193e-06, + "loss": 0.5104, + "step": 587 + }, + { + "epoch": 0.49, + "grad_norm": 1.9051339015323923, + "learning_rate": 4.319015738432683e-06, + "loss": 0.5711, + "step": 588 + }, + { + "epoch": 0.49, + "grad_norm": 1.957357618850765, + "learning_rate": 4.3167711392105245e-06, + "loss": 0.5854, + "step": 589 + }, + { + "epoch": 0.49, + "grad_norm": 1.9859311708308915, + "learning_rate": 4.314523432182376e-06, + "loss": 0.547, + "step": 590 + }, + { + "epoch": 0.49, + "grad_norm": 1.773704456523191, + "learning_rate": 4.312272621193209e-06, + "loss": 0.5259, + "step": 591 + }, + { + "epoch": 0.49, + "grad_norm": 1.82988033655793, + "learning_rate": 4.31001871009331e-06, + "loss": 0.5209, + "step": 592 + }, + { + "epoch": 0.49, + "grad_norm": 1.8925134832060522, + "learning_rate": 4.307761702738264e-06, + "loss": 0.59, + "step": 593 + }, + { + "epoch": 0.49, + "grad_norm": 1.8477075780641046, + "learning_rate": 4.305501602988953e-06, + "loss": 0.5714, + "step": 594 + }, + { + "epoch": 0.49, + "grad_norm": 1.8568432886623798, + "learning_rate": 4.303238414711552e-06, + "loss": 0.5877, + "step": 595 + }, + { + "epoch": 0.49, + "grad_norm": 1.8179798660158206, + "learning_rate": 4.3009721417775166e-06, + "loss": 0.6029, + "step": 596 + }, + { + "epoch": 0.49, + "grad_norm": 1.8494963193854803, + "learning_rate": 4.29870278806358e-06, + "loss": 0.5236, + "step": 597 + }, + { + "epoch": 0.5, + "grad_norm": 1.9586017397154731, + "learning_rate": 4.296430357451744e-06, + "loss": 0.5998, + "step": 598 + }, + { + "epoch": 0.5, + "grad_norm": 1.926616057974202, + "learning_rate": 4.2941548538292765e-06, + "loss": 0.5914, + "step": 599 + }, + { + "epoch": 0.5, + "grad_norm": 1.9321738359144827, + "learning_rate": 4.291876281088701e-06, + "loss": 0.5358, + "step": 600 + }, + { + "epoch": 0.5, + "grad_norm": 1.8229177571361932, + "learning_rate": 4.289594643127788e-06, + "loss": 0.5284, + "step": 601 + }, + { + "epoch": 0.5, + "grad_norm": 1.849252449531427, + "learning_rate": 4.287309943849558e-06, + "loss": 0.5689, + "step": 602 + }, + { + "epoch": 0.5, + "grad_norm": 1.985343175388319, + "learning_rate": 4.285022187162261e-06, + "loss": 0.6101, + "step": 603 + }, + { + "epoch": 0.5, + "grad_norm": 1.9437791826489255, + "learning_rate": 4.2827313769793835e-06, + "loss": 0.5419, + "step": 604 + }, + { + "epoch": 0.5, + "grad_norm": 1.8027421078538746, + "learning_rate": 4.28043751721963e-06, + "loss": 0.5504, + "step": 605 + }, + { + "epoch": 0.5, + "grad_norm": 1.8221230935939319, + "learning_rate": 4.278140611806926e-06, + "loss": 0.5284, + "step": 606 + }, + { + "epoch": 0.5, + "grad_norm": 1.8597205853821357, + "learning_rate": 4.275840664670403e-06, + "loss": 0.623, + "step": 607 + }, + { + "epoch": 0.5, + "grad_norm": 1.7801370844338822, + "learning_rate": 4.2735376797444e-06, + "loss": 0.5265, + "step": 608 + }, + { + "epoch": 0.5, + "grad_norm": 1.9028094416250234, + "learning_rate": 4.271231660968449e-06, + "loss": 0.5764, + "step": 609 + }, + { + "epoch": 0.51, + "grad_norm": 1.9385737581380094, + "learning_rate": 4.268922612287273e-06, + "loss": 0.6047, + "step": 610 + }, + { + "epoch": 0.51, + "grad_norm": 1.760006169733744, + "learning_rate": 4.266610537650778e-06, + "loss": 0.4944, + "step": 611 + }, + { + "epoch": 0.51, + "grad_norm": 1.857083980479501, + "learning_rate": 4.264295441014047e-06, + "loss": 0.5174, + "step": 612 + }, + { + "epoch": 0.51, + "grad_norm": 1.8299942480819913, + "learning_rate": 4.261977326337332e-06, + "loss": 0.5814, + "step": 613 + }, + { + "epoch": 0.51, + "grad_norm": 1.8943903433033418, + "learning_rate": 4.259656197586046e-06, + "loss": 0.5514, + "step": 614 + }, + { + "epoch": 0.51, + "grad_norm": 1.7839062839610529, + "learning_rate": 4.257332058730761e-06, + "loss": 0.5857, + "step": 615 + }, + { + "epoch": 0.51, + "grad_norm": 2.7188975139736256, + "learning_rate": 4.255004913747196e-06, + "loss": 0.5509, + "step": 616 + }, + { + "epoch": 0.51, + "grad_norm": 1.8767461602206779, + "learning_rate": 4.252674766616212e-06, + "loss": 0.5038, + "step": 617 + }, + { + "epoch": 0.51, + "grad_norm": 1.8391588901867753, + "learning_rate": 4.250341621323809e-06, + "loss": 0.5196, + "step": 618 + }, + { + "epoch": 0.51, + "grad_norm": 1.8106924420187829, + "learning_rate": 4.248005481861111e-06, + "loss": 0.5458, + "step": 619 + }, + { + "epoch": 0.51, + "grad_norm": 1.9698953511074666, + "learning_rate": 4.245666352224367e-06, + "loss": 0.5963, + "step": 620 + }, + { + "epoch": 0.51, + "grad_norm": 1.8890424031569348, + "learning_rate": 4.243324236414939e-06, + "loss": 0.5277, + "step": 621 + }, + { + "epoch": 0.52, + "grad_norm": 1.8537879418167673, + "learning_rate": 4.240979138439301e-06, + "loss": 0.5407, + "step": 622 + }, + { + "epoch": 0.52, + "grad_norm": 1.9264981771759184, + "learning_rate": 4.238631062309023e-06, + "loss": 0.5788, + "step": 623 + }, + { + "epoch": 0.52, + "grad_norm": 1.949693389062837, + "learning_rate": 4.236280012040773e-06, + "loss": 0.5007, + "step": 624 + }, + { + "epoch": 0.52, + "grad_norm": 1.8845778025905608, + "learning_rate": 4.233925991656307e-06, + "loss": 0.5905, + "step": 625 + }, + { + "epoch": 0.52, + "grad_norm": 1.8977167810192608, + "learning_rate": 4.231569005182459e-06, + "loss": 0.5342, + "step": 626 + }, + { + "epoch": 0.52, + "grad_norm": 1.9579196623045914, + "learning_rate": 4.229209056651139e-06, + "loss": 0.554, + "step": 627 + }, + { + "epoch": 0.52, + "grad_norm": 1.8427820272426025, + "learning_rate": 4.226846150099324e-06, + "loss": 0.5629, + "step": 628 + }, + { + "epoch": 0.52, + "grad_norm": 1.865218131227253, + "learning_rate": 4.22448028956905e-06, + "loss": 0.558, + "step": 629 + }, + { + "epoch": 0.52, + "grad_norm": 1.7348773966225364, + "learning_rate": 4.222111479107406e-06, + "loss": 0.5332, + "step": 630 + }, + { + "epoch": 0.52, + "grad_norm": 1.779367140127678, + "learning_rate": 4.219739722766528e-06, + "loss": 0.569, + "step": 631 + }, + { + "epoch": 0.52, + "grad_norm": 1.92860570712595, + "learning_rate": 4.217365024603592e-06, + "loss": 0.5342, + "step": 632 + }, + { + "epoch": 0.52, + "grad_norm": 1.946965997476449, + "learning_rate": 4.214987388680804e-06, + "loss": 0.5482, + "step": 633 + }, + { + "epoch": 0.53, + "grad_norm": 1.7930454990298659, + "learning_rate": 4.212606819065399e-06, + "loss": 0.5376, + "step": 634 + }, + { + "epoch": 0.53, + "grad_norm": 1.8379498458279013, + "learning_rate": 4.210223319829626e-06, + "loss": 0.5741, + "step": 635 + }, + { + "epoch": 0.53, + "grad_norm": 1.742977498596499, + "learning_rate": 4.207836895050748e-06, + "loss": 0.5569, + "step": 636 + }, + { + "epoch": 0.53, + "grad_norm": 1.852541709372898, + "learning_rate": 4.205447548811032e-06, + "loss": 0.578, + "step": 637 + }, + { + "epoch": 0.53, + "grad_norm": 1.8180259569107267, + "learning_rate": 4.203055285197745e-06, + "loss": 0.5189, + "step": 638 + }, + { + "epoch": 0.53, + "grad_norm": 1.8177842562763082, + "learning_rate": 4.20066010830314e-06, + "loss": 0.5424, + "step": 639 + }, + { + "epoch": 0.53, + "grad_norm": 1.8068654723170434, + "learning_rate": 4.198262022224457e-06, + "loss": 0.5336, + "step": 640 + }, + { + "epoch": 0.53, + "grad_norm": 1.9664843499052276, + "learning_rate": 4.195861031063909e-06, + "loss": 0.5399, + "step": 641 + }, + { + "epoch": 0.53, + "grad_norm": 1.7812265481792608, + "learning_rate": 4.193457138928683e-06, + "loss": 0.534, + "step": 642 + }, + { + "epoch": 0.53, + "grad_norm": 1.908377487778027, + "learning_rate": 4.191050349930925e-06, + "loss": 0.5831, + "step": 643 + }, + { + "epoch": 0.53, + "grad_norm": 1.8124678634933105, + "learning_rate": 4.18864066818774e-06, + "loss": 0.5309, + "step": 644 + }, + { + "epoch": 0.53, + "grad_norm": 1.902443199964304, + "learning_rate": 4.186228097821176e-06, + "loss": 0.5452, + "step": 645 + }, + { + "epoch": 0.54, + "grad_norm": 1.9694387068719457, + "learning_rate": 4.183812642958227e-06, + "loss": 0.5462, + "step": 646 + }, + { + "epoch": 0.54, + "grad_norm": 1.945352264767711, + "learning_rate": 4.181394307730819e-06, + "loss": 0.4853, + "step": 647 + }, + { + "epoch": 0.54, + "grad_norm": 1.7967416728436914, + "learning_rate": 4.178973096275806e-06, + "loss": 0.5952, + "step": 648 + }, + { + "epoch": 0.54, + "grad_norm": 2.0602433101771616, + "learning_rate": 4.176549012734963e-06, + "loss": 0.6346, + "step": 649 + }, + { + "epoch": 0.54, + "grad_norm": 1.9158731498204968, + "learning_rate": 4.1741220612549746e-06, + "loss": 0.5101, + "step": 650 + }, + { + "epoch": 0.54, + "grad_norm": 1.951875972207364, + "learning_rate": 4.171692245987436e-06, + "loss": 0.5718, + "step": 651 + }, + { + "epoch": 0.54, + "grad_norm": 1.871788727804539, + "learning_rate": 4.169259571088839e-06, + "loss": 0.5516, + "step": 652 + }, + { + "epoch": 0.54, + "grad_norm": 1.945571804366465, + "learning_rate": 4.166824040720566e-06, + "loss": 0.5544, + "step": 653 + }, + { + "epoch": 0.54, + "grad_norm": 1.8975723622706568, + "learning_rate": 4.1643856590488866e-06, + "loss": 0.5643, + "step": 654 + }, + { + "epoch": 0.54, + "grad_norm": 1.9772846459626554, + "learning_rate": 4.161944430244945e-06, + "loss": 0.5487, + "step": 655 + }, + { + "epoch": 0.54, + "grad_norm": 2.036472038769578, + "learning_rate": 4.159500358484759e-06, + "loss": 0.5232, + "step": 656 + }, + { + "epoch": 0.54, + "grad_norm": 1.7742095436926848, + "learning_rate": 4.157053447949206e-06, + "loss": 0.4963, + "step": 657 + }, + { + "epoch": 0.55, + "grad_norm": 2.1819742476725814, + "learning_rate": 4.154603702824023e-06, + "loss": 0.5416, + "step": 658 + }, + { + "epoch": 0.55, + "grad_norm": 1.9151345309457093, + "learning_rate": 4.152151127299794e-06, + "loss": 0.5822, + "step": 659 + }, + { + "epoch": 0.55, + "grad_norm": 2.033640859083771, + "learning_rate": 4.149695725571944e-06, + "loss": 0.5876, + "step": 660 + }, + { + "epoch": 0.55, + "grad_norm": 1.8935471013235925, + "learning_rate": 4.147237501840734e-06, + "loss": 0.548, + "step": 661 + }, + { + "epoch": 0.55, + "grad_norm": 1.7836299476774775, + "learning_rate": 4.144776460311253e-06, + "loss": 0.5274, + "step": 662 + }, + { + "epoch": 0.55, + "grad_norm": 2.194666072449123, + "learning_rate": 4.142312605193407e-06, + "loss": 0.5934, + "step": 663 + }, + { + "epoch": 0.55, + "grad_norm": 1.988265407508224, + "learning_rate": 4.13984594070192e-06, + "loss": 0.5539, + "step": 664 + }, + { + "epoch": 0.55, + "grad_norm": 1.7594955740187146, + "learning_rate": 4.137376471056317e-06, + "loss": 0.5324, + "step": 665 + }, + { + "epoch": 0.55, + "grad_norm": 1.9342530277100989, + "learning_rate": 4.1349042004809224e-06, + "loss": 0.5902, + "step": 666 + }, + { + "epoch": 0.55, + "grad_norm": 1.9757082453588417, + "learning_rate": 4.132429133204856e-06, + "loss": 0.5874, + "step": 667 + }, + { + "epoch": 0.55, + "grad_norm": 1.7792467343474774, + "learning_rate": 4.129951273462016e-06, + "loss": 0.5516, + "step": 668 + }, + { + "epoch": 0.55, + "grad_norm": 1.9010392264817964, + "learning_rate": 4.127470625491082e-06, + "loss": 0.5793, + "step": 669 + }, + { + "epoch": 0.56, + "grad_norm": 2.054505290884914, + "learning_rate": 4.1249871935355e-06, + "loss": 0.5718, + "step": 670 + }, + { + "epoch": 0.56, + "grad_norm": 1.8010036617727825, + "learning_rate": 4.1225009818434805e-06, + "loss": 0.5698, + "step": 671 + }, + { + "epoch": 0.56, + "grad_norm": 1.975020822034628, + "learning_rate": 4.120011994667988e-06, + "loss": 0.5739, + "step": 672 + }, + { + "epoch": 0.56, + "grad_norm": 1.9801075045379748, + "learning_rate": 4.117520236266734e-06, + "loss": 0.5589, + "step": 673 + }, + { + "epoch": 0.56, + "grad_norm": 1.7773808874926829, + "learning_rate": 4.115025710902173e-06, + "loss": 0.5276, + "step": 674 + }, + { + "epoch": 0.56, + "grad_norm": 1.890298398205481, + "learning_rate": 4.112528422841491e-06, + "loss": 0.4914, + "step": 675 + }, + { + "epoch": 0.56, + "grad_norm": 1.9087570296379215, + "learning_rate": 4.110028376356599e-06, + "loss": 0.5412, + "step": 676 + }, + { + "epoch": 0.56, + "grad_norm": 1.8908271691889404, + "learning_rate": 4.1075255757241295e-06, + "loss": 0.5618, + "step": 677 + }, + { + "epoch": 0.56, + "grad_norm": 2.024312170169272, + "learning_rate": 4.105020025225423e-06, + "loss": 0.5618, + "step": 678 + }, + { + "epoch": 0.56, + "grad_norm": 1.8072403207581518, + "learning_rate": 4.102511729146528e-06, + "loss": 0.5744, + "step": 679 + }, + { + "epoch": 0.56, + "grad_norm": 1.7750572145097157, + "learning_rate": 4.100000691778185e-06, + "loss": 0.5716, + "step": 680 + }, + { + "epoch": 0.56, + "grad_norm": 1.8778337896632162, + "learning_rate": 4.097486917415827e-06, + "loss": 0.5683, + "step": 681 + }, + { + "epoch": 0.57, + "grad_norm": 1.9710167098273688, + "learning_rate": 4.094970410359568e-06, + "loss": 0.5273, + "step": 682 + }, + { + "epoch": 0.57, + "grad_norm": 1.9136975523972874, + "learning_rate": 4.092451174914196e-06, + "loss": 0.5239, + "step": 683 + }, + { + "epoch": 0.57, + "grad_norm": 1.929344793900944, + "learning_rate": 4.089929215389167e-06, + "loss": 0.5388, + "step": 684 + }, + { + "epoch": 0.57, + "grad_norm": 1.7211535229712278, + "learning_rate": 4.087404536098597e-06, + "loss": 0.5068, + "step": 685 + }, + { + "epoch": 0.57, + "grad_norm": 1.8739637749458882, + "learning_rate": 4.084877141361254e-06, + "loss": 0.5537, + "step": 686 + }, + { + "epoch": 0.57, + "grad_norm": 1.9268469960932768, + "learning_rate": 4.082347035500553e-06, + "loss": 0.5875, + "step": 687 + }, + { + "epoch": 0.57, + "grad_norm": 1.896542320004603, + "learning_rate": 4.079814222844541e-06, + "loss": 0.5314, + "step": 688 + }, + { + "epoch": 0.57, + "grad_norm": 1.723925126440519, + "learning_rate": 4.077278707725904e-06, + "loss": 0.5009, + "step": 689 + }, + { + "epoch": 0.57, + "grad_norm": 1.8345210205201996, + "learning_rate": 4.074740494481942e-06, + "loss": 0.5544, + "step": 690 + }, + { + "epoch": 0.57, + "grad_norm": 1.766819080519227, + "learning_rate": 4.072199587454578e-06, + "loss": 0.5393, + "step": 691 + }, + { + "epoch": 0.57, + "grad_norm": 1.9577975399484282, + "learning_rate": 4.069655990990337e-06, + "loss": 0.5357, + "step": 692 + }, + { + "epoch": 0.57, + "grad_norm": 1.8254761359015224, + "learning_rate": 4.06710970944035e-06, + "loss": 0.5797, + "step": 693 + }, + { + "epoch": 0.58, + "grad_norm": 2.1203973374999214, + "learning_rate": 4.064560747160337e-06, + "loss": 0.5811, + "step": 694 + }, + { + "epoch": 0.58, + "grad_norm": 1.9066221824053846, + "learning_rate": 4.062009108510605e-06, + "loss": 0.5014, + "step": 695 + }, + { + "epoch": 0.58, + "grad_norm": 1.951489716071849, + "learning_rate": 4.059454797856039e-06, + "loss": 0.529, + "step": 696 + }, + { + "epoch": 0.58, + "grad_norm": 1.8402907113209426, + "learning_rate": 4.056897819566096e-06, + "loss": 0.4942, + "step": 697 + }, + { + "epoch": 0.58, + "grad_norm": 2.0368715640768498, + "learning_rate": 4.0543381780147965e-06, + "loss": 0.5245, + "step": 698 + }, + { + "epoch": 0.58, + "grad_norm": 1.8154462049772704, + "learning_rate": 4.0517758775807135e-06, + "loss": 0.4979, + "step": 699 + }, + { + "epoch": 0.58, + "grad_norm": 1.890388895335948, + "learning_rate": 4.049210922646973e-06, + "loss": 0.5212, + "step": 700 + }, + { + "epoch": 0.58, + "grad_norm": 2.0215900504030166, + "learning_rate": 4.046643317601237e-06, + "loss": 0.5384, + "step": 701 + }, + { + "epoch": 0.58, + "grad_norm": 1.816997259900234, + "learning_rate": 4.0440730668357076e-06, + "loss": 0.492, + "step": 702 + }, + { + "epoch": 0.58, + "grad_norm": 1.968633766153865, + "learning_rate": 4.0415001747471036e-06, + "loss": 0.5917, + "step": 703 + }, + { + "epoch": 0.58, + "grad_norm": 1.8313487810801756, + "learning_rate": 4.0389246457366696e-06, + "loss": 0.5561, + "step": 704 + }, + { + "epoch": 0.58, + "grad_norm": 1.7954421155528784, + "learning_rate": 4.036346484210159e-06, + "loss": 0.5383, + "step": 705 + }, + { + "epoch": 0.59, + "grad_norm": 1.8517101217315919, + "learning_rate": 4.033765694577826e-06, + "loss": 0.5368, + "step": 706 + }, + { + "epoch": 0.59, + "grad_norm": 1.8888441616203875, + "learning_rate": 4.031182281254423e-06, + "loss": 0.5895, + "step": 707 + }, + { + "epoch": 0.59, + "grad_norm": 1.8131436351862782, + "learning_rate": 4.028596248659191e-06, + "loss": 0.5346, + "step": 708 + }, + { + "epoch": 0.59, + "grad_norm": 1.8803113487311214, + "learning_rate": 4.0260076012158486e-06, + "loss": 0.4987, + "step": 709 + }, + { + "epoch": 0.59, + "grad_norm": 1.8989122650791335, + "learning_rate": 4.023416343352589e-06, + "loss": 0.5007, + "step": 710 + }, + { + "epoch": 0.59, + "grad_norm": 1.9466291969735336, + "learning_rate": 4.020822479502074e-06, + "loss": 0.5868, + "step": 711 + }, + { + "epoch": 0.59, + "grad_norm": 1.869533367998661, + "learning_rate": 4.018226014101418e-06, + "loss": 0.5995, + "step": 712 + }, + { + "epoch": 0.59, + "grad_norm": 1.93738608926368, + "learning_rate": 4.015626951592187e-06, + "loss": 0.5625, + "step": 713 + }, + { + "epoch": 0.59, + "grad_norm": 1.8485080870897803, + "learning_rate": 4.013025296420394e-06, + "loss": 0.5585, + "step": 714 + }, + { + "epoch": 0.59, + "grad_norm": 1.8099669115387913, + "learning_rate": 4.010421053036481e-06, + "loss": 0.5384, + "step": 715 + }, + { + "epoch": 0.59, + "grad_norm": 1.8810123612010912, + "learning_rate": 4.007814225895321e-06, + "loss": 0.5589, + "step": 716 + }, + { + "epoch": 0.59, + "grad_norm": 1.8692823610937885, + "learning_rate": 4.005204819456205e-06, + "loss": 0.5474, + "step": 717 + }, + { + "epoch": 0.6, + "grad_norm": 1.8120887102918588, + "learning_rate": 4.00259283818284e-06, + "loss": 0.5138, + "step": 718 + }, + { + "epoch": 0.6, + "grad_norm": 1.7933926935301234, + "learning_rate": 3.999978286543331e-06, + "loss": 0.5235, + "step": 719 + }, + { + "epoch": 0.6, + "grad_norm": 1.8382360731306235, + "learning_rate": 3.997361169010187e-06, + "loss": 0.5846, + "step": 720 + }, + { + "epoch": 0.6, + "grad_norm": 1.993925306673069, + "learning_rate": 3.994741490060301e-06, + "loss": 0.5561, + "step": 721 + }, + { + "epoch": 0.6, + "grad_norm": 1.900088669959918, + "learning_rate": 3.9921192541749505e-06, + "loss": 0.5215, + "step": 722 + }, + { + "epoch": 0.6, + "grad_norm": 1.9250072769385074, + "learning_rate": 3.989494465839785e-06, + "loss": 0.54, + "step": 723 + }, + { + "epoch": 0.6, + "grad_norm": 1.7928905908766457, + "learning_rate": 3.986867129544822e-06, + "loss": 0.6066, + "step": 724 + }, + { + "epoch": 0.6, + "grad_norm": 1.9474900039545116, + "learning_rate": 3.984237249784437e-06, + "loss": 0.5173, + "step": 725 + }, + { + "epoch": 0.6, + "grad_norm": 1.9004077336349998, + "learning_rate": 3.981604831057357e-06, + "loss": 0.5409, + "step": 726 + }, + { + "epoch": 0.6, + "grad_norm": 1.7573843693188624, + "learning_rate": 3.97896987786665e-06, + "loss": 0.5239, + "step": 727 + }, + { + "epoch": 0.6, + "grad_norm": 1.899283660379949, + "learning_rate": 3.976332394719721e-06, + "loss": 0.4977, + "step": 728 + }, + { + "epoch": 0.6, + "grad_norm": 1.8353476568345033, + "learning_rate": 3.973692386128304e-06, + "loss": 0.5834, + "step": 729 + }, + { + "epoch": 0.61, + "grad_norm": 2.032325534167748, + "learning_rate": 3.971049856608451e-06, + "loss": 0.5343, + "step": 730 + }, + { + "epoch": 0.61, + "grad_norm": 1.8161347764383835, + "learning_rate": 3.9684048106805286e-06, + "loss": 0.585, + "step": 731 + }, + { + "epoch": 0.61, + "grad_norm": 1.836376388525165, + "learning_rate": 3.965757252869204e-06, + "loss": 0.5978, + "step": 732 + }, + { + "epoch": 0.61, + "grad_norm": 1.889118862096067, + "learning_rate": 3.963107187703446e-06, + "loss": 0.5393, + "step": 733 + }, + { + "epoch": 0.61, + "grad_norm": 1.7772829607776217, + "learning_rate": 3.96045461971651e-06, + "loss": 0.5164, + "step": 734 + }, + { + "epoch": 0.61, + "grad_norm": 1.7980410807492582, + "learning_rate": 3.957799553445932e-06, + "loss": 0.5455, + "step": 735 + }, + { + "epoch": 0.61, + "grad_norm": 1.907936099702467, + "learning_rate": 3.955141993433526e-06, + "loss": 0.532, + "step": 736 + }, + { + "epoch": 0.61, + "grad_norm": 1.8668064740862462, + "learning_rate": 3.9524819442253645e-06, + "loss": 0.5578, + "step": 737 + }, + { + "epoch": 0.61, + "grad_norm": 1.838952740378055, + "learning_rate": 3.949819410371785e-06, + "loss": 0.5784, + "step": 738 + }, + { + "epoch": 0.61, + "grad_norm": 1.9595767898211005, + "learning_rate": 3.947154396427373e-06, + "loss": 0.5213, + "step": 739 + }, + { + "epoch": 0.61, + "grad_norm": 1.9422968944070973, + "learning_rate": 3.944486906950954e-06, + "loss": 0.5709, + "step": 740 + }, + { + "epoch": 0.61, + "grad_norm": 1.760556693040696, + "learning_rate": 3.941816946505592e-06, + "loss": 0.5564, + "step": 741 + }, + { + "epoch": 0.62, + "grad_norm": 1.8054841879427592, + "learning_rate": 3.939144519658575e-06, + "loss": 0.5435, + "step": 742 + }, + { + "epoch": 0.62, + "grad_norm": 2.1072923992538, + "learning_rate": 3.936469630981412e-06, + "loss": 0.5622, + "step": 743 + }, + { + "epoch": 0.62, + "grad_norm": 1.711687978027928, + "learning_rate": 3.933792285049821e-06, + "loss": 0.5554, + "step": 744 + }, + { + "epoch": 0.62, + "grad_norm": 1.8166543944942228, + "learning_rate": 3.931112486443727e-06, + "loss": 0.5079, + "step": 745 + }, + { + "epoch": 0.62, + "grad_norm": 1.7923405334139695, + "learning_rate": 3.928430239747246e-06, + "loss": 0.5692, + "step": 746 + }, + { + "epoch": 0.62, + "grad_norm": 1.9611773239667012, + "learning_rate": 3.925745549548687e-06, + "loss": 0.5092, + "step": 747 + }, + { + "epoch": 0.62, + "grad_norm": 1.8440088039871827, + "learning_rate": 3.923058420440534e-06, + "loss": 0.5369, + "step": 748 + }, + { + "epoch": 0.62, + "grad_norm": 1.9272316571307881, + "learning_rate": 3.920368857019447e-06, + "loss": 0.5798, + "step": 749 + }, + { + "epoch": 0.62, + "grad_norm": 1.8248503445199376, + "learning_rate": 3.917676863886246e-06, + "loss": 0.5479, + "step": 750 + }, + { + "epoch": 0.62, + "grad_norm": 1.9200626612083824, + "learning_rate": 3.914982445645912e-06, + "loss": 0.549, + "step": 751 + }, + { + "epoch": 0.62, + "grad_norm": 1.8585556832275227, + "learning_rate": 3.91228560690757e-06, + "loss": 0.5283, + "step": 752 + }, + { + "epoch": 0.62, + "grad_norm": 1.819239895382093, + "learning_rate": 3.90958635228449e-06, + "loss": 0.535, + "step": 753 + }, + { + "epoch": 0.63, + "grad_norm": 1.7810389942543545, + "learning_rate": 3.90688468639407e-06, + "loss": 0.5125, + "step": 754 + }, + { + "epoch": 0.63, + "grad_norm": 1.9614453700373935, + "learning_rate": 3.904180613857837e-06, + "loss": 0.5406, + "step": 755 + }, + { + "epoch": 0.63, + "grad_norm": 1.805104940263808, + "learning_rate": 3.901474139301433e-06, + "loss": 0.5794, + "step": 756 + }, + { + "epoch": 0.63, + "grad_norm": 1.78756289235025, + "learning_rate": 3.898765267354607e-06, + "loss": 0.569, + "step": 757 + }, + { + "epoch": 0.63, + "grad_norm": 1.912300438003516, + "learning_rate": 3.896054002651213e-06, + "loss": 0.5565, + "step": 758 + }, + { + "epoch": 0.63, + "grad_norm": 1.8148356694353722, + "learning_rate": 3.893340349829195e-06, + "loss": 0.5471, + "step": 759 + }, + { + "epoch": 0.63, + "grad_norm": 1.6836223387492706, + "learning_rate": 3.890624313530583e-06, + "loss": 0.5145, + "step": 760 + }, + { + "epoch": 0.63, + "grad_norm": 1.8389298216964765, + "learning_rate": 3.887905898401485e-06, + "loss": 0.5441, + "step": 761 + }, + { + "epoch": 0.63, + "grad_norm": 1.7845754057436856, + "learning_rate": 3.885185109092078e-06, + "loss": 0.5478, + "step": 762 + }, + { + "epoch": 0.63, + "grad_norm": 1.77076035925993, + "learning_rate": 3.882461950256598e-06, + "loss": 0.5497, + "step": 763 + }, + { + "epoch": 0.63, + "grad_norm": 1.8011284465286703, + "learning_rate": 3.87973642655334e-06, + "loss": 0.5039, + "step": 764 + }, + { + "epoch": 0.63, + "grad_norm": 1.7400129481667248, + "learning_rate": 3.877008542644637e-06, + "loss": 0.5243, + "step": 765 + }, + { + "epoch": 0.64, + "grad_norm": 1.9899565111682327, + "learning_rate": 3.874278303196866e-06, + "loss": 0.5767, + "step": 766 + }, + { + "epoch": 0.64, + "grad_norm": 1.8345576263874734, + "learning_rate": 3.871545712880429e-06, + "loss": 0.5262, + "step": 767 + }, + { + "epoch": 0.64, + "grad_norm": 1.8375211207672395, + "learning_rate": 3.8688107763697505e-06, + "loss": 0.5467, + "step": 768 + }, + { + "epoch": 0.64, + "grad_norm": 1.8068462280574835, + "learning_rate": 3.8660734983432715e-06, + "loss": 0.5256, + "step": 769 + }, + { + "epoch": 0.64, + "grad_norm": 1.7823522202158735, + "learning_rate": 3.863333883483433e-06, + "loss": 0.5419, + "step": 770 + }, + { + "epoch": 0.64, + "grad_norm": 1.8881514180214427, + "learning_rate": 3.86059193647668e-06, + "loss": 0.541, + "step": 771 + }, + { + "epoch": 0.64, + "grad_norm": 1.8311064595650786, + "learning_rate": 3.85784766201344e-06, + "loss": 0.5455, + "step": 772 + }, + { + "epoch": 0.64, + "grad_norm": 1.9833459774866717, + "learning_rate": 3.855101064788126e-06, + "loss": 0.5723, + "step": 773 + }, + { + "epoch": 0.64, + "grad_norm": 1.7968096633022903, + "learning_rate": 3.852352149499125e-06, + "loss": 0.5153, + "step": 774 + }, + { + "epoch": 0.64, + "grad_norm": 1.775423895652992, + "learning_rate": 3.849600920848787e-06, + "loss": 0.5134, + "step": 775 + }, + { + "epoch": 0.64, + "grad_norm": 1.7262892998825556, + "learning_rate": 3.84684738354342e-06, + "loss": 0.5287, + "step": 776 + }, + { + "epoch": 0.64, + "grad_norm": 1.7866135638778051, + "learning_rate": 3.84409154229328e-06, + "loss": 0.57, + "step": 777 + }, + { + "epoch": 0.64, + "grad_norm": 1.787377916112687, + "learning_rate": 3.841333401812569e-06, + "loss": 0.5312, + "step": 778 + }, + { + "epoch": 0.65, + "grad_norm": 1.684801862246949, + "learning_rate": 3.838572966819416e-06, + "loss": 0.5822, + "step": 779 + }, + { + "epoch": 0.65, + "grad_norm": 1.79074773131748, + "learning_rate": 3.835810242035879e-06, + "loss": 0.5651, + "step": 780 + }, + { + "epoch": 0.65, + "grad_norm": 1.9234904827178134, + "learning_rate": 3.8330452321879305e-06, + "loss": 0.5527, + "step": 781 + }, + { + "epoch": 0.65, + "grad_norm": 2.1733402579018186, + "learning_rate": 3.830277942005455e-06, + "loss": 0.5545, + "step": 782 + }, + { + "epoch": 0.65, + "grad_norm": 2.112229504682016, + "learning_rate": 3.827508376222233e-06, + "loss": 0.5766, + "step": 783 + }, + { + "epoch": 0.65, + "grad_norm": 2.087174122744587, + "learning_rate": 3.824736539575944e-06, + "loss": 0.549, + "step": 784 + }, + { + "epoch": 0.65, + "grad_norm": 1.9570382810890106, + "learning_rate": 3.821962436808145e-06, + "loss": 0.4984, + "step": 785 + }, + { + "epoch": 0.65, + "grad_norm": 1.94720853153738, + "learning_rate": 3.819186072664277e-06, + "loss": 0.5303, + "step": 786 + }, + { + "epoch": 0.65, + "grad_norm": 2.21095404069362, + "learning_rate": 3.816407451893643e-06, + "loss": 0.5674, + "step": 787 + }, + { + "epoch": 0.65, + "grad_norm": 1.7284336698899117, + "learning_rate": 3.8136265792494094e-06, + "loss": 0.5952, + "step": 788 + }, + { + "epoch": 0.65, + "grad_norm": 1.940869697529687, + "learning_rate": 3.8108434594885934e-06, + "loss": 0.5198, + "step": 789 + }, + { + "epoch": 0.65, + "grad_norm": 1.9282749931884566, + "learning_rate": 3.808058097372057e-06, + "loss": 0.5499, + "step": 790 + }, + { + "epoch": 0.66, + "grad_norm": 2.0180195532646983, + "learning_rate": 3.8052704976644984e-06, + "loss": 0.5117, + "step": 791 + }, + { + "epoch": 0.66, + "grad_norm": 1.8303561179366206, + "learning_rate": 3.8024806651344424e-06, + "loss": 0.5034, + "step": 792 + }, + { + "epoch": 0.66, + "grad_norm": 2.0584295539484754, + "learning_rate": 3.7996886045542335e-06, + "loss": 0.5391, + "step": 793 + }, + { + "epoch": 0.66, + "grad_norm": 1.7736893833047733, + "learning_rate": 3.7968943207000284e-06, + "loss": 0.5378, + "step": 794 + }, + { + "epoch": 0.66, + "grad_norm": 1.7840353008162277, + "learning_rate": 3.794097818351786e-06, + "loss": 0.5091, + "step": 795 + }, + { + "epoch": 0.66, + "grad_norm": 2.0949100717616225, + "learning_rate": 3.791299102293261e-06, + "loss": 0.5731, + "step": 796 + }, + { + "epoch": 0.66, + "grad_norm": 2.048353193294094, + "learning_rate": 3.7884981773119943e-06, + "loss": 0.5576, + "step": 797 + }, + { + "epoch": 0.66, + "grad_norm": 1.9990070284918733, + "learning_rate": 3.7856950481993054e-06, + "loss": 0.5297, + "step": 798 + }, + { + "epoch": 0.66, + "grad_norm": 1.859560152641746, + "learning_rate": 3.7828897197502856e-06, + "loss": 0.5131, + "step": 799 + }, + { + "epoch": 0.66, + "grad_norm": 2.0054802770873916, + "learning_rate": 3.780082196763785e-06, + "loss": 0.5428, + "step": 800 + }, + { + "epoch": 0.66, + "grad_norm": 1.8985367093585213, + "learning_rate": 3.7772724840424126e-06, + "loss": 0.5206, + "step": 801 + }, + { + "epoch": 0.66, + "grad_norm": 1.9964704653764362, + "learning_rate": 3.774460586392519e-06, + "loss": 0.5929, + "step": 802 + }, + { + "epoch": 0.67, + "grad_norm": 1.7572936836574113, + "learning_rate": 3.771646508624194e-06, + "loss": 0.5428, + "step": 803 + }, + { + "epoch": 0.67, + "grad_norm": 1.9623695483620975, + "learning_rate": 3.768830255551258e-06, + "loss": 0.5685, + "step": 804 + }, + { + "epoch": 0.67, + "grad_norm": 1.9663290616402378, + "learning_rate": 3.76601183199125e-06, + "loss": 0.5351, + "step": 805 + }, + { + "epoch": 0.67, + "grad_norm": 1.7876590847889615, + "learning_rate": 3.763191242765424e-06, + "loss": 0.567, + "step": 806 + }, + { + "epoch": 0.67, + "grad_norm": 1.8500820456277005, + "learning_rate": 3.7603684926987383e-06, + "loss": 0.523, + "step": 807 + }, + { + "epoch": 0.67, + "grad_norm": 2.041973125533567, + "learning_rate": 3.757543586619845e-06, + "loss": 0.5531, + "step": 808 + }, + { + "epoch": 0.67, + "grad_norm": 1.7440376746222928, + "learning_rate": 3.754716529361089e-06, + "loss": 0.4913, + "step": 809 + }, + { + "epoch": 0.67, + "grad_norm": 1.7910937306897654, + "learning_rate": 3.7518873257584897e-06, + "loss": 0.5128, + "step": 810 + }, + { + "epoch": 0.67, + "grad_norm": 1.9334392608388238, + "learning_rate": 3.7490559806517434e-06, + "loss": 0.5861, + "step": 811 + }, + { + "epoch": 0.67, + "grad_norm": 2.0003597857127673, + "learning_rate": 3.746222498884206e-06, + "loss": 0.5535, + "step": 812 + }, + { + "epoch": 0.67, + "grad_norm": 1.7964615198133413, + "learning_rate": 3.74338688530289e-06, + "loss": 0.5409, + "step": 813 + }, + { + "epoch": 0.67, + "grad_norm": 1.7726488990007383, + "learning_rate": 3.740549144758453e-06, + "loss": 0.5714, + "step": 814 + }, + { + "epoch": 0.68, + "grad_norm": 1.9080323144095523, + "learning_rate": 3.737709282105193e-06, + "loss": 0.5534, + "step": 815 + }, + { + "epoch": 0.68, + "grad_norm": 1.9612361354867969, + "learning_rate": 3.734867302201038e-06, + "loss": 0.5282, + "step": 816 + }, + { + "epoch": 0.68, + "grad_norm": 1.873254058551618, + "learning_rate": 3.7320232099075363e-06, + "loss": 0.5422, + "step": 817 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383882069199007, + "learning_rate": 3.7291770100898508e-06, + "loss": 0.5588, + "step": 818 + }, + { + "epoch": 0.68, + "grad_norm": 2.0137053963220835, + "learning_rate": 3.726328707616749e-06, + "loss": 0.5895, + "step": 819 + }, + { + "epoch": 0.68, + "grad_norm": 1.8207549211692964, + "learning_rate": 3.7234783073605957e-06, + "loss": 0.5428, + "step": 820 + }, + { + "epoch": 0.68, + "grad_norm": 1.7929761418069659, + "learning_rate": 3.7206258141973445e-06, + "loss": 0.555, + "step": 821 + }, + { + "epoch": 0.68, + "grad_norm": 1.8863691259545465, + "learning_rate": 3.7177712330065285e-06, + "loss": 0.5802, + "step": 822 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383911000943605, + "learning_rate": 3.714914568671252e-06, + "loss": 0.4986, + "step": 823 + }, + { + "epoch": 0.68, + "grad_norm": 2.0032777947804044, + "learning_rate": 3.7120558260781846e-06, + "loss": 0.6456, + "step": 824 + }, + { + "epoch": 0.68, + "grad_norm": 1.733320874844507, + "learning_rate": 3.709195010117551e-06, + "loss": 0.5146, + "step": 825 + }, + { + "epoch": 0.68, + "grad_norm": 1.7411187007421471, + "learning_rate": 3.7063321256831193e-06, + "loss": 0.5297, + "step": 826 + }, + { + "epoch": 0.69, + "grad_norm": 1.8334107493901353, + "learning_rate": 3.7034671776722003e-06, + "loss": 0.545, + "step": 827 + }, + { + "epoch": 0.69, + "grad_norm": 1.931467221651553, + "learning_rate": 3.7006001709856314e-06, + "loss": 0.579, + "step": 828 + }, + { + "epoch": 0.69, + "grad_norm": 1.799522216655623, + "learning_rate": 3.697731110527774e-06, + "loss": 0.5453, + "step": 829 + }, + { + "epoch": 0.69, + "grad_norm": 1.8098119388805842, + "learning_rate": 3.6948600012065016e-06, + "loss": 0.5186, + "step": 830 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419013342395714, + "learning_rate": 3.6919868479331934e-06, + "loss": 0.4833, + "step": 831 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419148322752323, + "learning_rate": 3.6891116556227234e-06, + "loss": 0.5479, + "step": 832 + }, + { + "epoch": 0.69, + "grad_norm": 1.7858200344474908, + "learning_rate": 3.6862344291934545e-06, + "loss": 0.5264, + "step": 833 + }, + { + "epoch": 0.69, + "grad_norm": 1.8057437623830686, + "learning_rate": 3.6833551735672293e-06, + "loss": 0.5208, + "step": 834 + }, + { + "epoch": 0.69, + "grad_norm": 1.8570584000334132, + "learning_rate": 3.6804738936693617e-06, + "loss": 0.5652, + "step": 835 + }, + { + "epoch": 0.69, + "grad_norm": 1.7961732805960369, + "learning_rate": 3.677590594428629e-06, + "loss": 0.5693, + "step": 836 + }, + { + "epoch": 0.69, + "grad_norm": 1.954108513879844, + "learning_rate": 3.6747052807772614e-06, + "loss": 0.5673, + "step": 837 + }, + { + "epoch": 0.69, + "grad_norm": 1.834152772161213, + "learning_rate": 3.671817957650936e-06, + "loss": 0.5118, + "step": 838 + }, + { + "epoch": 0.7, + "grad_norm": 1.8035026424969205, + "learning_rate": 3.6689286299887663e-06, + "loss": 0.5778, + "step": 839 + }, + { + "epoch": 0.7, + "grad_norm": 1.7862771700309947, + "learning_rate": 3.666037302733295e-06, + "loss": 0.5575, + "step": 840 + }, + { + "epoch": 0.7, + "grad_norm": 1.7398650592861555, + "learning_rate": 3.6631439808304874e-06, + "loss": 0.5323, + "step": 841 + }, + { + "epoch": 0.7, + "grad_norm": 1.7082885736006344, + "learning_rate": 3.6602486692297183e-06, + "loss": 0.543, + "step": 842 + }, + { + "epoch": 0.7, + "grad_norm": 1.8242434568233548, + "learning_rate": 3.6573513728837685e-06, + "loss": 0.5579, + "step": 843 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305967806472925, + "learning_rate": 3.6544520967488108e-06, + "loss": 0.5425, + "step": 844 + }, + { + "epoch": 0.7, + "grad_norm": 1.7126995402462595, + "learning_rate": 3.651550845784407e-06, + "loss": 0.5399, + "step": 845 + }, + { + "epoch": 0.7, + "grad_norm": 1.992190051239983, + "learning_rate": 3.648647624953496e-06, + "loss": 0.5951, + "step": 846 + }, + { + "epoch": 0.7, + "grad_norm": 1.9362402903409848, + "learning_rate": 3.6457424392223885e-06, + "loss": 0.5427, + "step": 847 + }, + { + "epoch": 0.7, + "grad_norm": 1.7390586845081806, + "learning_rate": 3.642835293560754e-06, + "loss": 0.5269, + "step": 848 + }, + { + "epoch": 0.7, + "grad_norm": 1.8601747321693383, + "learning_rate": 3.639926192941615e-06, + "loss": 0.5246, + "step": 849 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305054240762129, + "learning_rate": 3.6370151423413396e-06, + "loss": 0.562, + "step": 850 + }, + { + "epoch": 0.71, + "grad_norm": 1.8361711553327809, + "learning_rate": 3.6341021467396296e-06, + "loss": 0.5066, + "step": 851 + }, + { + "epoch": 0.71, + "grad_norm": 1.9202617492772214, + "learning_rate": 3.6311872111195163e-06, + "loss": 0.5755, + "step": 852 + }, + { + "epoch": 0.71, + "grad_norm": 1.9056266366653432, + "learning_rate": 3.628270340467348e-06, + "loss": 0.5193, + "step": 853 + }, + { + "epoch": 0.71, + "grad_norm": 1.9700971504271882, + "learning_rate": 3.625351539772783e-06, + "loss": 0.5499, + "step": 854 + }, + { + "epoch": 0.71, + "grad_norm": 1.7142305580780086, + "learning_rate": 3.6224308140287818e-06, + "loss": 0.5597, + "step": 855 + }, + { + "epoch": 0.71, + "grad_norm": 1.7897876492593174, + "learning_rate": 3.6195081682315972e-06, + "loss": 0.5347, + "step": 856 + }, + { + "epoch": 0.71, + "grad_norm": 2.191923699092432, + "learning_rate": 3.616583607380769e-06, + "loss": 0.5251, + "step": 857 + }, + { + "epoch": 0.71, + "grad_norm": 1.8582876176666503, + "learning_rate": 3.61365713647911e-06, + "loss": 0.5067, + "step": 858 + }, + { + "epoch": 0.71, + "grad_norm": 1.991617360171558, + "learning_rate": 3.610728760532701e-06, + "loss": 0.6464, + "step": 859 + }, + { + "epoch": 0.71, + "grad_norm": 1.892621069660817, + "learning_rate": 3.607798484550881e-06, + "loss": 0.5145, + "step": 860 + }, + { + "epoch": 0.71, + "grad_norm": 1.7592963181570629, + "learning_rate": 3.6048663135462423e-06, + "loss": 0.5297, + "step": 861 + }, + { + "epoch": 0.71, + "grad_norm": 2.020192040751123, + "learning_rate": 3.6019322525346157e-06, + "loss": 0.5709, + "step": 862 + }, + { + "epoch": 0.72, + "grad_norm": 1.8575959680616767, + "learning_rate": 3.598996306535067e-06, + "loss": 0.5946, + "step": 863 + }, + { + "epoch": 0.72, + "grad_norm": 1.9638758131071599, + "learning_rate": 3.5960584805698845e-06, + "loss": 0.4833, + "step": 864 + }, + { + "epoch": 0.72, + "grad_norm": 1.7517341191956926, + "learning_rate": 3.593118779664574e-06, + "loss": 0.5439, + "step": 865 + }, + { + "epoch": 0.72, + "grad_norm": 1.7637144330636925, + "learning_rate": 3.590177208847848e-06, + "loss": 0.4898, + "step": 866 + }, + { + "epoch": 0.72, + "grad_norm": 2.107899096934758, + "learning_rate": 3.5872337731516186e-06, + "loss": 0.5332, + "step": 867 + }, + { + "epoch": 0.72, + "grad_norm": 2.016493645108941, + "learning_rate": 3.5842884776109875e-06, + "loss": 0.5313, + "step": 868 + }, + { + "epoch": 0.72, + "grad_norm": 1.8758602544873038, + "learning_rate": 3.581341327264236e-06, + "loss": 0.554, + "step": 869 + }, + { + "epoch": 0.72, + "grad_norm": 1.8566881639083022, + "learning_rate": 3.5783923271528222e-06, + "loss": 0.5322, + "step": 870 + }, + { + "epoch": 0.72, + "grad_norm": 1.9151838907738468, + "learning_rate": 3.5754414823213647e-06, + "loss": 0.5306, + "step": 871 + }, + { + "epoch": 0.72, + "grad_norm": 1.7893407766785276, + "learning_rate": 3.572488797817639e-06, + "loss": 0.5226, + "step": 872 + }, + { + "epoch": 0.72, + "grad_norm": 1.908122661974681, + "learning_rate": 3.569534278692569e-06, + "loss": 0.5132, + "step": 873 + }, + { + "epoch": 0.72, + "grad_norm": 1.9052513037253582, + "learning_rate": 3.5665779300002144e-06, + "loss": 0.513, + "step": 874 + }, + { + "epoch": 0.73, + "grad_norm": 1.7876914527016339, + "learning_rate": 3.563619756797767e-06, + "loss": 0.5627, + "step": 875 + }, + { + "epoch": 0.73, + "grad_norm": 1.9607045801516068, + "learning_rate": 3.5606597641455387e-06, + "loss": 0.4986, + "step": 876 + }, + { + "epoch": 0.73, + "grad_norm": 1.701462749441997, + "learning_rate": 3.5576979571069527e-06, + "loss": 0.5306, + "step": 877 + }, + { + "epoch": 0.73, + "grad_norm": 1.8413701238351416, + "learning_rate": 3.554734340748538e-06, + "loss": 0.5602, + "step": 878 + }, + { + "epoch": 0.73, + "grad_norm": 1.8762306249541667, + "learning_rate": 3.5517689201399162e-06, + "loss": 0.5663, + "step": 879 + }, + { + "epoch": 0.73, + "grad_norm": 1.833164968453507, + "learning_rate": 3.5488017003537977e-06, + "loss": 0.5264, + "step": 880 + }, + { + "epoch": 0.73, + "grad_norm": 1.766302763247428, + "learning_rate": 3.5458326864659687e-06, + "loss": 0.5498, + "step": 881 + }, + { + "epoch": 0.73, + "grad_norm": 1.821883208129187, + "learning_rate": 3.5428618835552867e-06, + "loss": 0.5468, + "step": 882 + }, + { + "epoch": 0.73, + "grad_norm": 1.7773758034614335, + "learning_rate": 3.5398892967036674e-06, + "loss": 0.505, + "step": 883 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248820711070537, + "learning_rate": 3.5369149309960783e-06, + "loss": 0.5679, + "step": 884 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248114104788378, + "learning_rate": 3.5339387915205305e-06, + "loss": 0.5351, + "step": 885 + }, + { + "epoch": 0.73, + "grad_norm": 2.00472132505421, + "learning_rate": 3.53096088336807e-06, + "loss": 0.5637, + "step": 886 + }, + { + "epoch": 0.74, + "grad_norm": 2.0594957277906656, + "learning_rate": 3.5279812116327667e-06, + "loss": 0.567, + "step": 887 + }, + { + "epoch": 0.74, + "grad_norm": 1.916227169502353, + "learning_rate": 3.5249997814117098e-06, + "loss": 0.5733, + "step": 888 + }, + { + "epoch": 0.74, + "grad_norm": 1.7595020268824906, + "learning_rate": 3.5220165978049937e-06, + "loss": 0.5512, + "step": 889 + }, + { + "epoch": 0.74, + "grad_norm": 1.8259487385184114, + "learning_rate": 3.5190316659157126e-06, + "loss": 0.5332, + "step": 890 + }, + { + "epoch": 0.74, + "grad_norm": 1.8216813752485344, + "learning_rate": 3.5160449908499538e-06, + "loss": 0.5718, + "step": 891 + }, + { + "epoch": 0.74, + "grad_norm": 1.8497964997952454, + "learning_rate": 3.5130565777167845e-06, + "loss": 0.5179, + "step": 892 + }, + { + "epoch": 0.74, + "grad_norm": 1.8242356367817554, + "learning_rate": 3.5100664316282464e-06, + "loss": 0.5587, + "step": 893 + }, + { + "epoch": 0.74, + "grad_norm": 1.7793507179190546, + "learning_rate": 3.5070745576993428e-06, + "loss": 0.5924, + "step": 894 + }, + { + "epoch": 0.74, + "grad_norm": 1.920176905610262, + "learning_rate": 3.5040809610480364e-06, + "loss": 0.5579, + "step": 895 + }, + { + "epoch": 0.74, + "grad_norm": 1.954421523744336, + "learning_rate": 3.5010856467952335e-06, + "loss": 0.5496, + "step": 896 + }, + { + "epoch": 0.74, + "grad_norm": 1.7785169911731862, + "learning_rate": 3.4980886200647817e-06, + "loss": 0.5383, + "step": 897 + }, + { + "epoch": 0.74, + "grad_norm": 1.853827977546151, + "learning_rate": 3.4950898859834555e-06, + "loss": 0.5501, + "step": 898 + }, + { + "epoch": 0.75, + "grad_norm": 1.9882198198152168, + "learning_rate": 3.4920894496809515e-06, + "loss": 0.5557, + "step": 899 + }, + { + "epoch": 0.75, + "grad_norm": 1.98090605107646, + "learning_rate": 3.489087316289877e-06, + "loss": 0.5661, + "step": 900 + }, + { + "epoch": 0.75, + "grad_norm": 2.0027723691714785, + "learning_rate": 3.486083490945743e-06, + "loss": 0.4791, + "step": 901 + }, + { + "epoch": 0.75, + "grad_norm": 2.0183911897675015, + "learning_rate": 3.4830779787869555e-06, + "loss": 0.5386, + "step": 902 + }, + { + "epoch": 0.75, + "grad_norm": 1.9385976919386894, + "learning_rate": 3.480070784954805e-06, + "loss": 0.5351, + "step": 903 + }, + { + "epoch": 0.75, + "grad_norm": 1.7612550957325825, + "learning_rate": 3.4770619145934586e-06, + "loss": 0.511, + "step": 904 + }, + { + "epoch": 0.75, + "grad_norm": 1.8677538420589843, + "learning_rate": 3.4740513728499515e-06, + "loss": 0.5942, + "step": 905 + }, + { + "epoch": 0.75, + "grad_norm": 1.9208446249900946, + "learning_rate": 3.4710391648741787e-06, + "loss": 0.5146, + "step": 906 + }, + { + "epoch": 0.75, + "grad_norm": 1.8008673055527855, + "learning_rate": 3.468025295818885e-06, + "loss": 0.5909, + "step": 907 + }, + { + "epoch": 0.75, + "grad_norm": 1.891052390507894, + "learning_rate": 3.465009770839657e-06, + "loss": 0.5527, + "step": 908 + }, + { + "epoch": 0.75, + "grad_norm": 2.0521048489395435, + "learning_rate": 3.4619925950949126e-06, + "loss": 0.5756, + "step": 909 + }, + { + "epoch": 0.75, + "grad_norm": 2.003295441830653, + "learning_rate": 3.4589737737458946e-06, + "loss": 0.5299, + "step": 910 + }, + { + "epoch": 0.76, + "grad_norm": 1.7635851435542724, + "learning_rate": 3.4559533119566612e-06, + "loss": 0.5338, + "step": 911 + }, + { + "epoch": 0.76, + "grad_norm": 1.834326490517508, + "learning_rate": 3.4529312148940763e-06, + "loss": 0.56, + "step": 912 + }, + { + "epoch": 0.76, + "grad_norm": 1.8618427761057224, + "learning_rate": 3.4499074877278016e-06, + "loss": 0.5189, + "step": 913 + }, + { + "epoch": 0.76, + "grad_norm": 2.04459004844406, + "learning_rate": 3.446882135630286e-06, + "loss": 0.5765, + "step": 914 + }, + { + "epoch": 0.76, + "grad_norm": 1.7467595732765806, + "learning_rate": 3.4438551637767604e-06, + "loss": 0.5512, + "step": 915 + }, + { + "epoch": 0.76, + "grad_norm": 1.7952035114217406, + "learning_rate": 3.4408265773452226e-06, + "loss": 0.5348, + "step": 916 + }, + { + "epoch": 0.76, + "grad_norm": 1.8448198186244822, + "learning_rate": 3.4377963815164362e-06, + "loss": 0.5187, + "step": 917 + }, + { + "epoch": 0.76, + "grad_norm": 1.7738820116169103, + "learning_rate": 3.4347645814739156e-06, + "loss": 0.507, + "step": 918 + }, + { + "epoch": 0.76, + "grad_norm": 1.9699054774415494, + "learning_rate": 3.4317311824039216e-06, + "loss": 0.5175, + "step": 919 + }, + { + "epoch": 0.76, + "grad_norm": 1.7482905457169124, + "learning_rate": 3.4286961894954473e-06, + "loss": 0.5188, + "step": 920 + }, + { + "epoch": 0.76, + "grad_norm": 1.8012194296110113, + "learning_rate": 3.425659607940215e-06, + "loss": 0.5465, + "step": 921 + }, + { + "epoch": 0.76, + "grad_norm": 1.7978097428012587, + "learning_rate": 3.422621442932662e-06, + "loss": 0.5257, + "step": 922 + }, + { + "epoch": 0.77, + "grad_norm": 1.8534167116514217, + "learning_rate": 3.419581699669937e-06, + "loss": 0.536, + "step": 923 + }, + { + "epoch": 0.77, + "grad_norm": 1.7733377878036733, + "learning_rate": 3.416540383351888e-06, + "loss": 0.5632, + "step": 924 + }, + { + "epoch": 0.77, + "grad_norm": 1.8124786776539388, + "learning_rate": 3.4134974991810503e-06, + "loss": 0.5471, + "step": 925 + }, + { + "epoch": 0.77, + "grad_norm": 1.8553271859579439, + "learning_rate": 3.4104530523626463e-06, + "loss": 0.538, + "step": 926 + }, + { + "epoch": 0.77, + "grad_norm": 1.8888926038913822, + "learning_rate": 3.4074070481045683e-06, + "loss": 0.4868, + "step": 927 + }, + { + "epoch": 0.77, + "grad_norm": 2.0158609319355505, + "learning_rate": 3.404359491617374e-06, + "loss": 0.5757, + "step": 928 + }, + { + "epoch": 0.77, + "grad_norm": 1.8376639720078027, + "learning_rate": 3.401310388114276e-06, + "loss": 0.5377, + "step": 929 + }, + { + "epoch": 0.77, + "grad_norm": 2.3651883595335232, + "learning_rate": 3.3982597428111336e-06, + "loss": 0.5536, + "step": 930 + }, + { + "epoch": 0.77, + "grad_norm": 1.908409388949023, + "learning_rate": 3.3952075609264423e-06, + "loss": 0.5349, + "step": 931 + }, + { + "epoch": 0.77, + "grad_norm": 1.8261622890952995, + "learning_rate": 3.3921538476813278e-06, + "loss": 0.4991, + "step": 932 + }, + { + "epoch": 0.77, + "grad_norm": 1.924034720876031, + "learning_rate": 3.3890986082995353e-06, + "loss": 0.536, + "step": 933 + }, + { + "epoch": 0.77, + "grad_norm": 1.829615974230478, + "learning_rate": 3.3860418480074188e-06, + "loss": 0.5163, + "step": 934 + }, + { + "epoch": 0.78, + "grad_norm": 1.7812992854973535, + "learning_rate": 3.3829835720339353e-06, + "loss": 0.5412, + "step": 935 + }, + { + "epoch": 0.78, + "grad_norm": 1.8270515542068861, + "learning_rate": 3.3799237856106348e-06, + "loss": 0.5459, + "step": 936 + }, + { + "epoch": 0.78, + "grad_norm": 1.8336967909163833, + "learning_rate": 3.3768624939716506e-06, + "loss": 0.5074, + "step": 937 + }, + { + "epoch": 0.78, + "grad_norm": 1.773892866992307, + "learning_rate": 3.373799702353691e-06, + "loss": 0.5457, + "step": 938 + }, + { + "epoch": 0.78, + "grad_norm": 1.8605607499004266, + "learning_rate": 3.370735415996031e-06, + "loss": 0.5691, + "step": 939 + }, + { + "epoch": 0.78, + "grad_norm": 1.7961529805945686, + "learning_rate": 3.3676696401405007e-06, + "loss": 0.5406, + "step": 940 + }, + { + "epoch": 0.78, + "grad_norm": 1.7406787561376078, + "learning_rate": 3.3646023800314792e-06, + "loss": 0.5297, + "step": 941 + }, + { + "epoch": 0.78, + "grad_norm": 1.9794693468141764, + "learning_rate": 3.361533640915885e-06, + "loss": 0.4765, + "step": 942 + }, + { + "epoch": 0.78, + "grad_norm": 1.820632707720892, + "learning_rate": 3.3584634280431657e-06, + "loss": 0.5395, + "step": 943 + }, + { + "epoch": 0.78, + "grad_norm": 1.8478126164835549, + "learning_rate": 3.3553917466652915e-06, + "loss": 0.5288, + "step": 944 + }, + { + "epoch": 0.78, + "grad_norm": 1.749509825583459, + "learning_rate": 3.352318602036742e-06, + "loss": 0.5343, + "step": 945 + }, + { + "epoch": 0.78, + "grad_norm": 1.8034305951190157, + "learning_rate": 3.3492439994145033e-06, + "loss": 0.5536, + "step": 946 + }, + { + "epoch": 0.79, + "grad_norm": 1.8172591817519397, + "learning_rate": 3.346167944058052e-06, + "loss": 0.5844, + "step": 947 + }, + { + "epoch": 0.79, + "grad_norm": 1.749562414198837, + "learning_rate": 3.3430904412293526e-06, + "loss": 0.4833, + "step": 948 + }, + { + "epoch": 0.79, + "grad_norm": 1.7243742428927225, + "learning_rate": 3.3400114961928444e-06, + "loss": 0.4828, + "step": 949 + }, + { + "epoch": 0.79, + "grad_norm": 1.757242299744874, + "learning_rate": 3.3369311142154337e-06, + "loss": 0.5282, + "step": 950 + }, + { + "epoch": 0.79, + "grad_norm": 2.036302581700697, + "learning_rate": 3.3338493005664853e-06, + "loss": 0.5315, + "step": 951 + }, + { + "epoch": 0.79, + "grad_norm": 1.886299636672335, + "learning_rate": 3.330766060517812e-06, + "loss": 0.5244, + "step": 952 + }, + { + "epoch": 0.79, + "grad_norm": 1.898853787733011, + "learning_rate": 3.3276813993436695e-06, + "loss": 0.5914, + "step": 953 + }, + { + "epoch": 0.79, + "grad_norm": 1.8359472984671243, + "learning_rate": 3.324595322320741e-06, + "loss": 0.5488, + "step": 954 + }, + { + "epoch": 0.79, + "grad_norm": 1.8768955168510497, + "learning_rate": 3.321507834728134e-06, + "loss": 0.5871, + "step": 955 + }, + { + "epoch": 0.79, + "grad_norm": 1.8358033818112791, + "learning_rate": 3.3184189418473674e-06, + "loss": 0.5632, + "step": 956 + }, + { + "epoch": 0.79, + "grad_norm": 1.792562502385941, + "learning_rate": 3.315328648962364e-06, + "loss": 0.4887, + "step": 957 + }, + { + "epoch": 0.79, + "grad_norm": 1.8732702930932368, + "learning_rate": 3.312236961359444e-06, + "loss": 0.5313, + "step": 958 + }, + { + "epoch": 0.8, + "grad_norm": 1.7708047128885986, + "learning_rate": 3.3091438843273115e-06, + "loss": 0.5348, + "step": 959 + }, + { + "epoch": 0.8, + "grad_norm": 1.9094434763935804, + "learning_rate": 3.3060494231570463e-06, + "loss": 0.5027, + "step": 960 + }, + { + "epoch": 0.8, + "grad_norm": 1.87927564418864, + "learning_rate": 3.3029535831420977e-06, + "loss": 0.511, + "step": 961 + }, + { + "epoch": 0.8, + "grad_norm": 1.717365559903535, + "learning_rate": 3.299856369578273e-06, + "loss": 0.5203, + "step": 962 + }, + { + "epoch": 0.8, + "grad_norm": 1.770779257052532, + "learning_rate": 3.2967577877637296e-06, + "loss": 0.5233, + "step": 963 + }, + { + "epoch": 0.8, + "grad_norm": 1.7541392466004568, + "learning_rate": 3.2936578429989653e-06, + "loss": 0.5013, + "step": 964 + }, + { + "epoch": 0.8, + "grad_norm": 1.7840578280891832, + "learning_rate": 3.290556540586809e-06, + "loss": 0.4844, + "step": 965 + }, + { + "epoch": 0.8, + "grad_norm": 1.7184305413001233, + "learning_rate": 3.287453885832413e-06, + "loss": 0.4694, + "step": 966 + }, + { + "epoch": 0.8, + "grad_norm": 1.8671517036325307, + "learning_rate": 3.2843498840432403e-06, + "loss": 0.4652, + "step": 967 + }, + { + "epoch": 0.8, + "grad_norm": 1.9960847871768508, + "learning_rate": 3.2812445405290612e-06, + "loss": 0.5906, + "step": 968 + }, + { + "epoch": 0.8, + "grad_norm": 1.7535227575839891, + "learning_rate": 3.27813786060194e-06, + "loss": 0.5482, + "step": 969 + }, + { + "epoch": 0.8, + "grad_norm": 1.929231862440999, + "learning_rate": 3.2750298495762278e-06, + "loss": 0.5334, + "step": 970 + }, + { + "epoch": 0.8, + "grad_norm": 1.7879676366114814, + "learning_rate": 3.2719205127685505e-06, + "loss": 0.515, + "step": 971 + }, + { + "epoch": 0.81, + "grad_norm": 1.7817120865072218, + "learning_rate": 3.2688098554978053e-06, + "loss": 0.5045, + "step": 972 + }, + { + "epoch": 0.81, + "grad_norm": 1.8725673808714274, + "learning_rate": 3.265697883085145e-06, + "loss": 0.5557, + "step": 973 + }, + { + "epoch": 0.81, + "grad_norm": 1.8554796275037901, + "learning_rate": 3.262584600853973e-06, + "loss": 0.5785, + "step": 974 + }, + { + "epoch": 0.81, + "grad_norm": 1.77078783324655, + "learning_rate": 3.259470014129936e-06, + "loss": 0.524, + "step": 975 + }, + { + "epoch": 0.81, + "grad_norm": 1.820843626030818, + "learning_rate": 3.256354128240907e-06, + "loss": 0.5144, + "step": 976 + }, + { + "epoch": 0.81, + "grad_norm": 1.9330495063889956, + "learning_rate": 3.253236948516987e-06, + "loss": 0.5405, + "step": 977 + }, + { + "epoch": 0.81, + "grad_norm": 1.9113413794485425, + "learning_rate": 3.2501184802904867e-06, + "loss": 0.5212, + "step": 978 + }, + { + "epoch": 0.81, + "grad_norm": 1.799188386703558, + "learning_rate": 3.2469987288959208e-06, + "loss": 0.5148, + "step": 979 + }, + { + "epoch": 0.81, + "grad_norm": 1.8610914183588203, + "learning_rate": 3.2438776996700023e-06, + "loss": 0.5363, + "step": 980 + }, + { + "epoch": 0.81, + "grad_norm": 1.8245263524947073, + "learning_rate": 3.240755397951625e-06, + "loss": 0.5216, + "step": 981 + }, + { + "epoch": 0.81, + "grad_norm": 1.7863270641417597, + "learning_rate": 3.2376318290818643e-06, + "loss": 0.5581, + "step": 982 + }, + { + "epoch": 0.81, + "grad_norm": 1.9266115141469626, + "learning_rate": 3.23450699840396e-06, + "loss": 0.5178, + "step": 983 + }, + { + "epoch": 0.82, + "grad_norm": 1.8044458399187253, + "learning_rate": 3.2313809112633133e-06, + "loss": 0.5252, + "step": 984 + }, + { + "epoch": 0.82, + "grad_norm": 1.8809392949423562, + "learning_rate": 3.2282535730074714e-06, + "loss": 0.486, + "step": 985 + }, + { + "epoch": 0.82, + "grad_norm": 1.9487997548787144, + "learning_rate": 3.2251249889861237e-06, + "loss": 0.5272, + "step": 986 + }, + { + "epoch": 0.82, + "grad_norm": 2.088279538426057, + "learning_rate": 3.2219951645510907e-06, + "loss": 0.5426, + "step": 987 + }, + { + "epoch": 0.82, + "grad_norm": 1.8280370745964312, + "learning_rate": 3.218864105056313e-06, + "loss": 0.5545, + "step": 988 + }, + { + "epoch": 0.82, + "grad_norm": 1.7678201455723743, + "learning_rate": 3.2157318158578473e-06, + "loss": 0.5476, + "step": 989 + }, + { + "epoch": 0.82, + "grad_norm": 1.708170466024094, + "learning_rate": 3.21259830231385e-06, + "loss": 0.5442, + "step": 990 + }, + { + "epoch": 0.82, + "grad_norm": 2.0427224573251483, + "learning_rate": 3.209463569784575e-06, + "loss": 0.5501, + "step": 991 + }, + { + "epoch": 0.82, + "grad_norm": 1.8557413526282036, + "learning_rate": 3.206327623632359e-06, + "loss": 0.5573, + "step": 992 + }, + { + "epoch": 0.82, + "grad_norm": 1.7138810851622357, + "learning_rate": 3.2031904692216153e-06, + "loss": 0.5267, + "step": 993 + }, + { + "epoch": 0.82, + "grad_norm": 1.9034028799031073, + "learning_rate": 3.2000521119188267e-06, + "loss": 0.5605, + "step": 994 + }, + { + "epoch": 0.82, + "grad_norm": 1.994571492675121, + "learning_rate": 3.1969125570925303e-06, + "loss": 0.53, + "step": 995 + }, + { + "epoch": 0.83, + "grad_norm": 1.771581881704634, + "learning_rate": 3.193771810113313e-06, + "loss": 0.6177, + "step": 996 + }, + { + "epoch": 0.83, + "grad_norm": 1.7808220445921694, + "learning_rate": 3.1906298763538005e-06, + "loss": 0.5215, + "step": 997 + }, + { + "epoch": 0.83, + "grad_norm": 1.8069794706642701, + "learning_rate": 3.1874867611886513e-06, + "loss": 0.5444, + "step": 998 + }, + { + "epoch": 0.83, + "grad_norm": 1.7806867210889854, + "learning_rate": 3.1843424699945403e-06, + "loss": 0.5471, + "step": 999 + }, + { + "epoch": 0.83, + "grad_norm": 1.7481554024627886, + "learning_rate": 3.1811970081501576e-06, + "loss": 0.5159, + "step": 1000 + }, + { + "epoch": 0.83, + "grad_norm": 1.8105318680671914, + "learning_rate": 3.1780503810361946e-06, + "loss": 0.4985, + "step": 1001 + }, + { + "epoch": 0.83, + "grad_norm": 1.7033701950072382, + "learning_rate": 3.1749025940353363e-06, + "loss": 0.5594, + "step": 1002 + }, + { + "epoch": 0.83, + "grad_norm": 2.3799847532384515, + "learning_rate": 3.1717536525322512e-06, + "loss": 0.5978, + "step": 1003 + }, + { + "epoch": 0.83, + "grad_norm": 1.7427559432173463, + "learning_rate": 3.1686035619135845e-06, + "loss": 0.5299, + "step": 1004 + }, + { + "epoch": 0.83, + "grad_norm": 1.7454547855925509, + "learning_rate": 3.1654523275679453e-06, + "loss": 0.5439, + "step": 1005 + }, + { + "epoch": 0.83, + "grad_norm": 1.7130931472340127, + "learning_rate": 3.162299954885899e-06, + "loss": 0.5379, + "step": 1006 + }, + { + "epoch": 0.83, + "grad_norm": 1.6940357366272063, + "learning_rate": 3.15914644925996e-06, + "loss": 0.5694, + "step": 1007 + }, + { + "epoch": 0.84, + "grad_norm": 1.8544220651543013, + "learning_rate": 3.1559918160845787e-06, + "loss": 0.5285, + "step": 1008 + }, + { + "epoch": 0.84, + "grad_norm": 1.8481774433371347, + "learning_rate": 3.1528360607561358e-06, + "loss": 0.5384, + "step": 1009 + }, + { + "epoch": 0.84, + "grad_norm": 1.8256828659009958, + "learning_rate": 3.149679188672932e-06, + "loss": 0.4806, + "step": 1010 + }, + { + "epoch": 0.84, + "grad_norm": 1.9380282822721238, + "learning_rate": 3.1465212052351766e-06, + "loss": 0.543, + "step": 1011 + }, + { + "epoch": 0.84, + "grad_norm": 1.985943690469791, + "learning_rate": 3.1433621158449807e-06, + "loss": 0.5549, + "step": 1012 + }, + { + "epoch": 0.84, + "grad_norm": 1.7038398790061953, + "learning_rate": 3.140201925906348e-06, + "loss": 0.4682, + "step": 1013 + }, + { + "epoch": 0.84, + "grad_norm": 1.8748481620529394, + "learning_rate": 3.1370406408251632e-06, + "loss": 0.5046, + "step": 1014 + }, + { + "epoch": 0.84, + "grad_norm": 1.7587036990451181, + "learning_rate": 3.133878266009186e-06, + "loss": 0.5203, + "step": 1015 + }, + { + "epoch": 0.84, + "grad_norm": 1.7503537433041947, + "learning_rate": 3.130714806868041e-06, + "loss": 0.5546, + "step": 1016 + }, + { + "epoch": 0.84, + "grad_norm": 1.7701505667314001, + "learning_rate": 3.127550268813205e-06, + "loss": 0.531, + "step": 1017 + }, + { + "epoch": 0.84, + "grad_norm": 1.771371589393474, + "learning_rate": 3.124384657258001e-06, + "loss": 0.5424, + "step": 1018 + }, + { + "epoch": 0.84, + "grad_norm": 1.8016015279719124, + "learning_rate": 3.1212179776175905e-06, + "loss": 0.5706, + "step": 1019 + }, + { + "epoch": 0.85, + "grad_norm": 1.810944889002695, + "learning_rate": 3.1180502353089598e-06, + "loss": 0.5502, + "step": 1020 + }, + { + "epoch": 0.85, + "grad_norm": 1.8062084514449492, + "learning_rate": 3.1148814357509147e-06, + "loss": 0.5337, + "step": 1021 + }, + { + "epoch": 0.85, + "grad_norm": 1.669643406466654, + "learning_rate": 3.111711584364068e-06, + "loss": 0.4802, + "step": 1022 + }, + { + "epoch": 0.85, + "grad_norm": 1.6852245083058144, + "learning_rate": 3.1085406865708333e-06, + "loss": 0.532, + "step": 1023 + }, + { + "epoch": 0.85, + "grad_norm": 1.8463748056800222, + "learning_rate": 3.1053687477954124e-06, + "loss": 0.5112, + "step": 1024 + }, + { + "epoch": 0.85, + "grad_norm": 1.7302148909577209, + "learning_rate": 3.10219577346379e-06, + "loss": 0.5549, + "step": 1025 + }, + { + "epoch": 0.85, + "grad_norm": 1.7752983463714818, + "learning_rate": 3.0990217690037206e-06, + "loss": 0.5606, + "step": 1026 + }, + { + "epoch": 0.85, + "grad_norm": 1.695119975844164, + "learning_rate": 3.09584673984472e-06, + "loss": 0.486, + "step": 1027 + }, + { + "epoch": 0.85, + "grad_norm": 1.793543444803663, + "learning_rate": 3.0926706914180605e-06, + "loss": 0.6474, + "step": 1028 + }, + { + "epoch": 0.85, + "grad_norm": 1.6954588940750932, + "learning_rate": 3.089493629156755e-06, + "loss": 0.5208, + "step": 1029 + }, + { + "epoch": 0.85, + "grad_norm": 1.9045089074493644, + "learning_rate": 3.08631555849555e-06, + "loss": 0.5291, + "step": 1030 + }, + { + "epoch": 0.85, + "grad_norm": 1.8481217904786489, + "learning_rate": 3.083136484870921e-06, + "loss": 0.5212, + "step": 1031 + }, + { + "epoch": 0.86, + "grad_norm": 1.6729420221561044, + "learning_rate": 3.0799564137210536e-06, + "loss": 0.5024, + "step": 1032 + }, + { + "epoch": 0.86, + "grad_norm": 1.8821832248249077, + "learning_rate": 3.076775350485845e-06, + "loss": 0.5459, + "step": 1033 + }, + { + "epoch": 0.86, + "grad_norm": 1.762473350167322, + "learning_rate": 3.0735933006068863e-06, + "loss": 0.4938, + "step": 1034 + }, + { + "epoch": 0.86, + "grad_norm": 1.7950707678098703, + "learning_rate": 3.0704102695274573e-06, + "loss": 0.4922, + "step": 1035 + }, + { + "epoch": 0.86, + "grad_norm": 1.6853644769275375, + "learning_rate": 3.0672262626925174e-06, + "loss": 0.47, + "step": 1036 + }, + { + "epoch": 0.86, + "grad_norm": 1.809909106997157, + "learning_rate": 3.0640412855486922e-06, + "loss": 0.5545, + "step": 1037 + }, + { + "epoch": 0.86, + "grad_norm": 2.019472393876661, + "learning_rate": 3.06085534354427e-06, + "loss": 0.5616, + "step": 1038 + }, + { + "epoch": 0.86, + "grad_norm": 1.7972785887075076, + "learning_rate": 3.057668442129188e-06, + "loss": 0.5269, + "step": 1039 + }, + { + "epoch": 0.86, + "grad_norm": 1.865555820217107, + "learning_rate": 3.054480586755026e-06, + "loss": 0.5752, + "step": 1040 + }, + { + "epoch": 0.86, + "grad_norm": 1.792147096098412, + "learning_rate": 3.051291782874995e-06, + "loss": 0.54, + "step": 1041 + }, + { + "epoch": 0.86, + "grad_norm": 1.8108893550848508, + "learning_rate": 3.048102035943927e-06, + "loss": 0.5367, + "step": 1042 + }, + { + "epoch": 0.86, + "grad_norm": 2.0966646553454793, + "learning_rate": 3.04491135141827e-06, + "loss": 0.5455, + "step": 1043 + }, + { + "epoch": 0.87, + "grad_norm": 1.7357403687049695, + "learning_rate": 3.041719734756073e-06, + "loss": 0.502, + "step": 1044 + }, + { + "epoch": 0.87, + "grad_norm": 1.8033826162723872, + "learning_rate": 3.038527191416982e-06, + "loss": 0.5644, + "step": 1045 + }, + { + "epoch": 0.87, + "grad_norm": 1.7822928111630525, + "learning_rate": 3.0353337268622267e-06, + "loss": 0.4938, + "step": 1046 + }, + { + "epoch": 0.87, + "grad_norm": 1.7910319343463081, + "learning_rate": 3.0321393465546134e-06, + "loss": 0.5889, + "step": 1047 + }, + { + "epoch": 0.87, + "grad_norm": 1.7457160087273953, + "learning_rate": 3.028944055958514e-06, + "loss": 0.5022, + "step": 1048 + }, + { + "epoch": 0.87, + "grad_norm": 1.691379648176161, + "learning_rate": 3.0257478605398595e-06, + "loss": 0.4841, + "step": 1049 + }, + { + "epoch": 0.87, + "grad_norm": 1.7452186987943483, + "learning_rate": 3.0225507657661257e-06, + "loss": 0.5626, + "step": 1050 + }, + { + "epoch": 0.87, + "grad_norm": 1.7578678635930594, + "learning_rate": 3.0193527771063297e-06, + "loss": 0.5115, + "step": 1051 + }, + { + "epoch": 0.87, + "grad_norm": 1.7879798898209605, + "learning_rate": 3.016153900031016e-06, + "loss": 0.5296, + "step": 1052 + }, + { + "epoch": 0.87, + "grad_norm": 1.6745604796677231, + "learning_rate": 3.0129541400122492e-06, + "loss": 0.5089, + "step": 1053 + }, + { + "epoch": 0.87, + "grad_norm": 1.8484438696306678, + "learning_rate": 3.0097535025236045e-06, + "loss": 0.6124, + "step": 1054 + }, + { + "epoch": 0.87, + "grad_norm": 1.8023880068850882, + "learning_rate": 3.0065519930401595e-06, + "loss": 0.4983, + "step": 1055 + }, + { + "epoch": 0.88, + "grad_norm": 1.743901583565096, + "learning_rate": 3.0033496170384803e-06, + "loss": 0.4998, + "step": 1056 + }, + { + "epoch": 0.88, + "grad_norm": 1.9494472820876043, + "learning_rate": 3.000146379996617e-06, + "loss": 0.537, + "step": 1057 + }, + { + "epoch": 0.88, + "grad_norm": 1.6992995489648048, + "learning_rate": 2.996942287394093e-06, + "loss": 0.5822, + "step": 1058 + }, + { + "epoch": 0.88, + "grad_norm": 1.8498288139189643, + "learning_rate": 2.993737344711895e-06, + "loss": 0.5651, + "step": 1059 + }, + { + "epoch": 0.88, + "grad_norm": 1.755920633785882, + "learning_rate": 2.990531557432464e-06, + "loss": 0.496, + "step": 1060 + }, + { + "epoch": 0.88, + "grad_norm": 1.7876484928074277, + "learning_rate": 2.9873249310396853e-06, + "loss": 0.5224, + "step": 1061 + }, + { + "epoch": 0.88, + "grad_norm": 1.7573987279473129, + "learning_rate": 2.98411747101888e-06, + "loss": 0.5228, + "step": 1062 + }, + { + "epoch": 0.88, + "grad_norm": 1.6995721104857204, + "learning_rate": 2.980909182856794e-06, + "loss": 0.4758, + "step": 1063 + }, + { + "epoch": 0.88, + "grad_norm": 1.907464743607936, + "learning_rate": 2.9777000720415916e-06, + "loss": 0.5254, + "step": 1064 + }, + { + "epoch": 0.88, + "grad_norm": 1.7921365259203703, + "learning_rate": 2.974490144062844e-06, + "loss": 0.5116, + "step": 1065 + }, + { + "epoch": 0.88, + "grad_norm": 1.9010192849593792, + "learning_rate": 2.9712794044115196e-06, + "loss": 0.5136, + "step": 1066 + }, + { + "epoch": 0.88, + "grad_norm": 1.742881813035793, + "learning_rate": 2.968067858579975e-06, + "loss": 0.5436, + "step": 1067 + }, + { + "epoch": 0.89, + "grad_norm": 1.7135933558215708, + "learning_rate": 2.964855512061947e-06, + "loss": 0.5268, + "step": 1068 + }, + { + "epoch": 0.89, + "grad_norm": 1.8360025545734582, + "learning_rate": 2.9616423703525414e-06, + "loss": 0.5238, + "step": 1069 + }, + { + "epoch": 0.89, + "grad_norm": 1.7090421713960848, + "learning_rate": 2.9584284389482237e-06, + "loss": 0.5051, + "step": 1070 + }, + { + "epoch": 0.89, + "grad_norm": 1.7462732547158757, + "learning_rate": 2.9552137233468113e-06, + "loss": 0.4838, + "step": 1071 + }, + { + "epoch": 0.89, + "grad_norm": 1.9336108910937513, + "learning_rate": 2.951998229047464e-06, + "loss": 0.5576, + "step": 1072 + }, + { + "epoch": 0.89, + "grad_norm": 1.784092660568157, + "learning_rate": 2.9487819615506702e-06, + "loss": 0.5349, + "step": 1073 + }, + { + "epoch": 0.89, + "grad_norm": 1.772640354616067, + "learning_rate": 2.945564926358245e-06, + "loss": 0.5423, + "step": 1074 + }, + { + "epoch": 0.89, + "grad_norm": 1.8491968859591044, + "learning_rate": 2.9423471289733125e-06, + "loss": 0.5453, + "step": 1075 + }, + { + "epoch": 0.89, + "grad_norm": 1.8283172103770493, + "learning_rate": 2.9391285749003046e-06, + "loss": 0.5318, + "step": 1076 + }, + { + "epoch": 0.89, + "grad_norm": 1.7802483696828226, + "learning_rate": 2.935909269644946e-06, + "loss": 0.4954, + "step": 1077 + }, + { + "epoch": 0.89, + "grad_norm": 1.8687809173149, + "learning_rate": 2.9326892187142457e-06, + "loss": 0.5428, + "step": 1078 + }, + { + "epoch": 0.89, + "grad_norm": 1.9218917868616974, + "learning_rate": 2.9294684276164888e-06, + "loss": 0.5125, + "step": 1079 + }, + { + "epoch": 0.9, + "grad_norm": 1.8406300824318225, + "learning_rate": 2.9262469018612278e-06, + "loss": 0.5186, + "step": 1080 + }, + { + "epoch": 0.9, + "grad_norm": 1.8153319034513924, + "learning_rate": 2.9230246469592695e-06, + "loss": 0.4878, + "step": 1081 + }, + { + "epoch": 0.9, + "grad_norm": 1.8381190525343576, + "learning_rate": 2.91980166842267e-06, + "loss": 0.5455, + "step": 1082 + }, + { + "epoch": 0.9, + "grad_norm": 1.7941629060330144, + "learning_rate": 2.9165779717647212e-06, + "loss": 0.5425, + "step": 1083 + }, + { + "epoch": 0.9, + "grad_norm": 1.755950985861856, + "learning_rate": 2.9133535624999466e-06, + "loss": 0.4992, + "step": 1084 + }, + { + "epoch": 0.9, + "grad_norm": 1.8065716401418646, + "learning_rate": 2.9101284461440853e-06, + "loss": 0.5569, + "step": 1085 + }, + { + "epoch": 0.9, + "grad_norm": 1.8487073865649808, + "learning_rate": 2.9069026282140887e-06, + "loss": 0.5352, + "step": 1086 + }, + { + "epoch": 0.9, + "grad_norm": 1.877024524581134, + "learning_rate": 2.903676114228107e-06, + "loss": 0.5584, + "step": 1087 + }, + { + "epoch": 0.9, + "grad_norm": 1.812931375367902, + "learning_rate": 2.9004489097054807e-06, + "loss": 0.5154, + "step": 1088 + }, + { + "epoch": 0.9, + "grad_norm": 1.7729938020658174, + "learning_rate": 2.897221020166732e-06, + "loss": 0.5386, + "step": 1089 + }, + { + "epoch": 0.9, + "grad_norm": 1.6991898958250629, + "learning_rate": 2.8939924511335555e-06, + "loss": 0.5467, + "step": 1090 + }, + { + "epoch": 0.9, + "grad_norm": 1.7298323860671052, + "learning_rate": 2.890763208128807e-06, + "loss": 0.5506, + "step": 1091 + }, + { + "epoch": 0.91, + "grad_norm": 1.9718362378496106, + "learning_rate": 2.887533296676497e-06, + "loss": 0.5453, + "step": 1092 + }, + { + "epoch": 0.91, + "grad_norm": 1.7003897379752575, + "learning_rate": 2.8843027223017767e-06, + "loss": 0.5016, + "step": 1093 + }, + { + "epoch": 0.91, + "grad_norm": 1.7604846690613096, + "learning_rate": 2.8810714905309346e-06, + "loss": 0.5206, + "step": 1094 + }, + { + "epoch": 0.91, + "grad_norm": 1.868522047775135, + "learning_rate": 2.8778396068913807e-06, + "loss": 0.5152, + "step": 1095 + }, + { + "epoch": 0.91, + "grad_norm": 1.8080911269766844, + "learning_rate": 2.874607076911642e-06, + "loss": 0.4966, + "step": 1096 + }, + { + "epoch": 0.91, + "grad_norm": 1.7767037245003534, + "learning_rate": 2.871373906121351e-06, + "loss": 0.5081, + "step": 1097 + }, + { + "epoch": 0.91, + "grad_norm": 1.733045586658075, + "learning_rate": 2.8681401000512356e-06, + "loss": 0.5031, + "step": 1098 + }, + { + "epoch": 0.91, + "grad_norm": 1.6767478479637847, + "learning_rate": 2.8649056642331103e-06, + "loss": 0.4856, + "step": 1099 + }, + { + "epoch": 0.91, + "grad_norm": 1.6820690185704608, + "learning_rate": 2.8616706041998686e-06, + "loss": 0.5151, + "step": 1100 + }, + { + "epoch": 0.91, + "grad_norm": 1.840181264549285, + "learning_rate": 2.8584349254854693e-06, + "loss": 0.5393, + "step": 1101 + }, + { + "epoch": 0.91, + "grad_norm": 1.827807570004724, + "learning_rate": 2.8551986336249322e-06, + "loss": 0.5572, + "step": 1102 + }, + { + "epoch": 0.91, + "grad_norm": 1.711815265099016, + "learning_rate": 2.8519617341543233e-06, + "loss": 0.5184, + "step": 1103 + }, + { + "epoch": 0.92, + "grad_norm": 1.7460018389221874, + "learning_rate": 2.8487242326107495e-06, + "loss": 0.5374, + "step": 1104 + }, + { + "epoch": 0.92, + "grad_norm": 1.985067366728648, + "learning_rate": 2.8454861345323475e-06, + "loss": 0.538, + "step": 1105 + }, + { + "epoch": 0.92, + "grad_norm": 1.8044567576569952, + "learning_rate": 2.8422474454582754e-06, + "loss": 0.4947, + "step": 1106 + }, + { + "epoch": 0.92, + "grad_norm": 1.7648712890692506, + "learning_rate": 2.8390081709286997e-06, + "loss": 0.5584, + "step": 1107 + }, + { + "epoch": 0.92, + "grad_norm": 1.7544905722043518, + "learning_rate": 2.8357683164847903e-06, + "loss": 0.5696, + "step": 1108 + }, + { + "epoch": 0.92, + "grad_norm": 1.7923136846837993, + "learning_rate": 2.8325278876687084e-06, + "loss": 0.5502, + "step": 1109 + }, + { + "epoch": 0.92, + "grad_norm": 2.077195937792951, + "learning_rate": 2.8292868900235986e-06, + "loss": 0.543, + "step": 1110 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675854046933754, + "learning_rate": 2.826045329093578e-06, + "loss": 0.5422, + "step": 1111 + }, + { + "epoch": 0.92, + "grad_norm": 1.8457239401392898, + "learning_rate": 2.822803210423727e-06, + "loss": 0.5334, + "step": 1112 + }, + { + "epoch": 0.92, + "grad_norm": 1.7426929121470698, + "learning_rate": 2.8195605395600804e-06, + "loss": 0.4972, + "step": 1113 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675216264197045, + "learning_rate": 2.8163173220496175e-06, + "loss": 0.5442, + "step": 1114 + }, + { + "epoch": 0.92, + "grad_norm": 1.7483102565661375, + "learning_rate": 2.8130735634402527e-06, + "loss": 0.5425, + "step": 1115 + }, + { + "epoch": 0.93, + "grad_norm": 1.692036399159914, + "learning_rate": 2.8098292692808253e-06, + "loss": 0.521, + "step": 1116 + }, + { + "epoch": 0.93, + "grad_norm": 1.799980213437577, + "learning_rate": 2.8065844451210933e-06, + "loss": 0.5597, + "step": 1117 + }, + { + "epoch": 0.93, + "grad_norm": 1.7666190830884467, + "learning_rate": 2.803339096511718e-06, + "loss": 0.5612, + "step": 1118 + }, + { + "epoch": 0.93, + "grad_norm": 1.792129515845057, + "learning_rate": 2.8000932290042597e-06, + "loss": 0.5334, + "step": 1119 + }, + { + "epoch": 0.93, + "grad_norm": 1.7395715578516604, + "learning_rate": 2.7968468481511663e-06, + "loss": 0.5545, + "step": 1120 + }, + { + "epoch": 0.93, + "grad_norm": 1.6843830287676704, + "learning_rate": 2.7935999595057623e-06, + "loss": 0.5659, + "step": 1121 + }, + { + "epoch": 0.93, + "grad_norm": 1.6432688824199502, + "learning_rate": 2.790352568622244e-06, + "loss": 0.4926, + "step": 1122 + }, + { + "epoch": 0.93, + "grad_norm": 1.7430642435954644, + "learning_rate": 2.787104681055663e-06, + "loss": 0.4666, + "step": 1123 + }, + { + "epoch": 0.93, + "grad_norm": 1.8067789882264202, + "learning_rate": 2.783856302361923e-06, + "loss": 0.5233, + "step": 1124 + }, + { + "epoch": 0.93, + "grad_norm": 1.7685143281757654, + "learning_rate": 2.780607438097769e-06, + "loss": 0.5506, + "step": 1125 + }, + { + "epoch": 0.93, + "grad_norm": 1.7163110868931304, + "learning_rate": 2.7773580938207717e-06, + "loss": 0.5044, + "step": 1126 + }, + { + "epoch": 0.93, + "grad_norm": 1.809036270322799, + "learning_rate": 2.7741082750893284e-06, + "loss": 0.5206, + "step": 1127 + }, + { + "epoch": 0.94, + "grad_norm": 1.8193898978325846, + "learning_rate": 2.770857987462645e-06, + "loss": 0.6064, + "step": 1128 + }, + { + "epoch": 0.94, + "grad_norm": 1.765826426309075, + "learning_rate": 2.76760723650073e-06, + "loss": 0.4914, + "step": 1129 + }, + { + "epoch": 0.94, + "grad_norm": 2.046345230237298, + "learning_rate": 2.764356027764385e-06, + "loss": 0.5938, + "step": 1130 + }, + { + "epoch": 0.94, + "grad_norm": 1.8264697696225647, + "learning_rate": 2.7611043668151948e-06, + "loss": 0.5476, + "step": 1131 + }, + { + "epoch": 0.94, + "grad_norm": 1.7776043318415495, + "learning_rate": 2.7578522592155166e-06, + "loss": 0.5318, + "step": 1132 + }, + { + "epoch": 0.94, + "grad_norm": 1.767284538432005, + "learning_rate": 2.7545997105284735e-06, + "loss": 0.5197, + "step": 1133 + }, + { + "epoch": 0.94, + "grad_norm": 1.831190014066027, + "learning_rate": 2.75134672631794e-06, + "loss": 0.4939, + "step": 1134 + }, + { + "epoch": 0.94, + "grad_norm": 1.7727769641989948, + "learning_rate": 2.7480933121485394e-06, + "loss": 0.5542, + "step": 1135 + }, + { + "epoch": 0.94, + "grad_norm": 1.7599576706599651, + "learning_rate": 2.7448394735856275e-06, + "loss": 0.5102, + "step": 1136 + }, + { + "epoch": 0.94, + "grad_norm": 1.7526987759875383, + "learning_rate": 2.7415852161952893e-06, + "loss": 0.5357, + "step": 1137 + }, + { + "epoch": 0.94, + "grad_norm": 1.7478180377944075, + "learning_rate": 2.7383305455443223e-06, + "loss": 0.552, + "step": 1138 + }, + { + "epoch": 0.94, + "grad_norm": 1.8026983878339322, + "learning_rate": 2.7350754672002334e-06, + "loss": 0.5324, + "step": 1139 + }, + { + "epoch": 0.95, + "grad_norm": 1.7539604119960455, + "learning_rate": 2.7318199867312267e-06, + "loss": 0.4951, + "step": 1140 + }, + { + "epoch": 0.95, + "grad_norm": 1.7060714376533908, + "learning_rate": 2.728564109706193e-06, + "loss": 0.5044, + "step": 1141 + }, + { + "epoch": 0.95, + "grad_norm": 1.896732668736906, + "learning_rate": 2.725307841694704e-06, + "loss": 0.5272, + "step": 1142 + }, + { + "epoch": 0.95, + "grad_norm": 1.9094037542829962, + "learning_rate": 2.722051188266998e-06, + "loss": 0.5036, + "step": 1143 + }, + { + "epoch": 0.95, + "grad_norm": 1.7529900591353695, + "learning_rate": 2.7187941549939723e-06, + "loss": 0.4962, + "step": 1144 + }, + { + "epoch": 0.95, + "grad_norm": 1.7652784724721573, + "learning_rate": 2.7155367474471763e-06, + "loss": 0.5159, + "step": 1145 + }, + { + "epoch": 0.95, + "grad_norm": 1.9070275680276054, + "learning_rate": 2.7122789711987964e-06, + "loss": 0.5269, + "step": 1146 + }, + { + "epoch": 0.95, + "grad_norm": 1.7630505518040367, + "learning_rate": 2.709020831821652e-06, + "loss": 0.5286, + "step": 1147 + }, + { + "epoch": 0.95, + "grad_norm": 1.7410138974922291, + "learning_rate": 2.7057623348891846e-06, + "loss": 0.4902, + "step": 1148 + }, + { + "epoch": 0.95, + "grad_norm": 1.745842560539345, + "learning_rate": 2.7025034859754446e-06, + "loss": 0.5178, + "step": 1149 + }, + { + "epoch": 0.95, + "grad_norm": 1.8498982578771728, + "learning_rate": 2.699244290655086e-06, + "loss": 0.55, + "step": 1150 + }, + { + "epoch": 0.95, + "grad_norm": 1.6360369924184164, + "learning_rate": 2.6959847545033558e-06, + "loss": 0.4988, + "step": 1151 + }, + { + "epoch": 0.96, + "grad_norm": 1.6784833460211517, + "learning_rate": 2.692724883096082e-06, + "loss": 0.5303, + "step": 1152 + }, + { + "epoch": 0.96, + "grad_norm": 1.7888637226825195, + "learning_rate": 2.68946468200967e-06, + "loss": 0.542, + "step": 1153 + }, + { + "epoch": 0.96, + "grad_norm": 1.7156031503954616, + "learning_rate": 2.686204156821084e-06, + "loss": 0.499, + "step": 1154 + }, + { + "epoch": 0.96, + "grad_norm": 1.802618839032982, + "learning_rate": 2.6829433131078464e-06, + "loss": 0.5095, + "step": 1155 + }, + { + "epoch": 0.96, + "grad_norm": 1.7018673816457677, + "learning_rate": 2.6796821564480237e-06, + "loss": 0.4911, + "step": 1156 + }, + { + "epoch": 0.96, + "grad_norm": 1.939833859373507, + "learning_rate": 2.6764206924202173e-06, + "loss": 0.5965, + "step": 1157 + }, + { + "epoch": 0.96, + "grad_norm": 1.757462214596805, + "learning_rate": 2.673158926603554e-06, + "loss": 0.5119, + "step": 1158 + }, + { + "epoch": 0.96, + "grad_norm": 1.824906787992325, + "learning_rate": 2.669896864577678e-06, + "loss": 0.4995, + "step": 1159 + }, + { + "epoch": 0.96, + "grad_norm": 1.6963319988581682, + "learning_rate": 2.666634511922739e-06, + "loss": 0.499, + "step": 1160 + }, + { + "epoch": 0.96, + "grad_norm": 1.7490967555131538, + "learning_rate": 2.6633718742193837e-06, + "loss": 0.5045, + "step": 1161 + }, + { + "epoch": 0.96, + "grad_norm": 1.7295387040616608, + "learning_rate": 2.660108957048749e-06, + "loss": 0.48, + "step": 1162 + }, + { + "epoch": 0.96, + "grad_norm": 1.7062936128447537, + "learning_rate": 2.656845765992447e-06, + "loss": 0.5024, + "step": 1163 + }, + { + "epoch": 0.96, + "grad_norm": 1.7291223687738257, + "learning_rate": 2.6535823066325594e-06, + "loss": 0.4965, + "step": 1164 + }, + { + "epoch": 0.97, + "grad_norm": 1.7660018876230184, + "learning_rate": 2.650318584551626e-06, + "loss": 0.6289, + "step": 1165 + }, + { + "epoch": 0.97, + "grad_norm": 1.6875948695046943, + "learning_rate": 2.6470546053326375e-06, + "loss": 0.5099, + "step": 1166 + }, + { + "epoch": 0.97, + "grad_norm": 1.7055862895950586, + "learning_rate": 2.643790374559023e-06, + "loss": 0.4748, + "step": 1167 + }, + { + "epoch": 0.97, + "grad_norm": 1.8397810404769834, + "learning_rate": 2.6405258978146443e-06, + "loss": 0.5547, + "step": 1168 + }, + { + "epoch": 0.97, + "grad_norm": 1.6780759297615608, + "learning_rate": 2.6372611806837804e-06, + "loss": 0.4696, + "step": 1169 + }, + { + "epoch": 0.97, + "grad_norm": 1.7463193906158438, + "learning_rate": 2.633996228751125e-06, + "loss": 0.5167, + "step": 1170 + }, + { + "epoch": 0.97, + "grad_norm": 1.7682737157303552, + "learning_rate": 2.6307310476017705e-06, + "loss": 0.5178, + "step": 1171 + }, + { + "epoch": 0.97, + "grad_norm": 1.7759532350573655, + "learning_rate": 2.627465642821203e-06, + "loss": 0.5411, + "step": 1172 + }, + { + "epoch": 0.97, + "grad_norm": 1.741742707150691, + "learning_rate": 2.624200019995293e-06, + "loss": 0.5357, + "step": 1173 + }, + { + "epoch": 0.97, + "grad_norm": 1.7638181255611864, + "learning_rate": 2.6209341847102787e-06, + "loss": 0.5598, + "step": 1174 + }, + { + "epoch": 0.97, + "grad_norm": 1.6585763596592404, + "learning_rate": 2.6176681425527663e-06, + "loss": 0.4891, + "step": 1175 + }, + { + "epoch": 0.97, + "grad_norm": 1.7652514703885578, + "learning_rate": 2.614401899109716e-06, + "loss": 0.5412, + "step": 1176 + }, + { + "epoch": 0.98, + "grad_norm": 1.7646286601286296, + "learning_rate": 2.6111354599684287e-06, + "loss": 0.4753, + "step": 1177 + }, + { + "epoch": 0.98, + "grad_norm": 1.7933546923906454, + "learning_rate": 2.6078688307165436e-06, + "loss": 0.5159, + "step": 1178 + }, + { + "epoch": 0.98, + "grad_norm": 1.8474498352431208, + "learning_rate": 2.6046020169420223e-06, + "loss": 0.4786, + "step": 1179 + }, + { + "epoch": 0.98, + "grad_norm": 1.816609500392057, + "learning_rate": 2.601335024233145e-06, + "loss": 0.5821, + "step": 1180 + }, + { + "epoch": 0.98, + "grad_norm": 1.7603922858788037, + "learning_rate": 2.598067858178495e-06, + "loss": 0.4749, + "step": 1181 + }, + { + "epoch": 0.98, + "grad_norm": 1.771168764538133, + "learning_rate": 2.594800524366956e-06, + "loss": 0.5221, + "step": 1182 + }, + { + "epoch": 0.98, + "grad_norm": 1.7428386931770696, + "learning_rate": 2.591533028387694e-06, + "loss": 0.5243, + "step": 1183 + }, + { + "epoch": 0.98, + "grad_norm": 1.7354647623517858, + "learning_rate": 2.588265375830155e-06, + "loss": 0.4665, + "step": 1184 + }, + { + "epoch": 0.98, + "grad_norm": 1.7757829783254058, + "learning_rate": 2.5849975722840537e-06, + "loss": 0.4713, + "step": 1185 + }, + { + "epoch": 0.98, + "grad_norm": 1.7660698291034924, + "learning_rate": 2.58172962333936e-06, + "loss": 0.5198, + "step": 1186 + }, + { + "epoch": 0.98, + "grad_norm": 1.7071465020770178, + "learning_rate": 2.5784615345862963e-06, + "loss": 0.5355, + "step": 1187 + }, + { + "epoch": 0.98, + "grad_norm": 1.6994920599655763, + "learning_rate": 2.5751933116153215e-06, + "loss": 0.4867, + "step": 1188 + }, + { + "epoch": 0.99, + "grad_norm": 1.7891977115774562, + "learning_rate": 2.5719249600171247e-06, + "loss": 0.5071, + "step": 1189 + }, + { + "epoch": 0.99, + "grad_norm": 1.6866451169084888, + "learning_rate": 2.568656485382616e-06, + "loss": 0.4767, + "step": 1190 + }, + { + "epoch": 0.99, + "grad_norm": 1.9106444693405875, + "learning_rate": 2.5653878933029134e-06, + "loss": 0.5063, + "step": 1191 + }, + { + "epoch": 0.99, + "grad_norm": 1.7546015951107552, + "learning_rate": 2.56211918936934e-06, + "loss": 0.5536, + "step": 1192 + }, + { + "epoch": 0.99, + "grad_norm": 1.7866083346923656, + "learning_rate": 2.5588503791734053e-06, + "loss": 0.4738, + "step": 1193 + }, + { + "epoch": 0.99, + "grad_norm": 1.6678313975517949, + "learning_rate": 2.5555814683068058e-06, + "loss": 0.5095, + "step": 1194 + }, + { + "epoch": 0.99, + "grad_norm": 1.694690087625629, + "learning_rate": 2.552312462361405e-06, + "loss": 0.5711, + "step": 1195 + }, + { + "epoch": 0.99, + "grad_norm": 1.7583066556547233, + "learning_rate": 2.5490433669292337e-06, + "loss": 0.5183, + "step": 1196 + }, + { + "epoch": 0.99, + "grad_norm": 1.8259327544569408, + "learning_rate": 2.5457741876024716e-06, + "loss": 0.5129, + "step": 1197 + }, + { + "epoch": 0.99, + "grad_norm": 1.743709458286742, + "learning_rate": 2.542504929973445e-06, + "loss": 0.509, + "step": 1198 + }, + { + "epoch": 0.99, + "grad_norm": 1.8551037168096902, + "learning_rate": 2.5392355996346134e-06, + "loss": 0.4874, + "step": 1199 + }, + { + "epoch": 0.99, + "grad_norm": 1.7705896553689628, + "learning_rate": 2.5359662021785596e-06, + "loss": 0.5102, + "step": 1200 + }, + { + "epoch": 1.0, + "grad_norm": 1.8456154073029885, + "learning_rate": 2.532696743197982e-06, + "loss": 0.5363, + "step": 1201 + }, + { + "epoch": 1.0, + "grad_norm": 1.7341454202963031, + "learning_rate": 2.529427228285686e-06, + "loss": 0.5013, + "step": 1202 + }, + { + "epoch": 1.0, + "grad_norm": 1.7923147732329405, + "learning_rate": 2.526157663034568e-06, + "loss": 0.5191, + "step": 1203 + }, + { + "epoch": 1.0, + "grad_norm": 1.731262319220837, + "learning_rate": 2.522888053037616e-06, + "loss": 0.4889, + "step": 1204 + }, + { + "epoch": 1.0, + "grad_norm": 1.797800368847369, + "learning_rate": 2.5196184038878895e-06, + "loss": 0.4868, + "step": 1205 + }, + { + "epoch": 1.0, + "grad_norm": 1.8182272292135089, + "learning_rate": 2.5163487211785194e-06, + "loss": 0.5159, + "step": 1206 + } + ], + "logging_steps": 1, + "max_steps": 2412, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 603, + "total_flos": 568033919631360.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1206/training_args.bin b/checkpoint-1206/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e --- /dev/null +++ b/checkpoint-1206/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26 +size 7352 diff --git a/checkpoint-1206/zero_to_fp32.py b/checkpoint-1206/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b --- /dev/null +++ b/checkpoint-1206/zero_to_fp32.py @@ -0,0 +1,592 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-1809/config.json b/checkpoint-1809/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76 --- /dev/null +++ b/checkpoint-1809/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "alpindale/Mistral-7B-v0.2-hf", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": false, + "vocab_size": 32002 +} diff --git a/checkpoint-1809/generation_config.json b/checkpoint-1809/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc --- /dev/null +++ b/checkpoint-1809/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "transformers_version": "4.38.2" +} diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec2bb87aa840c4f3e39f00243d40d225c8aacc6a --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b761fd04713774a95e8f55cd1512423d9b60e2f7d1067e957dc4e58d7c1aee5 +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1332c73f66ac989a229a64fc4ed2a6f429b1298c --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1da59e8825401cf7aaea87c02121a8975ef5f635d67de2862ef3afdfad8d1efc +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..82a2c46eef9a5c40de66e477f837855f0f88217d --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e8bd1199d401a427bc6ab3b0346969377314f13a04d0c44b6974dadeecb0297 +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1548c0ee9b64e20cf46cc668368e7e994ca804a --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b240e3057006281bb7c0412acf601226c8c038a3a0e68b6e95257be0a08d5ceb +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6ccd117435d70689b52e4a411e32d82c4fab024 --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc16be622ac689221d02ff32361dea668dfa54a3962f29f23b1d3fbe7fbd603f +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..971d9630a4c632966f231163619685f075d796c2 --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a31f0153235fb37447cb31d015478d03b41cbc646dbb0c5d908eea301aef039b +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..672f632651c0cf14adead1079e64ea596ba74743 --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f8a1eb5c697a46b107c09fcd567344229ca763eef9fd99b2b55f96093e8c83e +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b811ee40cada0019ea8ba14c219f957f514f054 --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1accd07344e616294aa51015f46d2e872115ae1bba0a305fa544af215d76bb69 +size 4831623435 diff --git a/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..37041e00873f94ab25436dc1549092be1dbd9a02 --- /dev/null +++ b/checkpoint-1809/global_step1809/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be77fe5c721c18aa3684678f1c47a77d3094b2d950cc83f13282dd740ed64b61 +size 4831623435 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..292ec98f06c3f10540c0d9cb8dbfc144d2e4c877 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd31ba10755723f3d557ea9d00db6b1c1e800660204bfe845e0ffcf74d789c83 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0eed4345efd988d9831dfc723f9f95285c2b0ab --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4624485375edf79b90d9bf1c4daa6da34f3494ae148bfc4b24bd73b9997bc0c1 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10ccacf21a705964b9e9334830fd5175cd9b3141 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a26376f5885a10c708f9a117f274d6ccaee0da9ccfeb1ac15213aa1509830c +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4f4632248727b10fb10f41d0b1f7ed34c735ca2 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48226f62e4bb0a5db0ce34c43c572a25d6f0da19415f838cfe65e4702302a662 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1afef7dde30e1e4784d6ccc68bc284949c6a6b4 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d4608cee058708405735822e13a938e3b3c944f9a7c1d220ded5f2f06b572c3 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..201b8fa9d94f83920ccbe88aad273b22f6c99c53 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5423f97e3f0e471a6ea41652cd8ec31ae166df737f3fcc7ec1a3ac2958274718 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..163127281d910c3f6bf90fa4cd42d4854a2a1e97 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95ee1216c4418444b48374ee04393e19379060c968a8b0ef16b86c3b411e54c8 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35a749e0a0d3fe39c2c720e8c886d3893d1b6793 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2726e76b841826bbe108a1bed34734a9250b20b051559fee1d53d7300a2d0e8 +size 153829 diff --git a/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..766c35e84d3cc839458c28c25c83337a954e3ca4 --- /dev/null +++ b/checkpoint-1809/global_step1809/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c1471f91b15beccbbcb3c355b08a1e31f0841ecbdf88d26267ab30fd5b4697 +size 153829 diff --git a/checkpoint-1809/latest b/checkpoint-1809/latest new file mode 100644 index 0000000000000000000000000000000000000000..40509fea8c03b0331f0d689e84c0191961ecc7c3 --- /dev/null +++ b/checkpoint-1809/latest @@ -0,0 +1 @@ +global_step1809 \ No newline at end of file diff --git a/checkpoint-1809/model-00001-of-00003.safetensors b/checkpoint-1809/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..61710203dc37adf5e29e02a035dd805965011aea --- /dev/null +++ b/checkpoint-1809/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aa419e433b185323444a3b8350d979b45a038e6887330b3a1edaacf48ac9f2d +size 4943178720 diff --git a/checkpoint-1809/model-00002-of-00003.safetensors b/checkpoint-1809/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7842a9fedcbbc6e8bc6d6791d1f99b8aed523b34 --- /dev/null +++ b/checkpoint-1809/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c5a608fc2645deb20b706f73174b5ddc9df7a86e31b670b4ea896c064afb27 +size 4999819336 diff --git a/checkpoint-1809/model-00003-of-00003.safetensors b/checkpoint-1809/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e1e5307207e0a065ef57ec97d3dad29dc5197319 --- /dev/null +++ b/checkpoint-1809/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a448af004507aa23616541e844c83722dc86610112b69ad59f13b4dc59b466 +size 4540532728 diff --git a/checkpoint-1809/model.safetensors.index.json b/checkpoint-1809/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2 --- /dev/null +++ b/checkpoint-1809/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 14483496960 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/checkpoint-1809/rng_state_0.pth b/checkpoint-1809/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed9c956014a637b9d3ccb494c387c7452ae938e0 --- /dev/null +++ b/checkpoint-1809/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b7907b6e8bbc0deaf9b6cadef63205dade64f9fbf74f9a4dca9c34792d7aab +size 16240 diff --git a/checkpoint-1809/rng_state_1.pth b/checkpoint-1809/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2452cb1ac950d724f0559bab3e53e6a671da5ba --- /dev/null +++ b/checkpoint-1809/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4ca3302c930a1b49ced40d5e2133aedc4c5857930d92deb8c6496a317958d8 +size 16240 diff --git a/checkpoint-1809/rng_state_2.pth b/checkpoint-1809/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..30ca1e0fbf8047c1cd0606a37b02d545623d4a67 --- /dev/null +++ b/checkpoint-1809/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbbf2364108e70a0ac183356d1693182b452bb464271c3d2f4ade972244d710d +size 16240 diff --git a/checkpoint-1809/rng_state_3.pth b/checkpoint-1809/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a342cc40db30db7d18c31cffe2a2e1b1d2f3b084 --- /dev/null +++ b/checkpoint-1809/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9269c171a7948127faa588109a1fb8043194b407d2dfbeda2e25ed8b35126a5 +size 16240 diff --git a/checkpoint-1809/rng_state_4.pth b/checkpoint-1809/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca08e0f4a907b0b1649b7bc3537dd48c83723830 --- /dev/null +++ b/checkpoint-1809/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02625e4547fbacdb164e484867f76d5024a007c22c297f8ecbef13fc6aa3202 +size 16240 diff --git a/checkpoint-1809/rng_state_5.pth b/checkpoint-1809/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1aeba77fabdef8a232c2785991d798bd3f84afd3 --- /dev/null +++ b/checkpoint-1809/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51eb0286c1f14a2c09c443d8c606951c3debeb25f9ba4f71e0aea90ae2f0786e +size 16240 diff --git a/checkpoint-1809/rng_state_6.pth b/checkpoint-1809/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..499c459dc2af4317a2a23f7877927bf7c586e439 --- /dev/null +++ b/checkpoint-1809/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:080bbd36834b7a1623430efdd9f598b791f466541d25b545ca410ec4a930a0f3 +size 16240 diff --git a/checkpoint-1809/rng_state_7.pth b/checkpoint-1809/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdfb9b9f9f3356413f6755deb29a84b7b4e360a2 --- /dev/null +++ b/checkpoint-1809/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54aa959bf290908dfe1fc65c2591b99982e9fdce5caf276626d0084ccffa7e95 +size 16240 diff --git a/checkpoint-1809/rng_state_8.pth b/checkpoint-1809/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..6533db02002842edcb0c9b2a6dd89506e90ac8c8 --- /dev/null +++ b/checkpoint-1809/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f8554f99e72a1c251b463a30088dd49afece6deb61c5ad09834d35ff89308b +size 16240 diff --git a/checkpoint-1809/scheduler.pt b/checkpoint-1809/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6305be6b3a4171fe11369d2578fc7945741c40d5 --- /dev/null +++ b/checkpoint-1809/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef3ed14afeb23e7559e1ece00ec5a5ba48527918d9a770399a0f1d431d2f9b0 +size 1064 diff --git a/checkpoint-1809/trainer_state.json b/checkpoint-1809/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2ec8d4384deb2d7c2bc47244f56f5ecd0ca866f5 --- /dev/null +++ b/checkpoint-1809/trainer_state.json @@ -0,0 +1,12684 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4820725388601037, + "eval_steps": 500, + "global_step": 1809, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 27.81778461909011, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7993, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 28.63833175363421, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9056, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 25.646828828014854, + "learning_rate": 1.5e-06, + "loss": 0.8473, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.834124771941388, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8192, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 10.558095859980105, + "learning_rate": 2.5e-06, + "loss": 0.7943, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.905789045775758, + "learning_rate": 3e-06, + "loss": 0.7075, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.259519170268483, + "learning_rate": 3.5e-06, + "loss": 0.7537, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 6.639042051048664, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7471, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 8.515070932390074, + "learning_rate": 4.5e-06, + "loss": 0.7689, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 8.916410424632533, + "learning_rate": 5e-06, + "loss": 0.7194, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 4.835046497413255, + "learning_rate": 4.9999978617243506e-06, + "loss": 0.6949, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 10.065648500649479, + "learning_rate": 4.9999914469010585e-06, + "loss": 0.7039, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 5.299372887839679, + "learning_rate": 4.999980755541098e-06, + "loss": 0.7067, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 5.693110837094718, + "learning_rate": 4.999965787662758e-06, + "loss": 0.7126, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 2.983869635716314, + "learning_rate": 4.999946543291642e-06, + "loss": 0.6496, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 4.2561193962441175, + "learning_rate": 4.999923022460671e-06, + "loss": 0.7036, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 3.011772824968437, + "learning_rate": 4.999895225210079e-06, + "loss": 0.7009, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 3.386638415717137, + "learning_rate": 4.9998631515874165e-06, + "loss": 0.6624, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 3.764658092125165, + "learning_rate": 4.999826801647551e-06, + "loss": 0.6687, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 2.3982096117966614, + "learning_rate": 4.999786175452662e-06, + "loss": 0.706, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 2.8051633678260193, + "learning_rate": 4.999741273072246e-06, + "loss": 0.7031, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 3.1177784624332614, + "learning_rate": 4.999692094583114e-06, + "loss": 0.7525, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 2.2533819675617806, + "learning_rate": 4.9996386400693906e-06, + "loss": 0.6767, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 2.61893793162573, + "learning_rate": 4.999580909622518e-06, + "loss": 0.6432, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 2.76057623723569, + "learning_rate": 4.999518903341251e-06, + "loss": 0.6809, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 2.27983032069553, + "learning_rate": 4.999452621331657e-06, + "loss": 0.6798, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 2.501904568120582, + "learning_rate": 4.99938206370712e-06, + "loss": 0.6412, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 2.819229290729669, + "learning_rate": 4.999307230588338e-06, + "loss": 0.6188, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 2.1233212322022212, + "learning_rate": 4.9992281221033224e-06, + "loss": 0.6378, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 2.7806911906686755, + "learning_rate": 4.999144738387396e-06, + "loss": 0.6653, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 2.4045490257014563, + "learning_rate": 4.999057079583199e-06, + "loss": 0.6377, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 2.3803717769210446, + "learning_rate": 4.998965145840681e-06, + "loss": 0.6855, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 2.3976652879633473, + "learning_rate": 4.998868937317106e-06, + "loss": 0.6284, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 2.2958541157119727, + "learning_rate": 4.998768454177051e-06, + "loss": 0.6521, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 2.1925196833696154, + "learning_rate": 4.998663696592403e-06, + "loss": 0.6619, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 2.361006042901851, + "learning_rate": 4.998554664742362e-06, + "loss": 0.6155, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 2.1577758143653614, + "learning_rate": 4.998441358813443e-06, + "loss": 0.6398, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 2.219872074512664, + "learning_rate": 4.998323778999467e-06, + "loss": 0.6051, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 2.2907501521408546, + "learning_rate": 4.9982019255015705e-06, + "loss": 0.6337, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 2.1769862324666183, + "learning_rate": 4.9980757985281955e-06, + "loss": 0.6606, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 2.4252479779661607, + "learning_rate": 4.997945398295101e-06, + "loss": 0.6685, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 2.3929541982084657, + "learning_rate": 4.99781072502535e-06, + "loss": 0.6084, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 1.932539969840091, + "learning_rate": 4.997671778949318e-06, + "loss": 0.6123, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 2.191742541327873, + "learning_rate": 4.997528560304688e-06, + "loss": 0.6247, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 2.423376784566499, + "learning_rate": 4.997381069336455e-06, + "loss": 0.7024, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 2.0599055392481076, + "learning_rate": 4.997229306296918e-06, + "loss": 0.6612, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 2.16832922087532, + "learning_rate": 4.997073271445686e-06, + "loss": 0.5949, + "step": 47 + }, + { + "epoch": 0.04, + "grad_norm": 2.0483598654319453, + "learning_rate": 4.9969129650496775e-06, + "loss": 0.6406, + "step": 48 + }, + { + "epoch": 0.04, + "grad_norm": 1.963056609139284, + "learning_rate": 4.996748387383113e-06, + "loss": 0.6361, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 2.2094923844269307, + "learning_rate": 4.996579538727527e-06, + "loss": 0.5901, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 2.1088153449411857, + "learning_rate": 4.996406419371749e-06, + "loss": 0.6458, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 2.093448940617732, + "learning_rate": 4.996229029611926e-06, + "loss": 0.6509, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 2.075116207412987, + "learning_rate": 4.996047369751502e-06, + "loss": 0.6295, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 2.138141165277684, + "learning_rate": 4.995861440101229e-06, + "loss": 0.6088, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 2.186316382848445, + "learning_rate": 4.995671240979161e-06, + "loss": 0.6307, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 2.2513741083982195, + "learning_rate": 4.995476772710657e-06, + "loss": 0.6175, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 2.0827167336870596, + "learning_rate": 4.995278035628379e-06, + "loss": 0.5935, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 2.117977588574442, + "learning_rate": 4.995075030072291e-06, + "loss": 0.5998, + "step": 58 + }, + { + "epoch": 0.05, + "grad_norm": 2.0996940200235485, + "learning_rate": 4.994867756389658e-06, + "loss": 0.6159, + "step": 59 + }, + { + "epoch": 0.05, + "grad_norm": 2.141096165691323, + "learning_rate": 4.994656214935045e-06, + "loss": 0.6294, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 2.022748830058395, + "learning_rate": 4.994440406070323e-06, + "loss": 0.6315, + "step": 61 + }, + { + "epoch": 0.05, + "grad_norm": 2.209132168720991, + "learning_rate": 4.994220330164654e-06, + "loss": 0.5645, + "step": 62 + }, + { + "epoch": 0.05, + "grad_norm": 2.0994557317862674, + "learning_rate": 4.993995987594509e-06, + "loss": 0.6272, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 2.204220831053169, + "learning_rate": 4.99376737874365e-06, + "loss": 0.6379, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 2.127733932186697, + "learning_rate": 4.993534504003141e-06, + "loss": 0.622, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 2.1338506582034316, + "learning_rate": 4.993297363771342e-06, + "loss": 0.6259, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 2.104802764460729, + "learning_rate": 4.993055958453912e-06, + "loss": 0.6414, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 2.0889535347771675, + "learning_rate": 4.9928102884638004e-06, + "loss": 0.6466, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 2.252225316694296, + "learning_rate": 4.992560354221258e-06, + "loss": 0.6167, + "step": 69 + }, + { + "epoch": 0.06, + "grad_norm": 2.015392533516649, + "learning_rate": 4.992306156153827e-06, + "loss": 0.5958, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 2.151741408948778, + "learning_rate": 4.992047694696343e-06, + "loss": 0.5875, + "step": 71 + }, + { + "epoch": 0.06, + "grad_norm": 2.0351299117412696, + "learning_rate": 4.991784970290935e-06, + "loss": 0.5935, + "step": 72 + }, + { + "epoch": 0.06, + "grad_norm": 2.0000962363827983, + "learning_rate": 4.991517983387026e-06, + "loss": 0.6091, + "step": 73 + }, + { + "epoch": 0.06, + "grad_norm": 2.202881736102415, + "learning_rate": 4.99124673444133e-06, + "loss": 0.6122, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 2.015074773396151, + "learning_rate": 4.990971223917848e-06, + "loss": 0.6134, + "step": 75 + }, + { + "epoch": 0.06, + "grad_norm": 2.009305960567766, + "learning_rate": 4.990691452287877e-06, + "loss": 0.6308, + "step": 76 + }, + { + "epoch": 0.06, + "grad_norm": 1.9967884756310221, + "learning_rate": 4.990407420029999e-06, + "loss": 0.6098, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 2.0858738033925905, + "learning_rate": 4.990119127630085e-06, + "loss": 0.6344, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 1.9427707561903895, + "learning_rate": 4.989826575581295e-06, + "loss": 0.6049, + "step": 79 + }, + { + "epoch": 0.07, + "grad_norm": 2.157150584766853, + "learning_rate": 4.989529764384073e-06, + "loss": 0.5965, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 2.0303527419352583, + "learning_rate": 4.989228694546151e-06, + "loss": 0.6524, + "step": 81 + }, + { + "epoch": 0.07, + "grad_norm": 2.128799919475717, + "learning_rate": 4.988923366582546e-06, + "loss": 0.5524, + "step": 82 + }, + { + "epoch": 0.07, + "grad_norm": 2.0122786280510696, + "learning_rate": 4.988613781015557e-06, + "loss": 0.6268, + "step": 83 + }, + { + "epoch": 0.07, + "grad_norm": 2.104580177719229, + "learning_rate": 4.988299938374769e-06, + "loss": 0.6229, + "step": 84 + }, + { + "epoch": 0.07, + "grad_norm": 2.3894843860356834, + "learning_rate": 4.9879818391970455e-06, + "loss": 0.6194, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 1.9615211372441477, + "learning_rate": 4.9876594840265355e-06, + "loss": 0.6355, + "step": 86 + }, + { + "epoch": 0.07, + "grad_norm": 2.4509852093141937, + "learning_rate": 4.987332873414666e-06, + "loss": 0.6405, + "step": 87 + }, + { + "epoch": 0.07, + "grad_norm": 2.178942375285086, + "learning_rate": 4.987002007920142e-06, + "loss": 0.5593, + "step": 88 + }, + { + "epoch": 0.07, + "grad_norm": 2.2625634345900445, + "learning_rate": 4.9866668881089515e-06, + "loss": 0.6133, + "step": 89 + }, + { + "epoch": 0.07, + "grad_norm": 2.363092638811143, + "learning_rate": 4.986327514554356e-06, + "loss": 0.6298, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 2.0401982492138546, + "learning_rate": 4.985983887836894e-06, + "loss": 0.6276, + "step": 91 + }, + { + "epoch": 0.08, + "grad_norm": 2.276956647922478, + "learning_rate": 4.985636008544381e-06, + "loss": 0.5691, + "step": 92 + }, + { + "epoch": 0.08, + "grad_norm": 2.1072762844110233, + "learning_rate": 4.985283877271908e-06, + "loss": 0.6175, + "step": 93 + }, + { + "epoch": 0.08, + "grad_norm": 2.2931866879442637, + "learning_rate": 4.984927494621836e-06, + "loss": 0.6419, + "step": 94 + }, + { + "epoch": 0.08, + "grad_norm": 2.112474101166308, + "learning_rate": 4.984566861203801e-06, + "loss": 0.607, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 2.1816059679212634, + "learning_rate": 4.984201977634711e-06, + "loss": 0.6136, + "step": 96 + }, + { + "epoch": 0.08, + "grad_norm": 2.0620776369966554, + "learning_rate": 4.9838328445387415e-06, + "loss": 0.6372, + "step": 97 + }, + { + "epoch": 0.08, + "grad_norm": 2.147592836641578, + "learning_rate": 4.983459462547341e-06, + "loss": 0.606, + "step": 98 + }, + { + "epoch": 0.08, + "grad_norm": 2.1808001877062453, + "learning_rate": 4.983081832299224e-06, + "loss": 0.6019, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 2.3751999527114087, + "learning_rate": 4.98269995444037e-06, + "loss": 0.6021, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 1.8769470206406913, + "learning_rate": 4.98231382962403e-06, + "loss": 0.6082, + "step": 101 + }, + { + "epoch": 0.08, + "grad_norm": 2.3060925784921347, + "learning_rate": 4.981923458510717e-06, + "loss": 0.6174, + "step": 102 + }, + { + "epoch": 0.09, + "grad_norm": 2.1543176832473683, + "learning_rate": 4.981528841768206e-06, + "loss": 0.6092, + "step": 103 + }, + { + "epoch": 0.09, + "grad_norm": 2.1558689520522547, + "learning_rate": 4.981129980071538e-06, + "loss": 0.587, + "step": 104 + }, + { + "epoch": 0.09, + "grad_norm": 2.3830532005188383, + "learning_rate": 4.980726874103014e-06, + "loss": 0.6518, + "step": 105 + }, + { + "epoch": 0.09, + "grad_norm": 2.3333119576634767, + "learning_rate": 4.980319524552195e-06, + "loss": 0.6096, + "step": 106 + }, + { + "epoch": 0.09, + "grad_norm": 2.1135146855324214, + "learning_rate": 4.9799079321159e-06, + "loss": 0.5728, + "step": 107 + }, + { + "epoch": 0.09, + "grad_norm": 2.2300463384326394, + "learning_rate": 4.9794920974982095e-06, + "loss": 0.6563, + "step": 108 + }, + { + "epoch": 0.09, + "grad_norm": 2.1745234017525443, + "learning_rate": 4.979072021410458e-06, + "loss": 0.5968, + "step": 109 + }, + { + "epoch": 0.09, + "grad_norm": 2.1536586182562334, + "learning_rate": 4.978647704571237e-06, + "loss": 0.6189, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 2.193809374687326, + "learning_rate": 4.97821914770639e-06, + "loss": 0.5864, + "step": 111 + }, + { + "epoch": 0.09, + "grad_norm": 2.0525896373682047, + "learning_rate": 4.977786351549017e-06, + "loss": 0.6101, + "step": 112 + }, + { + "epoch": 0.09, + "grad_norm": 2.216099286618384, + "learning_rate": 4.977349316839467e-06, + "loss": 0.5984, + "step": 113 + }, + { + "epoch": 0.09, + "grad_norm": 2.155122255962579, + "learning_rate": 4.97690804432534e-06, + "loss": 0.6311, + "step": 114 + }, + { + "epoch": 0.1, + "grad_norm": 2.2972101190291374, + "learning_rate": 4.976462534761487e-06, + "loss": 0.5813, + "step": 115 + }, + { + "epoch": 0.1, + "grad_norm": 1.9925413745245948, + "learning_rate": 4.9760127889100044e-06, + "loss": 0.6157, + "step": 116 + }, + { + "epoch": 0.1, + "grad_norm": 2.2802548684036568, + "learning_rate": 4.975558807540238e-06, + "loss": 0.6079, + "step": 117 + }, + { + "epoch": 0.1, + "grad_norm": 2.048888007394621, + "learning_rate": 4.9751005914287775e-06, + "loss": 0.6467, + "step": 118 + }, + { + "epoch": 0.1, + "grad_norm": 2.28661640438254, + "learning_rate": 4.974638141359456e-06, + "loss": 0.6029, + "step": 119 + }, + { + "epoch": 0.1, + "grad_norm": 2.004056683755783, + "learning_rate": 4.974171458123351e-06, + "loss": 0.6289, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 2.1628470048067667, + "learning_rate": 4.97370054251878e-06, + "loss": 0.6139, + "step": 121 + }, + { + "epoch": 0.1, + "grad_norm": 2.056119895466544, + "learning_rate": 4.9732253953513e-06, + "loss": 0.5798, + "step": 122 + }, + { + "epoch": 0.1, + "grad_norm": 2.1716513163164275, + "learning_rate": 4.972746017433709e-06, + "loss": 0.6085, + "step": 123 + }, + { + "epoch": 0.1, + "grad_norm": 2.255856676525811, + "learning_rate": 4.97226240958604e-06, + "loss": 0.6342, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 2.1049280498075373, + "learning_rate": 4.971774572635563e-06, + "loss": 0.6197, + "step": 125 + }, + { + "epoch": 0.1, + "grad_norm": 2.133349390995361, + "learning_rate": 4.97128250741678e-06, + "loss": 0.5751, + "step": 126 + }, + { + "epoch": 0.11, + "grad_norm": 2.2044887467317578, + "learning_rate": 4.97078621477143e-06, + "loss": 0.6611, + "step": 127 + }, + { + "epoch": 0.11, + "grad_norm": 2.1413863795698145, + "learning_rate": 4.970285695548481e-06, + "loss": 0.625, + "step": 128 + }, + { + "epoch": 0.11, + "grad_norm": 2.0229587336296615, + "learning_rate": 4.969780950604132e-06, + "loss": 0.5989, + "step": 129 + }, + { + "epoch": 0.11, + "grad_norm": 2.0983599595244247, + "learning_rate": 4.969271980801808e-06, + "loss": 0.5747, + "step": 130 + }, + { + "epoch": 0.11, + "grad_norm": 2.1059041140010786, + "learning_rate": 4.9687587870121645e-06, + "loss": 0.5869, + "step": 131 + }, + { + "epoch": 0.11, + "grad_norm": 1.8967441614595046, + "learning_rate": 4.9682413701130815e-06, + "loss": 0.6272, + "step": 132 + }, + { + "epoch": 0.11, + "grad_norm": 1.9976164993621088, + "learning_rate": 4.967719730989663e-06, + "loss": 0.6282, + "step": 133 + }, + { + "epoch": 0.11, + "grad_norm": 1.8719131324952145, + "learning_rate": 4.967193870534235e-06, + "loss": 0.6052, + "step": 134 + }, + { + "epoch": 0.11, + "grad_norm": 2.071702997476533, + "learning_rate": 4.9666637896463455e-06, + "loss": 0.5785, + "step": 135 + }, + { + "epoch": 0.11, + "grad_norm": 1.9549455320048341, + "learning_rate": 4.966129489232762e-06, + "loss": 0.5739, + "step": 136 + }, + { + "epoch": 0.11, + "grad_norm": 2.0656898626759315, + "learning_rate": 4.9655909702074684e-06, + "loss": 0.6651, + "step": 137 + }, + { + "epoch": 0.11, + "grad_norm": 2.1185948604203038, + "learning_rate": 4.965048233491669e-06, + "loss": 0.5759, + "step": 138 + }, + { + "epoch": 0.12, + "grad_norm": 2.08566019272993, + "learning_rate": 4.964501280013777e-06, + "loss": 0.6271, + "step": 139 + }, + { + "epoch": 0.12, + "grad_norm": 2.117420903965419, + "learning_rate": 4.963950110709425e-06, + "loss": 0.5968, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 1.9784944143818486, + "learning_rate": 4.963394726521453e-06, + "loss": 0.6112, + "step": 141 + }, + { + "epoch": 0.12, + "grad_norm": 2.077292948039572, + "learning_rate": 4.9628351283999144e-06, + "loss": 0.5636, + "step": 142 + }, + { + "epoch": 0.12, + "grad_norm": 2.223803520245629, + "learning_rate": 4.962271317302068e-06, + "loss": 0.6658, + "step": 143 + }, + { + "epoch": 0.12, + "grad_norm": 2.039369072186367, + "learning_rate": 4.9617032941923796e-06, + "loss": 0.5853, + "step": 144 + }, + { + "epoch": 0.12, + "grad_norm": 2.071470113085907, + "learning_rate": 4.961131060042522e-06, + "loss": 0.601, + "step": 145 + }, + { + "epoch": 0.12, + "grad_norm": 2.437470272347474, + "learning_rate": 4.960554615831372e-06, + "loss": 0.6593, + "step": 146 + }, + { + "epoch": 0.12, + "grad_norm": 2.178684122927139, + "learning_rate": 4.959973962545005e-06, + "loss": 0.607, + "step": 147 + }, + { + "epoch": 0.12, + "grad_norm": 2.097006749956471, + "learning_rate": 4.9593891011767e-06, + "loss": 0.5873, + "step": 148 + }, + { + "epoch": 0.12, + "grad_norm": 1.9801202541822784, + "learning_rate": 4.958800032726931e-06, + "loss": 0.5877, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 2.30001951085656, + "learning_rate": 4.958206758203373e-06, + "loss": 0.6368, + "step": 150 + }, + { + "epoch": 0.13, + "grad_norm": 1.990094260131078, + "learning_rate": 4.957609278620891e-06, + "loss": 0.59, + "step": 151 + }, + { + "epoch": 0.13, + "grad_norm": 2.262163752076628, + "learning_rate": 4.957007595001548e-06, + "loss": 0.5779, + "step": 152 + }, + { + "epoch": 0.13, + "grad_norm": 2.1970152093220983, + "learning_rate": 4.956401708374595e-06, + "loss": 0.5894, + "step": 153 + }, + { + "epoch": 0.13, + "grad_norm": 2.220825872684071, + "learning_rate": 4.9557916197764745e-06, + "loss": 0.6528, + "step": 154 + }, + { + "epoch": 0.13, + "grad_norm": 2.099472677591387, + "learning_rate": 4.955177330250817e-06, + "loss": 0.5798, + "step": 155 + }, + { + "epoch": 0.13, + "grad_norm": 2.159203936881569, + "learning_rate": 4.954558840848437e-06, + "loss": 0.6206, + "step": 156 + }, + { + "epoch": 0.13, + "grad_norm": 2.185152414039555, + "learning_rate": 4.953936152627338e-06, + "loss": 0.5624, + "step": 157 + }, + { + "epoch": 0.13, + "grad_norm": 2.0679748168992624, + "learning_rate": 4.953309266652701e-06, + "loss": 0.5859, + "step": 158 + }, + { + "epoch": 0.13, + "grad_norm": 2.327237187255128, + "learning_rate": 4.952678183996891e-06, + "loss": 0.5632, + "step": 159 + }, + { + "epoch": 0.13, + "grad_norm": 2.2865519679977417, + "learning_rate": 4.952042905739451e-06, + "loss": 0.6965, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 2.523435408018699, + "learning_rate": 4.9514034329671e-06, + "loss": 0.6217, + "step": 161 + }, + { + "epoch": 0.13, + "grad_norm": 2.4992653226709636, + "learning_rate": 4.950759766773734e-06, + "loss": 0.6175, + "step": 162 + }, + { + "epoch": 0.14, + "grad_norm": 2.432752824777114, + "learning_rate": 4.950111908260423e-06, + "loss": 0.5862, + "step": 163 + }, + { + "epoch": 0.14, + "grad_norm": 2.137500912204061, + "learning_rate": 4.949459858535404e-06, + "loss": 0.6124, + "step": 164 + }, + { + "epoch": 0.14, + "grad_norm": 2.2226376224120474, + "learning_rate": 4.94880361871409e-06, + "loss": 0.5891, + "step": 165 + }, + { + "epoch": 0.14, + "grad_norm": 2.3821839805775165, + "learning_rate": 4.9481431899190544e-06, + "loss": 0.6008, + "step": 166 + }, + { + "epoch": 0.14, + "grad_norm": 2.306242834684614, + "learning_rate": 4.947478573280044e-06, + "loss": 0.6159, + "step": 167 + }, + { + "epoch": 0.14, + "grad_norm": 2.3298092236851518, + "learning_rate": 4.946809769933963e-06, + "loss": 0.5809, + "step": 168 + }, + { + "epoch": 0.14, + "grad_norm": 2.364296499621558, + "learning_rate": 4.946136781024883e-06, + "loss": 0.5895, + "step": 169 + }, + { + "epoch": 0.14, + "grad_norm": 2.237241095609228, + "learning_rate": 4.945459607704029e-06, + "loss": 0.6144, + "step": 170 + }, + { + "epoch": 0.14, + "grad_norm": 2.4027419761972264, + "learning_rate": 4.9447782511297905e-06, + "loss": 0.5985, + "step": 171 + }, + { + "epoch": 0.14, + "grad_norm": 2.1547059182244284, + "learning_rate": 4.944092712467709e-06, + "loss": 0.5763, + "step": 172 + }, + { + "epoch": 0.14, + "grad_norm": 2.1530221667047984, + "learning_rate": 4.9434029928904805e-06, + "loss": 0.5692, + "step": 173 + }, + { + "epoch": 0.14, + "grad_norm": 2.228588593294869, + "learning_rate": 4.942709093577954e-06, + "loss": 0.5896, + "step": 174 + }, + { + "epoch": 0.15, + "grad_norm": 2.1597295307130198, + "learning_rate": 4.942011015717129e-06, + "loss": 0.5864, + "step": 175 + }, + { + "epoch": 0.15, + "grad_norm": 2.321140955498194, + "learning_rate": 4.941308760502149e-06, + "loss": 0.6089, + "step": 176 + }, + { + "epoch": 0.15, + "grad_norm": 2.220124736460707, + "learning_rate": 4.940602329134309e-06, + "loss": 0.5786, + "step": 177 + }, + { + "epoch": 0.15, + "grad_norm": 2.1698038563080417, + "learning_rate": 4.939891722822043e-06, + "loss": 0.5749, + "step": 178 + }, + { + "epoch": 0.15, + "grad_norm": 2.244425969121411, + "learning_rate": 4.93917694278093e-06, + "loss": 0.5877, + "step": 179 + }, + { + "epoch": 0.15, + "grad_norm": 2.143920008069458, + "learning_rate": 4.938457990233687e-06, + "loss": 0.6024, + "step": 180 + }, + { + "epoch": 0.15, + "grad_norm": 2.1786040820345813, + "learning_rate": 4.937734866410169e-06, + "loss": 0.5845, + "step": 181 + }, + { + "epoch": 0.15, + "grad_norm": 2.301832824481007, + "learning_rate": 4.9370075725473665e-06, + "loss": 0.6182, + "step": 182 + }, + { + "epoch": 0.15, + "grad_norm": 2.3748033727083997, + "learning_rate": 4.936276109889403e-06, + "loss": 0.6073, + "step": 183 + }, + { + "epoch": 0.15, + "grad_norm": 2.476334487382023, + "learning_rate": 4.935540479687534e-06, + "loss": 0.5793, + "step": 184 + }, + { + "epoch": 0.15, + "grad_norm": 2.2509466352322494, + "learning_rate": 4.934800683200143e-06, + "loss": 0.6133, + "step": 185 + }, + { + "epoch": 0.15, + "grad_norm": 2.8391697547684873, + "learning_rate": 4.934056721692742e-06, + "loss": 0.5967, + "step": 186 + }, + { + "epoch": 0.16, + "grad_norm": 2.4492364225391765, + "learning_rate": 4.933308596437965e-06, + "loss": 0.5676, + "step": 187 + }, + { + "epoch": 0.16, + "grad_norm": 2.685548141821295, + "learning_rate": 4.932556308715573e-06, + "loss": 0.6069, + "step": 188 + }, + { + "epoch": 0.16, + "grad_norm": 2.261217637824808, + "learning_rate": 4.931799859812443e-06, + "loss": 0.6411, + "step": 189 + }, + { + "epoch": 0.16, + "grad_norm": 2.3838284395200966, + "learning_rate": 4.931039251022573e-06, + "loss": 0.5745, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 2.2550921344466164, + "learning_rate": 4.930274483647074e-06, + "loss": 0.5989, + "step": 191 + }, + { + "epoch": 0.16, + "grad_norm": 2.078406234527636, + "learning_rate": 4.929505558994175e-06, + "loss": 0.5998, + "step": 192 + }, + { + "epoch": 0.16, + "grad_norm": 2.592864566091496, + "learning_rate": 4.928732478379214e-06, + "loss": 0.5842, + "step": 193 + }, + { + "epoch": 0.16, + "grad_norm": 2.092752299259724, + "learning_rate": 4.927955243124638e-06, + "loss": 0.5789, + "step": 194 + }, + { + "epoch": 0.16, + "grad_norm": 2.3799311595696966, + "learning_rate": 4.927173854560002e-06, + "loss": 0.6265, + "step": 195 + }, + { + "epoch": 0.16, + "grad_norm": 2.246876688010602, + "learning_rate": 4.926388314021964e-06, + "loss": 0.6126, + "step": 196 + }, + { + "epoch": 0.16, + "grad_norm": 2.1409898276704578, + "learning_rate": 4.925598622854287e-06, + "loss": 0.6073, + "step": 197 + }, + { + "epoch": 0.16, + "grad_norm": 2.5946158421875385, + "learning_rate": 4.924804782407834e-06, + "loss": 0.6154, + "step": 198 + }, + { + "epoch": 0.16, + "grad_norm": 2.1225494320427982, + "learning_rate": 4.924006794040562e-06, + "loss": 0.583, + "step": 199 + }, + { + "epoch": 0.17, + "grad_norm": 2.1971323526291338, + "learning_rate": 4.923204659117528e-06, + "loss": 0.6078, + "step": 200 + }, + { + "epoch": 0.17, + "grad_norm": 2.289185506404785, + "learning_rate": 4.92239837901088e-06, + "loss": 0.6127, + "step": 201 + }, + { + "epoch": 0.17, + "grad_norm": 2.0071007751625354, + "learning_rate": 4.921587955099858e-06, + "loss": 0.5804, + "step": 202 + }, + { + "epoch": 0.17, + "grad_norm": 2.2981840149068247, + "learning_rate": 4.920773388770789e-06, + "loss": 0.6027, + "step": 203 + }, + { + "epoch": 0.17, + "grad_norm": 2.236179116886702, + "learning_rate": 4.919954681417087e-06, + "loss": 0.6179, + "step": 204 + }, + { + "epoch": 0.17, + "grad_norm": 2.007422589251611, + "learning_rate": 4.91913183443925e-06, + "loss": 0.5647, + "step": 205 + }, + { + "epoch": 0.17, + "grad_norm": 2.1402813555735483, + "learning_rate": 4.918304849244857e-06, + "loss": 0.5841, + "step": 206 + }, + { + "epoch": 0.17, + "grad_norm": 2.0456415785177104, + "learning_rate": 4.917473727248565e-06, + "loss": 0.5524, + "step": 207 + }, + { + "epoch": 0.17, + "grad_norm": 1.9673558126020942, + "learning_rate": 4.916638469872109e-06, + "loss": 0.5698, + "step": 208 + }, + { + "epoch": 0.17, + "grad_norm": 2.015111672496819, + "learning_rate": 4.9157990785442964e-06, + "loss": 0.5957, + "step": 209 + }, + { + "epoch": 0.17, + "grad_norm": 1.9502065547578398, + "learning_rate": 4.9149555547010086e-06, + "loss": 0.5592, + "step": 210 + }, + { + "epoch": 0.17, + "grad_norm": 2.167936522558899, + "learning_rate": 4.9141078997851945e-06, + "loss": 0.5705, + "step": 211 + }, + { + "epoch": 0.18, + "grad_norm": 2.2066587458997935, + "learning_rate": 4.91325611524687e-06, + "loss": 0.5526, + "step": 212 + }, + { + "epoch": 0.18, + "grad_norm": 1.9132995625903553, + "learning_rate": 4.9124002025431136e-06, + "loss": 0.5767, + "step": 213 + }, + { + "epoch": 0.18, + "grad_norm": 2.0097281107801277, + "learning_rate": 4.91154016313807e-06, + "loss": 0.6185, + "step": 214 + }, + { + "epoch": 0.18, + "grad_norm": 2.023532008241332, + "learning_rate": 4.910675998502938e-06, + "loss": 0.6005, + "step": 215 + }, + { + "epoch": 0.18, + "grad_norm": 1.9253831001776973, + "learning_rate": 4.909807710115977e-06, + "loss": 0.5769, + "step": 216 + }, + { + "epoch": 0.18, + "grad_norm": 2.066862408842564, + "learning_rate": 4.908935299462497e-06, + "loss": 0.5671, + "step": 217 + }, + { + "epoch": 0.18, + "grad_norm": 1.9412704290792853, + "learning_rate": 4.908058768034862e-06, + "loss": 0.5568, + "step": 218 + }, + { + "epoch": 0.18, + "grad_norm": 2.185994457097553, + "learning_rate": 4.907178117332487e-06, + "loss": 0.5621, + "step": 219 + }, + { + "epoch": 0.18, + "grad_norm": 2.021517127546353, + "learning_rate": 4.906293348861829e-06, + "loss": 0.5672, + "step": 220 + }, + { + "epoch": 0.18, + "grad_norm": 2.099703967072734, + "learning_rate": 4.905404464136391e-06, + "loss": 0.5366, + "step": 221 + }, + { + "epoch": 0.18, + "grad_norm": 2.030197056583618, + "learning_rate": 4.904511464676718e-06, + "loss": 0.6064, + "step": 222 + }, + { + "epoch": 0.18, + "grad_norm": 2.4170102988954896, + "learning_rate": 4.903614352010393e-06, + "loss": 0.5919, + "step": 223 + }, + { + "epoch": 0.19, + "grad_norm": 2.0819468873015476, + "learning_rate": 4.9027131276720355e-06, + "loss": 0.5366, + "step": 224 + }, + { + "epoch": 0.19, + "grad_norm": 2.148008018153629, + "learning_rate": 4.901807793203299e-06, + "loss": 0.597, + "step": 225 + }, + { + "epoch": 0.19, + "grad_norm": 2.0303725862017186, + "learning_rate": 4.900898350152866e-06, + "loss": 0.6394, + "step": 226 + }, + { + "epoch": 0.19, + "grad_norm": 2.1598989214704334, + "learning_rate": 4.899984800076449e-06, + "loss": 0.5932, + "step": 227 + }, + { + "epoch": 0.19, + "grad_norm": 2.0816312637185255, + "learning_rate": 4.899067144536786e-06, + "loss": 0.5909, + "step": 228 + }, + { + "epoch": 0.19, + "grad_norm": 1.9024067197329315, + "learning_rate": 4.8981453851036365e-06, + "loss": 0.5463, + "step": 229 + }, + { + "epoch": 0.19, + "grad_norm": 2.1830926868871043, + "learning_rate": 4.897219523353781e-06, + "loss": 0.5821, + "step": 230 + }, + { + "epoch": 0.19, + "grad_norm": 2.1156269612794016, + "learning_rate": 4.8962895608710195e-06, + "loss": 0.5993, + "step": 231 + }, + { + "epoch": 0.19, + "grad_norm": 1.9653407654210864, + "learning_rate": 4.895355499246162e-06, + "loss": 0.5525, + "step": 232 + }, + { + "epoch": 0.19, + "grad_norm": 2.367769051061897, + "learning_rate": 4.894417340077036e-06, + "loss": 0.5683, + "step": 233 + }, + { + "epoch": 0.19, + "grad_norm": 2.078327064466567, + "learning_rate": 4.893475084968474e-06, + "loss": 0.6184, + "step": 234 + }, + { + "epoch": 0.19, + "grad_norm": 2.1661882731589475, + "learning_rate": 4.8925287355323195e-06, + "loss": 0.6321, + "step": 235 + }, + { + "epoch": 0.2, + "grad_norm": 2.182760952002799, + "learning_rate": 4.891578293387413e-06, + "loss": 0.6254, + "step": 236 + }, + { + "epoch": 0.2, + "grad_norm": 1.998723579962691, + "learning_rate": 4.890623760159605e-06, + "loss": 0.5371, + "step": 237 + }, + { + "epoch": 0.2, + "grad_norm": 2.319922346931926, + "learning_rate": 4.8896651374817365e-06, + "loss": 0.5941, + "step": 238 + }, + { + "epoch": 0.2, + "grad_norm": 2.090735197217999, + "learning_rate": 4.888702426993648e-06, + "loss": 0.577, + "step": 239 + }, + { + "epoch": 0.2, + "grad_norm": 2.1247199987228558, + "learning_rate": 4.887735630342173e-06, + "loss": 0.5928, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 2.33151114429804, + "learning_rate": 4.8867647491811315e-06, + "loss": 0.5838, + "step": 241 + }, + { + "epoch": 0.2, + "grad_norm": 2.1570026356289147, + "learning_rate": 4.885789785171334e-06, + "loss": 0.5642, + "step": 242 + }, + { + "epoch": 0.2, + "grad_norm": 2.049571197047368, + "learning_rate": 4.884810739980575e-06, + "loss": 0.6684, + "step": 243 + }, + { + "epoch": 0.2, + "grad_norm": 1.9810062424466381, + "learning_rate": 4.883827615283626e-06, + "loss": 0.5942, + "step": 244 + }, + { + "epoch": 0.2, + "grad_norm": 2.145869663660159, + "learning_rate": 4.882840412762244e-06, + "loss": 0.6356, + "step": 245 + }, + { + "epoch": 0.2, + "grad_norm": 2.19290302186514, + "learning_rate": 4.881849134105156e-06, + "loss": 0.6189, + "step": 246 + }, + { + "epoch": 0.2, + "grad_norm": 2.0561043419872984, + "learning_rate": 4.880853781008062e-06, + "loss": 0.5563, + "step": 247 + }, + { + "epoch": 0.21, + "grad_norm": 1.8831183793224635, + "learning_rate": 4.879854355173638e-06, + "loss": 0.5522, + "step": 248 + }, + { + "epoch": 0.21, + "grad_norm": 2.020981606684741, + "learning_rate": 4.878850858311518e-06, + "loss": 0.5548, + "step": 249 + }, + { + "epoch": 0.21, + "grad_norm": 2.060242570493272, + "learning_rate": 4.877843292138307e-06, + "loss": 0.5715, + "step": 250 + }, + { + "epoch": 0.21, + "grad_norm": 2.082455778933014, + "learning_rate": 4.8768316583775665e-06, + "loss": 0.5959, + "step": 251 + }, + { + "epoch": 0.21, + "grad_norm": 1.9830929719438626, + "learning_rate": 4.875815958759819e-06, + "loss": 0.5813, + "step": 252 + }, + { + "epoch": 0.21, + "grad_norm": 1.9772267506828567, + "learning_rate": 4.8747961950225406e-06, + "loss": 0.539, + "step": 253 + }, + { + "epoch": 0.21, + "grad_norm": 2.1492561995002104, + "learning_rate": 4.873772368910161e-06, + "loss": 0.6059, + "step": 254 + }, + { + "epoch": 0.21, + "grad_norm": 2.253757247139787, + "learning_rate": 4.872744482174058e-06, + "loss": 0.5897, + "step": 255 + }, + { + "epoch": 0.21, + "grad_norm": 2.3282624851882496, + "learning_rate": 4.8717125365725545e-06, + "loss": 0.5675, + "step": 256 + }, + { + "epoch": 0.21, + "grad_norm": 2.15573581133063, + "learning_rate": 4.8706765338709185e-06, + "loss": 0.5958, + "step": 257 + }, + { + "epoch": 0.21, + "grad_norm": 2.073289220218241, + "learning_rate": 4.869636475841358e-06, + "loss": 0.6052, + "step": 258 + }, + { + "epoch": 0.21, + "grad_norm": 2.293714090249444, + "learning_rate": 4.8685923642630165e-06, + "loss": 0.5786, + "step": 259 + }, + { + "epoch": 0.22, + "grad_norm": 1.9496544276539172, + "learning_rate": 4.867544200921974e-06, + "loss": 0.6163, + "step": 260 + }, + { + "epoch": 0.22, + "grad_norm": 2.5267016753690132, + "learning_rate": 4.866491987611239e-06, + "loss": 0.6223, + "step": 261 + }, + { + "epoch": 0.22, + "grad_norm": 1.8731249445320794, + "learning_rate": 4.865435726130751e-06, + "loss": 0.5632, + "step": 262 + }, + { + "epoch": 0.22, + "grad_norm": 2.3586331105798863, + "learning_rate": 4.86437541828737e-06, + "loss": 0.5769, + "step": 263 + }, + { + "epoch": 0.22, + "grad_norm": 2.0258106914510585, + "learning_rate": 4.863311065894883e-06, + "loss": 0.6103, + "step": 264 + }, + { + "epoch": 0.22, + "grad_norm": 2.2543614390885955, + "learning_rate": 4.862242670773991e-06, + "loss": 0.5844, + "step": 265 + }, + { + "epoch": 0.22, + "grad_norm": 1.9440299381244668, + "learning_rate": 4.861170234752314e-06, + "loss": 0.5559, + "step": 266 + }, + { + "epoch": 0.22, + "grad_norm": 2.254538268495492, + "learning_rate": 4.8600937596643815e-06, + "loss": 0.5709, + "step": 267 + }, + { + "epoch": 0.22, + "grad_norm": 2.007651746385687, + "learning_rate": 4.8590132473516346e-06, + "loss": 0.573, + "step": 268 + }, + { + "epoch": 0.22, + "grad_norm": 2.0735253118288837, + "learning_rate": 4.857928699662421e-06, + "loss": 0.5954, + "step": 269 + }, + { + "epoch": 0.22, + "grad_norm": 2.024775417101569, + "learning_rate": 4.856840118451989e-06, + "loss": 0.5992, + "step": 270 + }, + { + "epoch": 0.22, + "grad_norm": 2.1043310699945814, + "learning_rate": 4.855747505582488e-06, + "loss": 0.6507, + "step": 271 + }, + { + "epoch": 0.23, + "grad_norm": 2.0386353328313214, + "learning_rate": 4.854650862922965e-06, + "loss": 0.5666, + "step": 272 + }, + { + "epoch": 0.23, + "grad_norm": 1.978698841367705, + "learning_rate": 4.853550192349358e-06, + "loss": 0.5593, + "step": 273 + }, + { + "epoch": 0.23, + "grad_norm": 1.9386534247633986, + "learning_rate": 4.852445495744497e-06, + "loss": 0.5735, + "step": 274 + }, + { + "epoch": 0.23, + "grad_norm": 2.049346245018599, + "learning_rate": 4.8513367749981e-06, + "loss": 0.5415, + "step": 275 + }, + { + "epoch": 0.23, + "grad_norm": 2.1051969521216605, + "learning_rate": 4.850224032006765e-06, + "loss": 0.5532, + "step": 276 + }, + { + "epoch": 0.23, + "grad_norm": 2.2006792558872315, + "learning_rate": 4.849107268673975e-06, + "loss": 0.5696, + "step": 277 + }, + { + "epoch": 0.23, + "grad_norm": 2.0460787736353647, + "learning_rate": 4.847986486910088e-06, + "loss": 0.5658, + "step": 278 + }, + { + "epoch": 0.23, + "grad_norm": 2.1161843259225406, + "learning_rate": 4.846861688632336e-06, + "loss": 0.583, + "step": 279 + }, + { + "epoch": 0.23, + "grad_norm": 1.8882198480393542, + "learning_rate": 4.8457328757648224e-06, + "loss": 0.5693, + "step": 280 + }, + { + "epoch": 0.23, + "grad_norm": 2.1578413701109596, + "learning_rate": 4.844600050238517e-06, + "loss": 0.5409, + "step": 281 + }, + { + "epoch": 0.23, + "grad_norm": 2.03912467778954, + "learning_rate": 4.843463213991255e-06, + "loss": 0.5908, + "step": 282 + }, + { + "epoch": 0.23, + "grad_norm": 2.2333462480826247, + "learning_rate": 4.842322368967731e-06, + "loss": 0.6088, + "step": 283 + }, + { + "epoch": 0.24, + "grad_norm": 2.06698702157327, + "learning_rate": 4.8411775171194986e-06, + "loss": 0.5953, + "step": 284 + }, + { + "epoch": 0.24, + "grad_norm": 2.1433923121572045, + "learning_rate": 4.840028660404964e-06, + "loss": 0.5851, + "step": 285 + }, + { + "epoch": 0.24, + "grad_norm": 2.214858780835041, + "learning_rate": 4.838875800789386e-06, + "loss": 0.5913, + "step": 286 + }, + { + "epoch": 0.24, + "grad_norm": 2.038128612492624, + "learning_rate": 4.837718940244871e-06, + "loss": 0.5827, + "step": 287 + }, + { + "epoch": 0.24, + "grad_norm": 1.9894065096959768, + "learning_rate": 4.836558080750365e-06, + "loss": 0.5769, + "step": 288 + }, + { + "epoch": 0.24, + "grad_norm": 2.1711590153285822, + "learning_rate": 4.835393224291662e-06, + "loss": 0.654, + "step": 289 + }, + { + "epoch": 0.24, + "grad_norm": 2.105004451988696, + "learning_rate": 4.834224372861386e-06, + "loss": 0.6158, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 1.9554568023729102, + "learning_rate": 4.833051528459001e-06, + "loss": 0.5807, + "step": 291 + }, + { + "epoch": 0.24, + "grad_norm": 2.2693917834500312, + "learning_rate": 4.831874693090797e-06, + "loss": 0.5557, + "step": 292 + }, + { + "epoch": 0.24, + "grad_norm": 1.9081391627126192, + "learning_rate": 4.830693868769892e-06, + "loss": 0.6057, + "step": 293 + }, + { + "epoch": 0.24, + "grad_norm": 2.2133664110768585, + "learning_rate": 4.82950905751623e-06, + "loss": 0.6103, + "step": 294 + }, + { + "epoch": 0.24, + "grad_norm": 2.015392814211589, + "learning_rate": 4.8283202613565735e-06, + "loss": 0.5578, + "step": 295 + }, + { + "epoch": 0.25, + "grad_norm": 2.142124020349717, + "learning_rate": 4.8271274823245e-06, + "loss": 0.5675, + "step": 296 + }, + { + "epoch": 0.25, + "grad_norm": 1.981611826462286, + "learning_rate": 4.825930722460405e-06, + "loss": 0.5696, + "step": 297 + }, + { + "epoch": 0.25, + "grad_norm": 1.966759748348117, + "learning_rate": 4.824729983811486e-06, + "loss": 0.58, + "step": 298 + }, + { + "epoch": 0.25, + "grad_norm": 2.0117040369769397, + "learning_rate": 4.823525268431754e-06, + "loss": 0.6005, + "step": 299 + }, + { + "epoch": 0.25, + "grad_norm": 1.9579664917991193, + "learning_rate": 4.822316578382019e-06, + "loss": 0.5472, + "step": 300 + }, + { + "epoch": 0.25, + "grad_norm": 1.9075723479635032, + "learning_rate": 4.821103915729892e-06, + "loss": 0.5834, + "step": 301 + }, + { + "epoch": 0.25, + "grad_norm": 2.289340229011896, + "learning_rate": 4.819887282549777e-06, + "loss": 0.6088, + "step": 302 + }, + { + "epoch": 0.25, + "grad_norm": 2.0410700553735235, + "learning_rate": 4.818666680922874e-06, + "loss": 0.5449, + "step": 303 + }, + { + "epoch": 0.25, + "grad_norm": 2.074434792511819, + "learning_rate": 4.8174421129371675e-06, + "loss": 0.5826, + "step": 304 + }, + { + "epoch": 0.25, + "grad_norm": 2.1377170527698865, + "learning_rate": 4.816213580687428e-06, + "loss": 0.6262, + "step": 305 + }, + { + "epoch": 0.25, + "grad_norm": 2.060340839248083, + "learning_rate": 4.814981086275209e-06, + "loss": 0.5479, + "step": 306 + }, + { + "epoch": 0.25, + "grad_norm": 2.007036467413588, + "learning_rate": 4.813744631808841e-06, + "loss": 0.5642, + "step": 307 + }, + { + "epoch": 0.26, + "grad_norm": 2.016779606220332, + "learning_rate": 4.8125042194034285e-06, + "loss": 0.5503, + "step": 308 + }, + { + "epoch": 0.26, + "grad_norm": 1.930004252757651, + "learning_rate": 4.811259851180845e-06, + "loss": 0.582, + "step": 309 + }, + { + "epoch": 0.26, + "grad_norm": 1.9179477992752856, + "learning_rate": 4.810011529269734e-06, + "loss": 0.5678, + "step": 310 + }, + { + "epoch": 0.26, + "grad_norm": 2.023430757276848, + "learning_rate": 4.808759255805498e-06, + "loss": 0.614, + "step": 311 + }, + { + "epoch": 0.26, + "grad_norm": 1.8334738409404936, + "learning_rate": 4.807503032930306e-06, + "loss": 0.5742, + "step": 312 + }, + { + "epoch": 0.26, + "grad_norm": 1.937332706274502, + "learning_rate": 4.806242862793075e-06, + "loss": 0.6257, + "step": 313 + }, + { + "epoch": 0.26, + "grad_norm": 2.0265383045700363, + "learning_rate": 4.8049787475494786e-06, + "loss": 0.5733, + "step": 314 + }, + { + "epoch": 0.26, + "grad_norm": 2.056444039073761, + "learning_rate": 4.803710689361939e-06, + "loss": 0.578, + "step": 315 + }, + { + "epoch": 0.26, + "grad_norm": 2.411132719183335, + "learning_rate": 4.802438690399622e-06, + "loss": 0.5778, + "step": 316 + }, + { + "epoch": 0.26, + "grad_norm": 2.0233969242222853, + "learning_rate": 4.801162752838436e-06, + "loss": 0.5649, + "step": 317 + }, + { + "epoch": 0.26, + "grad_norm": 2.2809121915132815, + "learning_rate": 4.799882878861025e-06, + "loss": 0.5589, + "step": 318 + }, + { + "epoch": 0.26, + "grad_norm": 1.9806834041020271, + "learning_rate": 4.798599070656768e-06, + "loss": 0.5753, + "step": 319 + }, + { + "epoch": 0.27, + "grad_norm": 2.095099671577702, + "learning_rate": 4.797311330421773e-06, + "loss": 0.5644, + "step": 320 + }, + { + "epoch": 0.27, + "grad_norm": 2.1697606190375764, + "learning_rate": 4.796019660358877e-06, + "loss": 0.6009, + "step": 321 + }, + { + "epoch": 0.27, + "grad_norm": 1.9549416103216173, + "learning_rate": 4.794724062677635e-06, + "loss": 0.5429, + "step": 322 + }, + { + "epoch": 0.27, + "grad_norm": 1.9986949357292838, + "learning_rate": 4.793424539594323e-06, + "loss": 0.5456, + "step": 323 + }, + { + "epoch": 0.27, + "grad_norm": 1.9414831957796765, + "learning_rate": 4.792121093331935e-06, + "loss": 0.5468, + "step": 324 + }, + { + "epoch": 0.27, + "grad_norm": 2.100702188933012, + "learning_rate": 4.7908137261201685e-06, + "loss": 0.5763, + "step": 325 + }, + { + "epoch": 0.27, + "grad_norm": 2.2747471285831025, + "learning_rate": 4.789502440195436e-06, + "loss": 0.5637, + "step": 326 + }, + { + "epoch": 0.27, + "grad_norm": 1.8996382919319124, + "learning_rate": 4.788187237800849e-06, + "loss": 0.5285, + "step": 327 + }, + { + "epoch": 0.27, + "grad_norm": 2.3451495174978847, + "learning_rate": 4.786868121186218e-06, + "loss": 0.5638, + "step": 328 + }, + { + "epoch": 0.27, + "grad_norm": 2.0437536068229565, + "learning_rate": 4.7855450926080535e-06, + "loss": 0.5282, + "step": 329 + }, + { + "epoch": 0.27, + "grad_norm": 2.1185488514745554, + "learning_rate": 4.784218154329555e-06, + "loss": 0.5689, + "step": 330 + }, + { + "epoch": 0.27, + "grad_norm": 2.08745956731504, + "learning_rate": 4.78288730862061e-06, + "loss": 0.5772, + "step": 331 + }, + { + "epoch": 0.28, + "grad_norm": 1.9479507156354359, + "learning_rate": 4.781552557757789e-06, + "loss": 0.5419, + "step": 332 + }, + { + "epoch": 0.28, + "grad_norm": 2.0211480847937255, + "learning_rate": 4.780213904024346e-06, + "loss": 0.5757, + "step": 333 + }, + { + "epoch": 0.28, + "grad_norm": 1.9075335749936069, + "learning_rate": 4.7788713497102094e-06, + "loss": 0.5693, + "step": 334 + }, + { + "epoch": 0.28, + "grad_norm": 1.9590727137410602, + "learning_rate": 4.777524897111979e-06, + "loss": 0.5501, + "step": 335 + }, + { + "epoch": 0.28, + "grad_norm": 2.0328480247612752, + "learning_rate": 4.776174548532926e-06, + "loss": 0.587, + "step": 336 + }, + { + "epoch": 0.28, + "grad_norm": 2.062540517496736, + "learning_rate": 4.774820306282982e-06, + "loss": 0.5819, + "step": 337 + }, + { + "epoch": 0.28, + "grad_norm": 2.0054452800156195, + "learning_rate": 4.773462172678744e-06, + "loss": 0.5529, + "step": 338 + }, + { + "epoch": 0.28, + "grad_norm": 1.9641125644599562, + "learning_rate": 4.772100150043462e-06, + "loss": 0.5895, + "step": 339 + }, + { + "epoch": 0.28, + "grad_norm": 1.9196744569285298, + "learning_rate": 4.77073424070704e-06, + "loss": 0.5504, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 2.0002752186146484, + "learning_rate": 4.76936444700603e-06, + "loss": 0.5307, + "step": 341 + }, + { + "epoch": 0.28, + "grad_norm": 2.1068919823054344, + "learning_rate": 4.76799077128363e-06, + "loss": 0.5908, + "step": 342 + }, + { + "epoch": 0.28, + "grad_norm": 1.919597745459612, + "learning_rate": 4.766613215889678e-06, + "loss": 0.5423, + "step": 343 + }, + { + "epoch": 0.29, + "grad_norm": 2.0670928578728716, + "learning_rate": 4.765231783180648e-06, + "loss": 0.5901, + "step": 344 + }, + { + "epoch": 0.29, + "grad_norm": 1.906116148793229, + "learning_rate": 4.763846475519648e-06, + "loss": 0.5919, + "step": 345 + }, + { + "epoch": 0.29, + "grad_norm": 1.9133575268702454, + "learning_rate": 4.762457295276413e-06, + "loss": 0.585, + "step": 346 + }, + { + "epoch": 0.29, + "grad_norm": 2.133902651855379, + "learning_rate": 4.7610642448273025e-06, + "loss": 0.5444, + "step": 347 + }, + { + "epoch": 0.29, + "grad_norm": 1.95222194640397, + "learning_rate": 4.7596673265552985e-06, + "loss": 0.5941, + "step": 348 + }, + { + "epoch": 0.29, + "grad_norm": 2.095010268380277, + "learning_rate": 4.758266542849997e-06, + "loss": 0.6045, + "step": 349 + }, + { + "epoch": 0.29, + "grad_norm": 2.0493864712059655, + "learning_rate": 4.756861896107609e-06, + "loss": 0.6011, + "step": 350 + }, + { + "epoch": 0.29, + "grad_norm": 1.9222198823064967, + "learning_rate": 4.755453388730949e-06, + "loss": 0.5521, + "step": 351 + }, + { + "epoch": 0.29, + "grad_norm": 2.368147154955994, + "learning_rate": 4.754041023129442e-06, + "loss": 0.6117, + "step": 352 + }, + { + "epoch": 0.29, + "grad_norm": 1.9734596786106697, + "learning_rate": 4.752624801719108e-06, + "loss": 0.5727, + "step": 353 + }, + { + "epoch": 0.29, + "grad_norm": 2.151510566977991, + "learning_rate": 4.751204726922564e-06, + "loss": 0.6085, + "step": 354 + }, + { + "epoch": 0.29, + "grad_norm": 1.9291219072892685, + "learning_rate": 4.74978080116902e-06, + "loss": 0.5655, + "step": 355 + }, + { + "epoch": 0.3, + "grad_norm": 1.838592559018919, + "learning_rate": 4.748353026894273e-06, + "loss": 0.5508, + "step": 356 + }, + { + "epoch": 0.3, + "grad_norm": 2.069156589116884, + "learning_rate": 4.7469214065407e-06, + "loss": 0.5942, + "step": 357 + }, + { + "epoch": 0.3, + "grad_norm": 1.8960817746615841, + "learning_rate": 4.745485942557264e-06, + "loss": 0.5902, + "step": 358 + }, + { + "epoch": 0.3, + "grad_norm": 2.0606557307859634, + "learning_rate": 4.744046637399497e-06, + "loss": 0.556, + "step": 359 + }, + { + "epoch": 0.3, + "grad_norm": 1.9660065879130573, + "learning_rate": 4.742603493529505e-06, + "loss": 0.5364, + "step": 360 + }, + { + "epoch": 0.3, + "grad_norm": 1.9647921383638112, + "learning_rate": 4.741156513415958e-06, + "loss": 0.5601, + "step": 361 + }, + { + "epoch": 0.3, + "grad_norm": 2.049074688423064, + "learning_rate": 4.739705699534092e-06, + "loss": 0.556, + "step": 362 + }, + { + "epoch": 0.3, + "grad_norm": 1.962593945802751, + "learning_rate": 4.738251054365697e-06, + "loss": 0.5609, + "step": 363 + }, + { + "epoch": 0.3, + "grad_norm": 2.059675349950347, + "learning_rate": 4.736792580399119e-06, + "loss": 0.5499, + "step": 364 + }, + { + "epoch": 0.3, + "grad_norm": 1.8479566025134508, + "learning_rate": 4.7353302801292555e-06, + "loss": 0.5621, + "step": 365 + }, + { + "epoch": 0.3, + "grad_norm": 1.9405450724813613, + "learning_rate": 4.733864156057545e-06, + "loss": 0.5437, + "step": 366 + }, + { + "epoch": 0.3, + "grad_norm": 2.122487864033456, + "learning_rate": 4.7323942106919715e-06, + "loss": 0.5984, + "step": 367 + }, + { + "epoch": 0.31, + "grad_norm": 2.6822841144123046, + "learning_rate": 4.730920446547052e-06, + "loss": 0.5951, + "step": 368 + }, + { + "epoch": 0.31, + "grad_norm": 2.001405394086718, + "learning_rate": 4.729442866143838e-06, + "loss": 0.5552, + "step": 369 + }, + { + "epoch": 0.31, + "grad_norm": 2.081154186949651, + "learning_rate": 4.72796147200991e-06, + "loss": 0.587, + "step": 370 + }, + { + "epoch": 0.31, + "grad_norm": 2.1196544292473236, + "learning_rate": 4.72647626667937e-06, + "loss": 0.5882, + "step": 371 + }, + { + "epoch": 0.31, + "grad_norm": 2.107445583509131, + "learning_rate": 4.724987252692841e-06, + "loss": 0.5389, + "step": 372 + }, + { + "epoch": 0.31, + "grad_norm": 1.9529785007256542, + "learning_rate": 4.723494432597462e-06, + "loss": 0.6439, + "step": 373 + }, + { + "epoch": 0.31, + "grad_norm": 2.11513441515607, + "learning_rate": 4.72199780894688e-06, + "loss": 0.6089, + "step": 374 + }, + { + "epoch": 0.31, + "grad_norm": 1.9769899713721226, + "learning_rate": 4.7204973843012504e-06, + "loss": 0.5393, + "step": 375 + }, + { + "epoch": 0.31, + "grad_norm": 2.063749623036316, + "learning_rate": 4.718993161227231e-06, + "loss": 0.5987, + "step": 376 + }, + { + "epoch": 0.31, + "grad_norm": 2.0515862288253883, + "learning_rate": 4.717485142297977e-06, + "loss": 0.5772, + "step": 377 + }, + { + "epoch": 0.31, + "grad_norm": 1.8962297741946081, + "learning_rate": 4.715973330093135e-06, + "loss": 0.5424, + "step": 378 + }, + { + "epoch": 0.31, + "grad_norm": 2.2210958340400087, + "learning_rate": 4.7144577271988435e-06, + "loss": 0.6072, + "step": 379 + }, + { + "epoch": 0.32, + "grad_norm": 2.067113337475314, + "learning_rate": 4.712938336207724e-06, + "loss": 0.5482, + "step": 380 + }, + { + "epoch": 0.32, + "grad_norm": 1.8985489253954526, + "learning_rate": 4.711415159718876e-06, + "loss": 0.5593, + "step": 381 + }, + { + "epoch": 0.32, + "grad_norm": 2.085236381118245, + "learning_rate": 4.709888200337879e-06, + "loss": 0.5704, + "step": 382 + }, + { + "epoch": 0.32, + "grad_norm": 2.0967664183909784, + "learning_rate": 4.708357460676779e-06, + "loss": 0.5997, + "step": 383 + }, + { + "epoch": 0.32, + "grad_norm": 2.0454278026009645, + "learning_rate": 4.706822943354092e-06, + "loss": 0.5669, + "step": 384 + }, + { + "epoch": 0.32, + "grad_norm": 1.9171673309342674, + "learning_rate": 4.705284650994793e-06, + "loss": 0.517, + "step": 385 + }, + { + "epoch": 0.32, + "grad_norm": 2.2003223432761287, + "learning_rate": 4.70374258623032e-06, + "loss": 0.5957, + "step": 386 + }, + { + "epoch": 0.32, + "grad_norm": 1.936392519491186, + "learning_rate": 4.702196751698557e-06, + "loss": 0.5767, + "step": 387 + }, + { + "epoch": 0.32, + "grad_norm": 2.354272003403086, + "learning_rate": 4.700647150043841e-06, + "loss": 0.6515, + "step": 388 + }, + { + "epoch": 0.32, + "grad_norm": 1.9115059027323418, + "learning_rate": 4.699093783916955e-06, + "loss": 0.5579, + "step": 389 + }, + { + "epoch": 0.32, + "grad_norm": 1.9878827587010002, + "learning_rate": 4.697536655975115e-06, + "loss": 0.572, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 1.9729552535473858, + "learning_rate": 4.69597576888198e-06, + "loss": 0.5665, + "step": 391 + }, + { + "epoch": 0.32, + "grad_norm": 2.177634366499155, + "learning_rate": 4.694411125307632e-06, + "loss": 0.6363, + "step": 392 + }, + { + "epoch": 0.33, + "grad_norm": 1.8955146664976508, + "learning_rate": 4.692842727928584e-06, + "loss": 0.5682, + "step": 393 + }, + { + "epoch": 0.33, + "grad_norm": 2.175305874476245, + "learning_rate": 4.691270579427769e-06, + "loss": 0.5943, + "step": 394 + }, + { + "epoch": 0.33, + "grad_norm": 2.068140527232831, + "learning_rate": 4.689694682494537e-06, + "loss": 0.5659, + "step": 395 + }, + { + "epoch": 0.33, + "grad_norm": 1.9112960694448755, + "learning_rate": 4.688115039824648e-06, + "loss": 0.6048, + "step": 396 + }, + { + "epoch": 0.33, + "grad_norm": 1.9778305624626604, + "learning_rate": 4.686531654120272e-06, + "loss": 0.5695, + "step": 397 + }, + { + "epoch": 0.33, + "grad_norm": 2.096904163204813, + "learning_rate": 4.684944528089981e-06, + "loss": 0.6113, + "step": 398 + }, + { + "epoch": 0.33, + "grad_norm": 2.0011934144948516, + "learning_rate": 4.683353664448745e-06, + "loss": 0.5568, + "step": 399 + }, + { + "epoch": 0.33, + "grad_norm": 1.8562851971757464, + "learning_rate": 4.681759065917929e-06, + "loss": 0.5474, + "step": 400 + }, + { + "epoch": 0.33, + "grad_norm": 1.8190547574166316, + "learning_rate": 4.680160735225285e-06, + "loss": 0.5315, + "step": 401 + }, + { + "epoch": 0.33, + "grad_norm": 1.9247862956929132, + "learning_rate": 4.6785586751049505e-06, + "loss": 0.5568, + "step": 402 + }, + { + "epoch": 0.33, + "grad_norm": 1.8469793674077621, + "learning_rate": 4.676952888297442e-06, + "loss": 0.5811, + "step": 403 + }, + { + "epoch": 0.33, + "grad_norm": 1.946943145198674, + "learning_rate": 4.675343377549653e-06, + "loss": 0.5475, + "step": 404 + }, + { + "epoch": 0.34, + "grad_norm": 1.991304422730463, + "learning_rate": 4.6737301456148445e-06, + "loss": 0.5856, + "step": 405 + }, + { + "epoch": 0.34, + "grad_norm": 1.9168241989446437, + "learning_rate": 4.672113195252644e-06, + "loss": 0.6069, + "step": 406 + }, + { + "epoch": 0.34, + "grad_norm": 1.9305433665377905, + "learning_rate": 4.670492529229039e-06, + "loss": 0.5536, + "step": 407 + }, + { + "epoch": 0.34, + "grad_norm": 1.8441008898830742, + "learning_rate": 4.668868150316377e-06, + "loss": 0.5859, + "step": 408 + }, + { + "epoch": 0.34, + "grad_norm": 1.8879301596961315, + "learning_rate": 4.667240061293351e-06, + "loss": 0.5483, + "step": 409 + }, + { + "epoch": 0.34, + "grad_norm": 2.024767417636281, + "learning_rate": 4.665608264945004e-06, + "loss": 0.5414, + "step": 410 + }, + { + "epoch": 0.34, + "grad_norm": 2.1331610141797395, + "learning_rate": 4.663972764062722e-06, + "loss": 0.5811, + "step": 411 + }, + { + "epoch": 0.34, + "grad_norm": 1.8132480265817386, + "learning_rate": 4.662333561444226e-06, + "loss": 0.5573, + "step": 412 + }, + { + "epoch": 0.34, + "grad_norm": 1.9795813972027145, + "learning_rate": 4.6606906598935675e-06, + "loss": 0.5814, + "step": 413 + }, + { + "epoch": 0.34, + "grad_norm": 1.8782931074297053, + "learning_rate": 4.6590440622211295e-06, + "loss": 0.569, + "step": 414 + }, + { + "epoch": 0.34, + "grad_norm": 1.8219945335518706, + "learning_rate": 4.657393771243614e-06, + "loss": 0.5669, + "step": 415 + }, + { + "epoch": 0.34, + "grad_norm": 2.4047268604371306, + "learning_rate": 4.6557397897840454e-06, + "loss": 0.5602, + "step": 416 + }, + { + "epoch": 0.35, + "grad_norm": 2.064501780523946, + "learning_rate": 4.654082120671757e-06, + "loss": 0.5699, + "step": 417 + }, + { + "epoch": 0.35, + "grad_norm": 1.9183128854940252, + "learning_rate": 4.65242076674239e-06, + "loss": 0.6112, + "step": 418 + }, + { + "epoch": 0.35, + "grad_norm": 1.9315698971629633, + "learning_rate": 4.650755730837894e-06, + "loss": 0.5537, + "step": 419 + }, + { + "epoch": 0.35, + "grad_norm": 1.9527809333659218, + "learning_rate": 4.649087015806509e-06, + "loss": 0.5423, + "step": 420 + }, + { + "epoch": 0.35, + "grad_norm": 1.8940523915995442, + "learning_rate": 4.647414624502777e-06, + "loss": 0.5708, + "step": 421 + }, + { + "epoch": 0.35, + "grad_norm": 1.9976964785548623, + "learning_rate": 4.645738559787524e-06, + "loss": 0.6006, + "step": 422 + }, + { + "epoch": 0.35, + "grad_norm": 1.9098681403283917, + "learning_rate": 4.64405882452786e-06, + "loss": 0.5591, + "step": 423 + }, + { + "epoch": 0.35, + "grad_norm": 1.8695612182804557, + "learning_rate": 4.642375421597175e-06, + "loss": 0.5219, + "step": 424 + }, + { + "epoch": 0.35, + "grad_norm": 1.8912077704810082, + "learning_rate": 4.6406883538751315e-06, + "loss": 0.5224, + "step": 425 + }, + { + "epoch": 0.35, + "grad_norm": 1.9390714726978922, + "learning_rate": 4.638997624247664e-06, + "loss": 0.5359, + "step": 426 + }, + { + "epoch": 0.35, + "grad_norm": 2.051545992296337, + "learning_rate": 4.637303235606968e-06, + "loss": 0.544, + "step": 427 + }, + { + "epoch": 0.35, + "grad_norm": 2.0657109136265914, + "learning_rate": 4.6356051908515e-06, + "loss": 0.5429, + "step": 428 + }, + { + "epoch": 0.36, + "grad_norm": 2.0301022307984793, + "learning_rate": 4.63390349288597e-06, + "loss": 0.5787, + "step": 429 + }, + { + "epoch": 0.36, + "grad_norm": 2.052515756169346, + "learning_rate": 4.632198144621338e-06, + "loss": 0.5778, + "step": 430 + }, + { + "epoch": 0.36, + "grad_norm": 1.9741370495474897, + "learning_rate": 4.630489148974807e-06, + "loss": 0.5142, + "step": 431 + }, + { + "epoch": 0.36, + "grad_norm": 1.9713229498863698, + "learning_rate": 4.62877650886982e-06, + "loss": 0.6127, + "step": 432 + }, + { + "epoch": 0.36, + "grad_norm": 2.1609440121306007, + "learning_rate": 4.627060227236055e-06, + "loss": 0.5886, + "step": 433 + }, + { + "epoch": 0.36, + "grad_norm": 1.944966445355139, + "learning_rate": 4.625340307009418e-06, + "loss": 0.5657, + "step": 434 + }, + { + "epoch": 0.36, + "grad_norm": 2.031003925680835, + "learning_rate": 4.623616751132041e-06, + "loss": 0.5628, + "step": 435 + }, + { + "epoch": 0.36, + "grad_norm": 1.8774113373137704, + "learning_rate": 4.621889562552272e-06, + "loss": 0.6068, + "step": 436 + }, + { + "epoch": 0.36, + "grad_norm": 2.0385201543401785, + "learning_rate": 4.620158744224677e-06, + "loss": 0.5511, + "step": 437 + }, + { + "epoch": 0.36, + "grad_norm": 1.8440750841938207, + "learning_rate": 4.618424299110028e-06, + "loss": 0.5261, + "step": 438 + }, + { + "epoch": 0.36, + "grad_norm": 1.8978691755923442, + "learning_rate": 4.616686230175303e-06, + "loss": 0.5862, + "step": 439 + }, + { + "epoch": 0.36, + "grad_norm": 1.8120850246861446, + "learning_rate": 4.614944540393679e-06, + "loss": 0.5652, + "step": 440 + }, + { + "epoch": 0.37, + "grad_norm": 2.1821084695714914, + "learning_rate": 4.613199232744525e-06, + "loss": 0.5598, + "step": 441 + }, + { + "epoch": 0.37, + "grad_norm": 1.9626422737625222, + "learning_rate": 4.611450310213401e-06, + "loss": 0.5267, + "step": 442 + }, + { + "epoch": 0.37, + "grad_norm": 1.9714913234889215, + "learning_rate": 4.6096977757920505e-06, + "loss": 0.5658, + "step": 443 + }, + { + "epoch": 0.37, + "grad_norm": 2.0179324078198233, + "learning_rate": 4.607941632478393e-06, + "loss": 0.582, + "step": 444 + }, + { + "epoch": 0.37, + "grad_norm": 1.8565193856331161, + "learning_rate": 4.6061818832765246e-06, + "loss": 0.5715, + "step": 445 + }, + { + "epoch": 0.37, + "grad_norm": 1.9798501479599246, + "learning_rate": 4.604418531196708e-06, + "loss": 0.6007, + "step": 446 + }, + { + "epoch": 0.37, + "grad_norm": 2.0095846956468257, + "learning_rate": 4.602651579255369e-06, + "loss": 0.5947, + "step": 447 + }, + { + "epoch": 0.37, + "grad_norm": 1.9316541079988245, + "learning_rate": 4.600881030475093e-06, + "loss": 0.5501, + "step": 448 + }, + { + "epoch": 0.37, + "grad_norm": 2.080069353365406, + "learning_rate": 4.599106887884616e-06, + "loss": 0.5631, + "step": 449 + }, + { + "epoch": 0.37, + "grad_norm": 1.965973137652201, + "learning_rate": 4.5973291545188235e-06, + "loss": 0.5267, + "step": 450 + }, + { + "epoch": 0.37, + "grad_norm": 2.1082225966704087, + "learning_rate": 4.595547833418741e-06, + "loss": 0.6418, + "step": 451 + }, + { + "epoch": 0.37, + "grad_norm": 2.0359312594194083, + "learning_rate": 4.593762927631536e-06, + "loss": 0.5644, + "step": 452 + }, + { + "epoch": 0.38, + "grad_norm": 2.1254892914109433, + "learning_rate": 4.591974440210502e-06, + "loss": 0.5693, + "step": 453 + }, + { + "epoch": 0.38, + "grad_norm": 1.9121188587334927, + "learning_rate": 4.590182374215064e-06, + "loss": 0.5572, + "step": 454 + }, + { + "epoch": 0.38, + "grad_norm": 1.9348642624953207, + "learning_rate": 4.588386732710765e-06, + "loss": 0.5446, + "step": 455 + }, + { + "epoch": 0.38, + "grad_norm": 1.8667846547370581, + "learning_rate": 4.5865875187692695e-06, + "loss": 0.5681, + "step": 456 + }, + { + "epoch": 0.38, + "grad_norm": 1.9219061327454674, + "learning_rate": 4.5847847354683465e-06, + "loss": 0.5508, + "step": 457 + }, + { + "epoch": 0.38, + "grad_norm": 1.8106132369123122, + "learning_rate": 4.5829783858918756e-06, + "loss": 0.5626, + "step": 458 + }, + { + "epoch": 0.38, + "grad_norm": 1.7827483964442634, + "learning_rate": 4.5811684731298355e-06, + "loss": 0.5575, + "step": 459 + }, + { + "epoch": 0.38, + "grad_norm": 1.9284196979863513, + "learning_rate": 4.5793550002783e-06, + "loss": 0.5363, + "step": 460 + }, + { + "epoch": 0.38, + "grad_norm": 2.029647468705457, + "learning_rate": 4.577537970439433e-06, + "loss": 0.5415, + "step": 461 + }, + { + "epoch": 0.38, + "grad_norm": 2.0997127029950087, + "learning_rate": 4.575717386721482e-06, + "loss": 0.5814, + "step": 462 + }, + { + "epoch": 0.38, + "grad_norm": 1.9589290300656341, + "learning_rate": 4.573893252238777e-06, + "loss": 0.5156, + "step": 463 + }, + { + "epoch": 0.38, + "grad_norm": 1.905237143908251, + "learning_rate": 4.572065570111717e-06, + "loss": 0.5536, + "step": 464 + }, + { + "epoch": 0.39, + "grad_norm": 1.929519794935609, + "learning_rate": 4.570234343466775e-06, + "loss": 0.5879, + "step": 465 + }, + { + "epoch": 0.39, + "grad_norm": 2.096095808886982, + "learning_rate": 4.568399575436484e-06, + "loss": 0.6241, + "step": 466 + }, + { + "epoch": 0.39, + "grad_norm": 1.9486118894048778, + "learning_rate": 4.566561269159437e-06, + "loss": 0.6307, + "step": 467 + }, + { + "epoch": 0.39, + "grad_norm": 2.0839490306744586, + "learning_rate": 4.564719427780276e-06, + "loss": 0.5655, + "step": 468 + }, + { + "epoch": 0.39, + "grad_norm": 1.9439525665822102, + "learning_rate": 4.562874054449694e-06, + "loss": 0.5437, + "step": 469 + }, + { + "epoch": 0.39, + "grad_norm": 1.9409142791465297, + "learning_rate": 4.5610251523244244e-06, + "loss": 0.6429, + "step": 470 + }, + { + "epoch": 0.39, + "grad_norm": 1.8664574493795525, + "learning_rate": 4.559172724567238e-06, + "loss": 0.5826, + "step": 471 + }, + { + "epoch": 0.39, + "grad_norm": 1.80819349503324, + "learning_rate": 4.557316774346934e-06, + "loss": 0.5372, + "step": 472 + }, + { + "epoch": 0.39, + "grad_norm": 1.8680097526865296, + "learning_rate": 4.555457304838341e-06, + "loss": 0.5503, + "step": 473 + }, + { + "epoch": 0.39, + "grad_norm": 1.7466938790815696, + "learning_rate": 4.553594319222303e-06, + "loss": 0.5425, + "step": 474 + }, + { + "epoch": 0.39, + "grad_norm": 1.9610557658505607, + "learning_rate": 4.551727820685684e-06, + "loss": 0.5755, + "step": 475 + }, + { + "epoch": 0.39, + "grad_norm": 1.9414839604282412, + "learning_rate": 4.549857812421353e-06, + "loss": 0.5915, + "step": 476 + }, + { + "epoch": 0.4, + "grad_norm": 1.8484957644576423, + "learning_rate": 4.547984297628186e-06, + "loss": 0.5676, + "step": 477 + }, + { + "epoch": 0.4, + "grad_norm": 2.074524028551078, + "learning_rate": 4.546107279511055e-06, + "loss": 0.6084, + "step": 478 + }, + { + "epoch": 0.4, + "grad_norm": 2.069692704122282, + "learning_rate": 4.544226761280826e-06, + "loss": 0.5676, + "step": 479 + }, + { + "epoch": 0.4, + "grad_norm": 1.8975472248317244, + "learning_rate": 4.54234274615435e-06, + "loss": 0.5904, + "step": 480 + }, + { + "epoch": 0.4, + "grad_norm": 2.0118868982719897, + "learning_rate": 4.540455237354466e-06, + "loss": 0.5722, + "step": 481 + }, + { + "epoch": 0.4, + "grad_norm": 1.9733105429381828, + "learning_rate": 4.5385642381099814e-06, + "loss": 0.6112, + "step": 482 + }, + { + "epoch": 0.4, + "grad_norm": 1.862156914026863, + "learning_rate": 4.53666975165568e-06, + "loss": 0.5951, + "step": 483 + }, + { + "epoch": 0.4, + "grad_norm": 1.9512940035297868, + "learning_rate": 4.53477178123231e-06, + "loss": 0.5223, + "step": 484 + }, + { + "epoch": 0.4, + "grad_norm": 1.9202464191558823, + "learning_rate": 4.532870330086577e-06, + "loss": 0.5638, + "step": 485 + }, + { + "epoch": 0.4, + "grad_norm": 1.9015767656854419, + "learning_rate": 4.530965401471143e-06, + "loss": 0.5911, + "step": 486 + }, + { + "epoch": 0.4, + "grad_norm": 1.95190921973106, + "learning_rate": 4.529056998644619e-06, + "loss": 0.6053, + "step": 487 + }, + { + "epoch": 0.4, + "grad_norm": 2.0058459596081644, + "learning_rate": 4.527145124871556e-06, + "loss": 0.5466, + "step": 488 + }, + { + "epoch": 0.41, + "grad_norm": 1.8902620959998047, + "learning_rate": 4.5252297834224454e-06, + "loss": 0.5526, + "step": 489 + }, + { + "epoch": 0.41, + "grad_norm": 1.985466416169018, + "learning_rate": 4.523310977573711e-06, + "loss": 0.5958, + "step": 490 + }, + { + "epoch": 0.41, + "grad_norm": 2.1140148957176415, + "learning_rate": 4.521388710607699e-06, + "loss": 0.613, + "step": 491 + }, + { + "epoch": 0.41, + "grad_norm": 1.9470601192089525, + "learning_rate": 4.51946298581268e-06, + "loss": 0.5847, + "step": 492 + }, + { + "epoch": 0.41, + "grad_norm": 2.0227057176069603, + "learning_rate": 4.51753380648284e-06, + "loss": 0.5784, + "step": 493 + }, + { + "epoch": 0.41, + "grad_norm": 2.05501863673554, + "learning_rate": 4.515601175918269e-06, + "loss": 0.5501, + "step": 494 + }, + { + "epoch": 0.41, + "grad_norm": 2.0129325402811715, + "learning_rate": 4.513665097424967e-06, + "loss": 0.5641, + "step": 495 + }, + { + "epoch": 0.41, + "grad_norm": 2.0322333044110468, + "learning_rate": 4.51172557431483e-06, + "loss": 0.5422, + "step": 496 + }, + { + "epoch": 0.41, + "grad_norm": 1.9573055659958774, + "learning_rate": 4.509782609905644e-06, + "loss": 0.516, + "step": 497 + }, + { + "epoch": 0.41, + "grad_norm": 1.8223127451485421, + "learning_rate": 4.507836207521085e-06, + "loss": 0.5714, + "step": 498 + }, + { + "epoch": 0.41, + "grad_norm": 1.9343089861079434, + "learning_rate": 4.50588637049071e-06, + "loss": 0.5424, + "step": 499 + }, + { + "epoch": 0.41, + "grad_norm": 1.8940990649350729, + "learning_rate": 4.503933102149948e-06, + "loss": 0.5832, + "step": 500 + }, + { + "epoch": 0.42, + "grad_norm": 1.908617301933682, + "learning_rate": 4.501976405840101e-06, + "loss": 0.5399, + "step": 501 + }, + { + "epoch": 0.42, + "grad_norm": 1.8290259512093785, + "learning_rate": 4.500016284908334e-06, + "loss": 0.5561, + "step": 502 + }, + { + "epoch": 0.42, + "grad_norm": 1.9840280991844164, + "learning_rate": 4.49805274270767e-06, + "loss": 0.5645, + "step": 503 + }, + { + "epoch": 0.42, + "grad_norm": 1.9864953051636856, + "learning_rate": 4.496085782596984e-06, + "loss": 0.5369, + "step": 504 + }, + { + "epoch": 0.42, + "grad_norm": 1.979387839103732, + "learning_rate": 4.494115407940999e-06, + "loss": 0.6196, + "step": 505 + }, + { + "epoch": 0.42, + "grad_norm": 1.9266869362165981, + "learning_rate": 4.492141622110279e-06, + "loss": 0.5687, + "step": 506 + }, + { + "epoch": 0.42, + "grad_norm": 1.9887461782376619, + "learning_rate": 4.4901644284812205e-06, + "loss": 0.5264, + "step": 507 + }, + { + "epoch": 0.42, + "grad_norm": 1.8717867803152208, + "learning_rate": 4.488183830436052e-06, + "loss": 0.5612, + "step": 508 + }, + { + "epoch": 0.42, + "grad_norm": 2.0044226171493, + "learning_rate": 4.486199831362828e-06, + "loss": 0.5571, + "step": 509 + }, + { + "epoch": 0.42, + "grad_norm": 2.1075571016617958, + "learning_rate": 4.484212434655414e-06, + "loss": 0.5642, + "step": 510 + }, + { + "epoch": 0.42, + "grad_norm": 1.8031612547539957, + "learning_rate": 4.482221643713494e-06, + "loss": 0.5805, + "step": 511 + }, + { + "epoch": 0.42, + "grad_norm": 1.8782516337672304, + "learning_rate": 4.480227461942556e-06, + "loss": 0.5596, + "step": 512 + }, + { + "epoch": 0.43, + "grad_norm": 2.075073901596185, + "learning_rate": 4.478229892753886e-06, + "loss": 0.6124, + "step": 513 + }, + { + "epoch": 0.43, + "grad_norm": 2.0588983460568304, + "learning_rate": 4.47622893956457e-06, + "loss": 0.5589, + "step": 514 + }, + { + "epoch": 0.43, + "grad_norm": 1.850248236464706, + "learning_rate": 4.474224605797476e-06, + "loss": 0.5603, + "step": 515 + }, + { + "epoch": 0.43, + "grad_norm": 1.932844310652863, + "learning_rate": 4.472216894881261e-06, + "loss": 0.5571, + "step": 516 + }, + { + "epoch": 0.43, + "grad_norm": 2.09975454805468, + "learning_rate": 4.470205810250357e-06, + "loss": 0.5975, + "step": 517 + }, + { + "epoch": 0.43, + "grad_norm": 1.9694087093010304, + "learning_rate": 4.468191355344965e-06, + "loss": 0.5698, + "step": 518 + }, + { + "epoch": 0.43, + "grad_norm": 1.8794788153917539, + "learning_rate": 4.466173533611053e-06, + "loss": 0.5559, + "step": 519 + }, + { + "epoch": 0.43, + "grad_norm": 2.0650455557855434, + "learning_rate": 4.46415234850035e-06, + "loss": 0.5644, + "step": 520 + }, + { + "epoch": 0.43, + "grad_norm": 2.0062649027982022, + "learning_rate": 4.462127803470334e-06, + "loss": 0.608, + "step": 521 + }, + { + "epoch": 0.43, + "grad_norm": 2.043267877462657, + "learning_rate": 4.460099901984235e-06, + "loss": 0.573, + "step": 522 + }, + { + "epoch": 0.43, + "grad_norm": 2.056372436619027, + "learning_rate": 4.4580686475110235e-06, + "loss": 0.5748, + "step": 523 + }, + { + "epoch": 0.43, + "grad_norm": 1.8871033520138176, + "learning_rate": 4.456034043525404e-06, + "loss": 0.5339, + "step": 524 + }, + { + "epoch": 0.44, + "grad_norm": 1.889474616209236, + "learning_rate": 4.45399609350781e-06, + "loss": 0.5185, + "step": 525 + }, + { + "epoch": 0.44, + "grad_norm": 1.9767406217632912, + "learning_rate": 4.451954800944405e-06, + "loss": 0.5758, + "step": 526 + }, + { + "epoch": 0.44, + "grad_norm": 1.9588695861513832, + "learning_rate": 4.449910169327062e-06, + "loss": 0.5472, + "step": 527 + }, + { + "epoch": 0.44, + "grad_norm": 1.8852210889000718, + "learning_rate": 4.447862202153372e-06, + "loss": 0.5917, + "step": 528 + }, + { + "epoch": 0.44, + "grad_norm": 2.0103638871993077, + "learning_rate": 4.445810902926629e-06, + "loss": 0.5761, + "step": 529 + }, + { + "epoch": 0.44, + "grad_norm": 2.201836945389513, + "learning_rate": 4.443756275155827e-06, + "loss": 0.5614, + "step": 530 + }, + { + "epoch": 0.44, + "grad_norm": 1.900702305836831, + "learning_rate": 4.441698322355656e-06, + "loss": 0.5254, + "step": 531 + }, + { + "epoch": 0.44, + "grad_norm": 2.134694583439314, + "learning_rate": 4.4396370480464915e-06, + "loss": 0.5607, + "step": 532 + }, + { + "epoch": 0.44, + "grad_norm": 1.8073751630381198, + "learning_rate": 4.437572455754391e-06, + "loss": 0.536, + "step": 533 + }, + { + "epoch": 0.44, + "grad_norm": 1.9607338020142653, + "learning_rate": 4.435504549011088e-06, + "loss": 0.59, + "step": 534 + }, + { + "epoch": 0.44, + "grad_norm": 2.0756430867435274, + "learning_rate": 4.433433331353988e-06, + "loss": 0.5538, + "step": 535 + }, + { + "epoch": 0.44, + "grad_norm": 1.8280570853718465, + "learning_rate": 4.431358806326158e-06, + "loss": 0.5789, + "step": 536 + }, + { + "epoch": 0.45, + "grad_norm": 2.2005143967434977, + "learning_rate": 4.429280977476321e-06, + "loss": 0.545, + "step": 537 + }, + { + "epoch": 0.45, + "grad_norm": 1.896479397543979, + "learning_rate": 4.4271998483588565e-06, + "loss": 0.5791, + "step": 538 + }, + { + "epoch": 0.45, + "grad_norm": 2.117773381781195, + "learning_rate": 4.425115422533785e-06, + "loss": 0.5234, + "step": 539 + }, + { + "epoch": 0.45, + "grad_norm": 2.4438942429566617, + "learning_rate": 4.423027703566769e-06, + "loss": 0.5692, + "step": 540 + }, + { + "epoch": 0.45, + "grad_norm": 1.873481152225171, + "learning_rate": 4.4209366950291025e-06, + "loss": 0.5739, + "step": 541 + }, + { + "epoch": 0.45, + "grad_norm": 1.8655199147974673, + "learning_rate": 4.4188424004977085e-06, + "loss": 0.5795, + "step": 542 + }, + { + "epoch": 0.45, + "grad_norm": 1.948840412241188, + "learning_rate": 4.416744823555129e-06, + "loss": 0.5304, + "step": 543 + }, + { + "epoch": 0.45, + "grad_norm": 1.8389034133315045, + "learning_rate": 4.414643967789523e-06, + "loss": 0.5076, + "step": 544 + }, + { + "epoch": 0.45, + "grad_norm": 1.8269235720085213, + "learning_rate": 4.412539836794657e-06, + "loss": 0.5837, + "step": 545 + }, + { + "epoch": 0.45, + "grad_norm": 2.1298715969759505, + "learning_rate": 4.410432434169902e-06, + "loss": 0.5694, + "step": 546 + }, + { + "epoch": 0.45, + "grad_norm": 2.0057741366005746, + "learning_rate": 4.408321763520223e-06, + "loss": 0.557, + "step": 547 + }, + { + "epoch": 0.45, + "grad_norm": 1.7901331374893255, + "learning_rate": 4.406207828456177e-06, + "loss": 0.5746, + "step": 548 + }, + { + "epoch": 0.46, + "grad_norm": 2.1994839889416187, + "learning_rate": 4.404090632593904e-06, + "loss": 0.5407, + "step": 549 + }, + { + "epoch": 0.46, + "grad_norm": 1.9664921082690268, + "learning_rate": 4.401970179555123e-06, + "loss": 0.5322, + "step": 550 + }, + { + "epoch": 0.46, + "grad_norm": 1.9933486180243851, + "learning_rate": 4.399846472967124e-06, + "loss": 0.5798, + "step": 551 + }, + { + "epoch": 0.46, + "grad_norm": 1.986612256562151, + "learning_rate": 4.397719516462765e-06, + "loss": 0.5213, + "step": 552 + }, + { + "epoch": 0.46, + "grad_norm": 2.046550123292336, + "learning_rate": 4.395589313680459e-06, + "loss": 0.5857, + "step": 553 + }, + { + "epoch": 0.46, + "grad_norm": 1.7902327250340486, + "learning_rate": 4.393455868264176e-06, + "loss": 0.555, + "step": 554 + }, + { + "epoch": 0.46, + "grad_norm": 2.0203627138517146, + "learning_rate": 4.391319183863432e-06, + "loss": 0.6329, + "step": 555 + }, + { + "epoch": 0.46, + "grad_norm": 1.9373549045181289, + "learning_rate": 4.389179264133281e-06, + "loss": 0.566, + "step": 556 + }, + { + "epoch": 0.46, + "grad_norm": 1.8936753353678124, + "learning_rate": 4.387036112734316e-06, + "loss": 0.5555, + "step": 557 + }, + { + "epoch": 0.46, + "grad_norm": 1.8493817575820743, + "learning_rate": 4.3848897333326545e-06, + "loss": 0.5427, + "step": 558 + }, + { + "epoch": 0.46, + "grad_norm": 1.9119588677783816, + "learning_rate": 4.382740129599937e-06, + "loss": 0.5157, + "step": 559 + }, + { + "epoch": 0.46, + "grad_norm": 1.8190137094200924, + "learning_rate": 4.380587305213321e-06, + "loss": 0.503, + "step": 560 + }, + { + "epoch": 0.47, + "grad_norm": 1.9891332712764953, + "learning_rate": 4.37843126385547e-06, + "loss": 0.5761, + "step": 561 + }, + { + "epoch": 0.47, + "grad_norm": 1.8620896547461154, + "learning_rate": 4.376272009214555e-06, + "loss": 0.5259, + "step": 562 + }, + { + "epoch": 0.47, + "grad_norm": 1.8896721756477406, + "learning_rate": 4.37410954498424e-06, + "loss": 0.5632, + "step": 563 + }, + { + "epoch": 0.47, + "grad_norm": 1.8302281976781984, + "learning_rate": 4.37194387486368e-06, + "loss": 0.5612, + "step": 564 + }, + { + "epoch": 0.47, + "grad_norm": 2.0721820586440165, + "learning_rate": 4.369775002557516e-06, + "loss": 0.533, + "step": 565 + }, + { + "epoch": 0.47, + "grad_norm": 1.8259926551813157, + "learning_rate": 4.367602931775865e-06, + "loss": 0.526, + "step": 566 + }, + { + "epoch": 0.47, + "grad_norm": 1.8096334574000785, + "learning_rate": 4.3654276662343155e-06, + "loss": 0.5306, + "step": 567 + }, + { + "epoch": 0.47, + "grad_norm": 1.9675637591445598, + "learning_rate": 4.363249209653922e-06, + "loss": 0.5577, + "step": 568 + }, + { + "epoch": 0.47, + "grad_norm": 1.8800389115841605, + "learning_rate": 4.361067565761197e-06, + "loss": 0.5553, + "step": 569 + }, + { + "epoch": 0.47, + "grad_norm": 1.827485496395265, + "learning_rate": 4.358882738288105e-06, + "loss": 0.5587, + "step": 570 + }, + { + "epoch": 0.47, + "grad_norm": 1.820954908943235, + "learning_rate": 4.356694730972056e-06, + "loss": 0.6186, + "step": 571 + }, + { + "epoch": 0.47, + "grad_norm": 1.952072431699686, + "learning_rate": 4.3545035475559025e-06, + "loss": 0.5488, + "step": 572 + }, + { + "epoch": 0.48, + "grad_norm": 1.8292648968688423, + "learning_rate": 4.352309191787924e-06, + "loss": 0.5534, + "step": 573 + }, + { + "epoch": 0.48, + "grad_norm": 1.826293122529813, + "learning_rate": 4.350111667421835e-06, + "loss": 0.5872, + "step": 574 + }, + { + "epoch": 0.48, + "grad_norm": 1.9251425791166785, + "learning_rate": 4.347910978216763e-06, + "loss": 0.5298, + "step": 575 + }, + { + "epoch": 0.48, + "grad_norm": 1.8330818196811385, + "learning_rate": 4.345707127937253e-06, + "loss": 0.5871, + "step": 576 + }, + { + "epoch": 0.48, + "grad_norm": 1.7842986545873851, + "learning_rate": 4.3435001203532555e-06, + "loss": 0.4898, + "step": 577 + }, + { + "epoch": 0.48, + "grad_norm": 1.8778666245156521, + "learning_rate": 4.341289959240124e-06, + "loss": 0.5385, + "step": 578 + }, + { + "epoch": 0.48, + "grad_norm": 1.9300679499181266, + "learning_rate": 4.339076648378605e-06, + "loss": 0.5698, + "step": 579 + }, + { + "epoch": 0.48, + "grad_norm": 1.9440861965960357, + "learning_rate": 4.336860191554833e-06, + "loss": 0.5984, + "step": 580 + }, + { + "epoch": 0.48, + "grad_norm": 1.929951096053947, + "learning_rate": 4.3346405925603265e-06, + "loss": 0.6222, + "step": 581 + }, + { + "epoch": 0.48, + "grad_norm": 1.9138258400335695, + "learning_rate": 4.332417855191974e-06, + "loss": 0.5498, + "step": 582 + }, + { + "epoch": 0.48, + "grad_norm": 2.058548455869675, + "learning_rate": 4.330191983252039e-06, + "loss": 0.5218, + "step": 583 + }, + { + "epoch": 0.48, + "grad_norm": 2.243429045583125, + "learning_rate": 4.327962980548142e-06, + "loss": 0.5768, + "step": 584 + }, + { + "epoch": 0.48, + "grad_norm": 1.9213537104634244, + "learning_rate": 4.32573085089326e-06, + "loss": 0.5784, + "step": 585 + }, + { + "epoch": 0.49, + "grad_norm": 1.9165291289119128, + "learning_rate": 4.32349559810572e-06, + "loss": 0.5697, + "step": 586 + }, + { + "epoch": 0.49, + "grad_norm": 1.9674279518735756, + "learning_rate": 4.321257226009193e-06, + "loss": 0.5104, + "step": 587 + }, + { + "epoch": 0.49, + "grad_norm": 1.9051339015323923, + "learning_rate": 4.319015738432683e-06, + "loss": 0.5711, + "step": 588 + }, + { + "epoch": 0.49, + "grad_norm": 1.957357618850765, + "learning_rate": 4.3167711392105245e-06, + "loss": 0.5854, + "step": 589 + }, + { + "epoch": 0.49, + "grad_norm": 1.9859311708308915, + "learning_rate": 4.314523432182376e-06, + "loss": 0.547, + "step": 590 + }, + { + "epoch": 0.49, + "grad_norm": 1.773704456523191, + "learning_rate": 4.312272621193209e-06, + "loss": 0.5259, + "step": 591 + }, + { + "epoch": 0.49, + "grad_norm": 1.82988033655793, + "learning_rate": 4.31001871009331e-06, + "loss": 0.5209, + "step": 592 + }, + { + "epoch": 0.49, + "grad_norm": 1.8925134832060522, + "learning_rate": 4.307761702738264e-06, + "loss": 0.59, + "step": 593 + }, + { + "epoch": 0.49, + "grad_norm": 1.8477075780641046, + "learning_rate": 4.305501602988953e-06, + "loss": 0.5714, + "step": 594 + }, + { + "epoch": 0.49, + "grad_norm": 1.8568432886623798, + "learning_rate": 4.303238414711552e-06, + "loss": 0.5877, + "step": 595 + }, + { + "epoch": 0.49, + "grad_norm": 1.8179798660158206, + "learning_rate": 4.3009721417775166e-06, + "loss": 0.6029, + "step": 596 + }, + { + "epoch": 0.49, + "grad_norm": 1.8494963193854803, + "learning_rate": 4.29870278806358e-06, + "loss": 0.5236, + "step": 597 + }, + { + "epoch": 0.5, + "grad_norm": 1.9586017397154731, + "learning_rate": 4.296430357451744e-06, + "loss": 0.5998, + "step": 598 + }, + { + "epoch": 0.5, + "grad_norm": 1.926616057974202, + "learning_rate": 4.2941548538292765e-06, + "loss": 0.5914, + "step": 599 + }, + { + "epoch": 0.5, + "grad_norm": 1.9321738359144827, + "learning_rate": 4.291876281088701e-06, + "loss": 0.5358, + "step": 600 + }, + { + "epoch": 0.5, + "grad_norm": 1.8229177571361932, + "learning_rate": 4.289594643127788e-06, + "loss": 0.5284, + "step": 601 + }, + { + "epoch": 0.5, + "grad_norm": 1.849252449531427, + "learning_rate": 4.287309943849558e-06, + "loss": 0.5689, + "step": 602 + }, + { + "epoch": 0.5, + "grad_norm": 1.985343175388319, + "learning_rate": 4.285022187162261e-06, + "loss": 0.6101, + "step": 603 + }, + { + "epoch": 0.5, + "grad_norm": 1.9437791826489255, + "learning_rate": 4.2827313769793835e-06, + "loss": 0.5419, + "step": 604 + }, + { + "epoch": 0.5, + "grad_norm": 1.8027421078538746, + "learning_rate": 4.28043751721963e-06, + "loss": 0.5504, + "step": 605 + }, + { + "epoch": 0.5, + "grad_norm": 1.8221230935939319, + "learning_rate": 4.278140611806926e-06, + "loss": 0.5284, + "step": 606 + }, + { + "epoch": 0.5, + "grad_norm": 1.8597205853821357, + "learning_rate": 4.275840664670403e-06, + "loss": 0.623, + "step": 607 + }, + { + "epoch": 0.5, + "grad_norm": 1.7801370844338822, + "learning_rate": 4.2735376797444e-06, + "loss": 0.5265, + "step": 608 + }, + { + "epoch": 0.5, + "grad_norm": 1.9028094416250234, + "learning_rate": 4.271231660968449e-06, + "loss": 0.5764, + "step": 609 + }, + { + "epoch": 0.51, + "grad_norm": 1.9385737581380094, + "learning_rate": 4.268922612287273e-06, + "loss": 0.6047, + "step": 610 + }, + { + "epoch": 0.51, + "grad_norm": 1.760006169733744, + "learning_rate": 4.266610537650778e-06, + "loss": 0.4944, + "step": 611 + }, + { + "epoch": 0.51, + "grad_norm": 1.857083980479501, + "learning_rate": 4.264295441014047e-06, + "loss": 0.5174, + "step": 612 + }, + { + "epoch": 0.51, + "grad_norm": 1.8299942480819913, + "learning_rate": 4.261977326337332e-06, + "loss": 0.5814, + "step": 613 + }, + { + "epoch": 0.51, + "grad_norm": 1.8943903433033418, + "learning_rate": 4.259656197586046e-06, + "loss": 0.5514, + "step": 614 + }, + { + "epoch": 0.51, + "grad_norm": 1.7839062839610529, + "learning_rate": 4.257332058730761e-06, + "loss": 0.5857, + "step": 615 + }, + { + "epoch": 0.51, + "grad_norm": 2.7188975139736256, + "learning_rate": 4.255004913747196e-06, + "loss": 0.5509, + "step": 616 + }, + { + "epoch": 0.51, + "grad_norm": 1.8767461602206779, + "learning_rate": 4.252674766616212e-06, + "loss": 0.5038, + "step": 617 + }, + { + "epoch": 0.51, + "grad_norm": 1.8391588901867753, + "learning_rate": 4.250341621323809e-06, + "loss": 0.5196, + "step": 618 + }, + { + "epoch": 0.51, + "grad_norm": 1.8106924420187829, + "learning_rate": 4.248005481861111e-06, + "loss": 0.5458, + "step": 619 + }, + { + "epoch": 0.51, + "grad_norm": 1.9698953511074666, + "learning_rate": 4.245666352224367e-06, + "loss": 0.5963, + "step": 620 + }, + { + "epoch": 0.51, + "grad_norm": 1.8890424031569348, + "learning_rate": 4.243324236414939e-06, + "loss": 0.5277, + "step": 621 + }, + { + "epoch": 0.52, + "grad_norm": 1.8537879418167673, + "learning_rate": 4.240979138439301e-06, + "loss": 0.5407, + "step": 622 + }, + { + "epoch": 0.52, + "grad_norm": 1.9264981771759184, + "learning_rate": 4.238631062309023e-06, + "loss": 0.5788, + "step": 623 + }, + { + "epoch": 0.52, + "grad_norm": 1.949693389062837, + "learning_rate": 4.236280012040773e-06, + "loss": 0.5007, + "step": 624 + }, + { + "epoch": 0.52, + "grad_norm": 1.8845778025905608, + "learning_rate": 4.233925991656307e-06, + "loss": 0.5905, + "step": 625 + }, + { + "epoch": 0.52, + "grad_norm": 1.8977167810192608, + "learning_rate": 4.231569005182459e-06, + "loss": 0.5342, + "step": 626 + }, + { + "epoch": 0.52, + "grad_norm": 1.9579196623045914, + "learning_rate": 4.229209056651139e-06, + "loss": 0.554, + "step": 627 + }, + { + "epoch": 0.52, + "grad_norm": 1.8427820272426025, + "learning_rate": 4.226846150099324e-06, + "loss": 0.5629, + "step": 628 + }, + { + "epoch": 0.52, + "grad_norm": 1.865218131227253, + "learning_rate": 4.22448028956905e-06, + "loss": 0.558, + "step": 629 + }, + { + "epoch": 0.52, + "grad_norm": 1.7348773966225364, + "learning_rate": 4.222111479107406e-06, + "loss": 0.5332, + "step": 630 + }, + { + "epoch": 0.52, + "grad_norm": 1.779367140127678, + "learning_rate": 4.219739722766528e-06, + "loss": 0.569, + "step": 631 + }, + { + "epoch": 0.52, + "grad_norm": 1.92860570712595, + "learning_rate": 4.217365024603592e-06, + "loss": 0.5342, + "step": 632 + }, + { + "epoch": 0.52, + "grad_norm": 1.946965997476449, + "learning_rate": 4.214987388680804e-06, + "loss": 0.5482, + "step": 633 + }, + { + "epoch": 0.53, + "grad_norm": 1.7930454990298659, + "learning_rate": 4.212606819065399e-06, + "loss": 0.5376, + "step": 634 + }, + { + "epoch": 0.53, + "grad_norm": 1.8379498458279013, + "learning_rate": 4.210223319829626e-06, + "loss": 0.5741, + "step": 635 + }, + { + "epoch": 0.53, + "grad_norm": 1.742977498596499, + "learning_rate": 4.207836895050748e-06, + "loss": 0.5569, + "step": 636 + }, + { + "epoch": 0.53, + "grad_norm": 1.852541709372898, + "learning_rate": 4.205447548811032e-06, + "loss": 0.578, + "step": 637 + }, + { + "epoch": 0.53, + "grad_norm": 1.8180259569107267, + "learning_rate": 4.203055285197745e-06, + "loss": 0.5189, + "step": 638 + }, + { + "epoch": 0.53, + "grad_norm": 1.8177842562763082, + "learning_rate": 4.20066010830314e-06, + "loss": 0.5424, + "step": 639 + }, + { + "epoch": 0.53, + "grad_norm": 1.8068654723170434, + "learning_rate": 4.198262022224457e-06, + "loss": 0.5336, + "step": 640 + }, + { + "epoch": 0.53, + "grad_norm": 1.9664843499052276, + "learning_rate": 4.195861031063909e-06, + "loss": 0.5399, + "step": 641 + }, + { + "epoch": 0.53, + "grad_norm": 1.7812265481792608, + "learning_rate": 4.193457138928683e-06, + "loss": 0.534, + "step": 642 + }, + { + "epoch": 0.53, + "grad_norm": 1.908377487778027, + "learning_rate": 4.191050349930925e-06, + "loss": 0.5831, + "step": 643 + }, + { + "epoch": 0.53, + "grad_norm": 1.8124678634933105, + "learning_rate": 4.18864066818774e-06, + "loss": 0.5309, + "step": 644 + }, + { + "epoch": 0.53, + "grad_norm": 1.902443199964304, + "learning_rate": 4.186228097821176e-06, + "loss": 0.5452, + "step": 645 + }, + { + "epoch": 0.54, + "grad_norm": 1.9694387068719457, + "learning_rate": 4.183812642958227e-06, + "loss": 0.5462, + "step": 646 + }, + { + "epoch": 0.54, + "grad_norm": 1.945352264767711, + "learning_rate": 4.181394307730819e-06, + "loss": 0.4853, + "step": 647 + }, + { + "epoch": 0.54, + "grad_norm": 1.7967416728436914, + "learning_rate": 4.178973096275806e-06, + "loss": 0.5952, + "step": 648 + }, + { + "epoch": 0.54, + "grad_norm": 2.0602433101771616, + "learning_rate": 4.176549012734963e-06, + "loss": 0.6346, + "step": 649 + }, + { + "epoch": 0.54, + "grad_norm": 1.9158731498204968, + "learning_rate": 4.1741220612549746e-06, + "loss": 0.5101, + "step": 650 + }, + { + "epoch": 0.54, + "grad_norm": 1.951875972207364, + "learning_rate": 4.171692245987436e-06, + "loss": 0.5718, + "step": 651 + }, + { + "epoch": 0.54, + "grad_norm": 1.871788727804539, + "learning_rate": 4.169259571088839e-06, + "loss": 0.5516, + "step": 652 + }, + { + "epoch": 0.54, + "grad_norm": 1.945571804366465, + "learning_rate": 4.166824040720566e-06, + "loss": 0.5544, + "step": 653 + }, + { + "epoch": 0.54, + "grad_norm": 1.8975723622706568, + "learning_rate": 4.1643856590488866e-06, + "loss": 0.5643, + "step": 654 + }, + { + "epoch": 0.54, + "grad_norm": 1.9772846459626554, + "learning_rate": 4.161944430244945e-06, + "loss": 0.5487, + "step": 655 + }, + { + "epoch": 0.54, + "grad_norm": 2.036472038769578, + "learning_rate": 4.159500358484759e-06, + "loss": 0.5232, + "step": 656 + }, + { + "epoch": 0.54, + "grad_norm": 1.7742095436926848, + "learning_rate": 4.157053447949206e-06, + "loss": 0.4963, + "step": 657 + }, + { + "epoch": 0.55, + "grad_norm": 2.1819742476725814, + "learning_rate": 4.154603702824023e-06, + "loss": 0.5416, + "step": 658 + }, + { + "epoch": 0.55, + "grad_norm": 1.9151345309457093, + "learning_rate": 4.152151127299794e-06, + "loss": 0.5822, + "step": 659 + }, + { + "epoch": 0.55, + "grad_norm": 2.033640859083771, + "learning_rate": 4.149695725571944e-06, + "loss": 0.5876, + "step": 660 + }, + { + "epoch": 0.55, + "grad_norm": 1.8935471013235925, + "learning_rate": 4.147237501840734e-06, + "loss": 0.548, + "step": 661 + }, + { + "epoch": 0.55, + "grad_norm": 1.7836299476774775, + "learning_rate": 4.144776460311253e-06, + "loss": 0.5274, + "step": 662 + }, + { + "epoch": 0.55, + "grad_norm": 2.194666072449123, + "learning_rate": 4.142312605193407e-06, + "loss": 0.5934, + "step": 663 + }, + { + "epoch": 0.55, + "grad_norm": 1.988265407508224, + "learning_rate": 4.13984594070192e-06, + "loss": 0.5539, + "step": 664 + }, + { + "epoch": 0.55, + "grad_norm": 1.7594955740187146, + "learning_rate": 4.137376471056317e-06, + "loss": 0.5324, + "step": 665 + }, + { + "epoch": 0.55, + "grad_norm": 1.9342530277100989, + "learning_rate": 4.1349042004809224e-06, + "loss": 0.5902, + "step": 666 + }, + { + "epoch": 0.55, + "grad_norm": 1.9757082453588417, + "learning_rate": 4.132429133204856e-06, + "loss": 0.5874, + "step": 667 + }, + { + "epoch": 0.55, + "grad_norm": 1.7792467343474774, + "learning_rate": 4.129951273462016e-06, + "loss": 0.5516, + "step": 668 + }, + { + "epoch": 0.55, + "grad_norm": 1.9010392264817964, + "learning_rate": 4.127470625491082e-06, + "loss": 0.5793, + "step": 669 + }, + { + "epoch": 0.56, + "grad_norm": 2.054505290884914, + "learning_rate": 4.1249871935355e-06, + "loss": 0.5718, + "step": 670 + }, + { + "epoch": 0.56, + "grad_norm": 1.8010036617727825, + "learning_rate": 4.1225009818434805e-06, + "loss": 0.5698, + "step": 671 + }, + { + "epoch": 0.56, + "grad_norm": 1.975020822034628, + "learning_rate": 4.120011994667988e-06, + "loss": 0.5739, + "step": 672 + }, + { + "epoch": 0.56, + "grad_norm": 1.9801075045379748, + "learning_rate": 4.117520236266734e-06, + "loss": 0.5589, + "step": 673 + }, + { + "epoch": 0.56, + "grad_norm": 1.7773808874926829, + "learning_rate": 4.115025710902173e-06, + "loss": 0.5276, + "step": 674 + }, + { + "epoch": 0.56, + "grad_norm": 1.890298398205481, + "learning_rate": 4.112528422841491e-06, + "loss": 0.4914, + "step": 675 + }, + { + "epoch": 0.56, + "grad_norm": 1.9087570296379215, + "learning_rate": 4.110028376356599e-06, + "loss": 0.5412, + "step": 676 + }, + { + "epoch": 0.56, + "grad_norm": 1.8908271691889404, + "learning_rate": 4.1075255757241295e-06, + "loss": 0.5618, + "step": 677 + }, + { + "epoch": 0.56, + "grad_norm": 2.024312170169272, + "learning_rate": 4.105020025225423e-06, + "loss": 0.5618, + "step": 678 + }, + { + "epoch": 0.56, + "grad_norm": 1.8072403207581518, + "learning_rate": 4.102511729146528e-06, + "loss": 0.5744, + "step": 679 + }, + { + "epoch": 0.56, + "grad_norm": 1.7750572145097157, + "learning_rate": 4.100000691778185e-06, + "loss": 0.5716, + "step": 680 + }, + { + "epoch": 0.56, + "grad_norm": 1.8778337896632162, + "learning_rate": 4.097486917415827e-06, + "loss": 0.5683, + "step": 681 + }, + { + "epoch": 0.57, + "grad_norm": 1.9710167098273688, + "learning_rate": 4.094970410359568e-06, + "loss": 0.5273, + "step": 682 + }, + { + "epoch": 0.57, + "grad_norm": 1.9136975523972874, + "learning_rate": 4.092451174914196e-06, + "loss": 0.5239, + "step": 683 + }, + { + "epoch": 0.57, + "grad_norm": 1.929344793900944, + "learning_rate": 4.089929215389167e-06, + "loss": 0.5388, + "step": 684 + }, + { + "epoch": 0.57, + "grad_norm": 1.7211535229712278, + "learning_rate": 4.087404536098597e-06, + "loss": 0.5068, + "step": 685 + }, + { + "epoch": 0.57, + "grad_norm": 1.8739637749458882, + "learning_rate": 4.084877141361254e-06, + "loss": 0.5537, + "step": 686 + }, + { + "epoch": 0.57, + "grad_norm": 1.9268469960932768, + "learning_rate": 4.082347035500553e-06, + "loss": 0.5875, + "step": 687 + }, + { + "epoch": 0.57, + "grad_norm": 1.896542320004603, + "learning_rate": 4.079814222844541e-06, + "loss": 0.5314, + "step": 688 + }, + { + "epoch": 0.57, + "grad_norm": 1.723925126440519, + "learning_rate": 4.077278707725904e-06, + "loss": 0.5009, + "step": 689 + }, + { + "epoch": 0.57, + "grad_norm": 1.8345210205201996, + "learning_rate": 4.074740494481942e-06, + "loss": 0.5544, + "step": 690 + }, + { + "epoch": 0.57, + "grad_norm": 1.766819080519227, + "learning_rate": 4.072199587454578e-06, + "loss": 0.5393, + "step": 691 + }, + { + "epoch": 0.57, + "grad_norm": 1.9577975399484282, + "learning_rate": 4.069655990990337e-06, + "loss": 0.5357, + "step": 692 + }, + { + "epoch": 0.57, + "grad_norm": 1.8254761359015224, + "learning_rate": 4.06710970944035e-06, + "loss": 0.5797, + "step": 693 + }, + { + "epoch": 0.58, + "grad_norm": 2.1203973374999214, + "learning_rate": 4.064560747160337e-06, + "loss": 0.5811, + "step": 694 + }, + { + "epoch": 0.58, + "grad_norm": 1.9066221824053846, + "learning_rate": 4.062009108510605e-06, + "loss": 0.5014, + "step": 695 + }, + { + "epoch": 0.58, + "grad_norm": 1.951489716071849, + "learning_rate": 4.059454797856039e-06, + "loss": 0.529, + "step": 696 + }, + { + "epoch": 0.58, + "grad_norm": 1.8402907113209426, + "learning_rate": 4.056897819566096e-06, + "loss": 0.4942, + "step": 697 + }, + { + "epoch": 0.58, + "grad_norm": 2.0368715640768498, + "learning_rate": 4.0543381780147965e-06, + "loss": 0.5245, + "step": 698 + }, + { + "epoch": 0.58, + "grad_norm": 1.8154462049772704, + "learning_rate": 4.0517758775807135e-06, + "loss": 0.4979, + "step": 699 + }, + { + "epoch": 0.58, + "grad_norm": 1.890388895335948, + "learning_rate": 4.049210922646973e-06, + "loss": 0.5212, + "step": 700 + }, + { + "epoch": 0.58, + "grad_norm": 2.0215900504030166, + "learning_rate": 4.046643317601237e-06, + "loss": 0.5384, + "step": 701 + }, + { + "epoch": 0.58, + "grad_norm": 1.816997259900234, + "learning_rate": 4.0440730668357076e-06, + "loss": 0.492, + "step": 702 + }, + { + "epoch": 0.58, + "grad_norm": 1.968633766153865, + "learning_rate": 4.0415001747471036e-06, + "loss": 0.5917, + "step": 703 + }, + { + "epoch": 0.58, + "grad_norm": 1.8313487810801756, + "learning_rate": 4.0389246457366696e-06, + "loss": 0.5561, + "step": 704 + }, + { + "epoch": 0.58, + "grad_norm": 1.7954421155528784, + "learning_rate": 4.036346484210159e-06, + "loss": 0.5383, + "step": 705 + }, + { + "epoch": 0.59, + "grad_norm": 1.8517101217315919, + "learning_rate": 4.033765694577826e-06, + "loss": 0.5368, + "step": 706 + }, + { + "epoch": 0.59, + "grad_norm": 1.8888441616203875, + "learning_rate": 4.031182281254423e-06, + "loss": 0.5895, + "step": 707 + }, + { + "epoch": 0.59, + "grad_norm": 1.8131436351862782, + "learning_rate": 4.028596248659191e-06, + "loss": 0.5346, + "step": 708 + }, + { + "epoch": 0.59, + "grad_norm": 1.8803113487311214, + "learning_rate": 4.0260076012158486e-06, + "loss": 0.4987, + "step": 709 + }, + { + "epoch": 0.59, + "grad_norm": 1.8989122650791335, + "learning_rate": 4.023416343352589e-06, + "loss": 0.5007, + "step": 710 + }, + { + "epoch": 0.59, + "grad_norm": 1.9466291969735336, + "learning_rate": 4.020822479502074e-06, + "loss": 0.5868, + "step": 711 + }, + { + "epoch": 0.59, + "grad_norm": 1.869533367998661, + "learning_rate": 4.018226014101418e-06, + "loss": 0.5995, + "step": 712 + }, + { + "epoch": 0.59, + "grad_norm": 1.93738608926368, + "learning_rate": 4.015626951592187e-06, + "loss": 0.5625, + "step": 713 + }, + { + "epoch": 0.59, + "grad_norm": 1.8485080870897803, + "learning_rate": 4.013025296420394e-06, + "loss": 0.5585, + "step": 714 + }, + { + "epoch": 0.59, + "grad_norm": 1.8099669115387913, + "learning_rate": 4.010421053036481e-06, + "loss": 0.5384, + "step": 715 + }, + { + "epoch": 0.59, + "grad_norm": 1.8810123612010912, + "learning_rate": 4.007814225895321e-06, + "loss": 0.5589, + "step": 716 + }, + { + "epoch": 0.59, + "grad_norm": 1.8692823610937885, + "learning_rate": 4.005204819456205e-06, + "loss": 0.5474, + "step": 717 + }, + { + "epoch": 0.6, + "grad_norm": 1.8120887102918588, + "learning_rate": 4.00259283818284e-06, + "loss": 0.5138, + "step": 718 + }, + { + "epoch": 0.6, + "grad_norm": 1.7933926935301234, + "learning_rate": 3.999978286543331e-06, + "loss": 0.5235, + "step": 719 + }, + { + "epoch": 0.6, + "grad_norm": 1.8382360731306235, + "learning_rate": 3.997361169010187e-06, + "loss": 0.5846, + "step": 720 + }, + { + "epoch": 0.6, + "grad_norm": 1.993925306673069, + "learning_rate": 3.994741490060301e-06, + "loss": 0.5561, + "step": 721 + }, + { + "epoch": 0.6, + "grad_norm": 1.900088669959918, + "learning_rate": 3.9921192541749505e-06, + "loss": 0.5215, + "step": 722 + }, + { + "epoch": 0.6, + "grad_norm": 1.9250072769385074, + "learning_rate": 3.989494465839785e-06, + "loss": 0.54, + "step": 723 + }, + { + "epoch": 0.6, + "grad_norm": 1.7928905908766457, + "learning_rate": 3.986867129544822e-06, + "loss": 0.6066, + "step": 724 + }, + { + "epoch": 0.6, + "grad_norm": 1.9474900039545116, + "learning_rate": 3.984237249784437e-06, + "loss": 0.5173, + "step": 725 + }, + { + "epoch": 0.6, + "grad_norm": 1.9004077336349998, + "learning_rate": 3.981604831057357e-06, + "loss": 0.5409, + "step": 726 + }, + { + "epoch": 0.6, + "grad_norm": 1.7573843693188624, + "learning_rate": 3.97896987786665e-06, + "loss": 0.5239, + "step": 727 + }, + { + "epoch": 0.6, + "grad_norm": 1.899283660379949, + "learning_rate": 3.976332394719721e-06, + "loss": 0.4977, + "step": 728 + }, + { + "epoch": 0.6, + "grad_norm": 1.8353476568345033, + "learning_rate": 3.973692386128304e-06, + "loss": 0.5834, + "step": 729 + }, + { + "epoch": 0.61, + "grad_norm": 2.032325534167748, + "learning_rate": 3.971049856608451e-06, + "loss": 0.5343, + "step": 730 + }, + { + "epoch": 0.61, + "grad_norm": 1.8161347764383835, + "learning_rate": 3.9684048106805286e-06, + "loss": 0.585, + "step": 731 + }, + { + "epoch": 0.61, + "grad_norm": 1.836376388525165, + "learning_rate": 3.965757252869204e-06, + "loss": 0.5978, + "step": 732 + }, + { + "epoch": 0.61, + "grad_norm": 1.889118862096067, + "learning_rate": 3.963107187703446e-06, + "loss": 0.5393, + "step": 733 + }, + { + "epoch": 0.61, + "grad_norm": 1.7772829607776217, + "learning_rate": 3.96045461971651e-06, + "loss": 0.5164, + "step": 734 + }, + { + "epoch": 0.61, + "grad_norm": 1.7980410807492582, + "learning_rate": 3.957799553445932e-06, + "loss": 0.5455, + "step": 735 + }, + { + "epoch": 0.61, + "grad_norm": 1.907936099702467, + "learning_rate": 3.955141993433526e-06, + "loss": 0.532, + "step": 736 + }, + { + "epoch": 0.61, + "grad_norm": 1.8668064740862462, + "learning_rate": 3.9524819442253645e-06, + "loss": 0.5578, + "step": 737 + }, + { + "epoch": 0.61, + "grad_norm": 1.838952740378055, + "learning_rate": 3.949819410371785e-06, + "loss": 0.5784, + "step": 738 + }, + { + "epoch": 0.61, + "grad_norm": 1.9595767898211005, + "learning_rate": 3.947154396427373e-06, + "loss": 0.5213, + "step": 739 + }, + { + "epoch": 0.61, + "grad_norm": 1.9422968944070973, + "learning_rate": 3.944486906950954e-06, + "loss": 0.5709, + "step": 740 + }, + { + "epoch": 0.61, + "grad_norm": 1.760556693040696, + "learning_rate": 3.941816946505592e-06, + "loss": 0.5564, + "step": 741 + }, + { + "epoch": 0.62, + "grad_norm": 1.8054841879427592, + "learning_rate": 3.939144519658575e-06, + "loss": 0.5435, + "step": 742 + }, + { + "epoch": 0.62, + "grad_norm": 2.1072923992538, + "learning_rate": 3.936469630981412e-06, + "loss": 0.5622, + "step": 743 + }, + { + "epoch": 0.62, + "grad_norm": 1.711687978027928, + "learning_rate": 3.933792285049821e-06, + "loss": 0.5554, + "step": 744 + }, + { + "epoch": 0.62, + "grad_norm": 1.8166543944942228, + "learning_rate": 3.931112486443727e-06, + "loss": 0.5079, + "step": 745 + }, + { + "epoch": 0.62, + "grad_norm": 1.7923405334139695, + "learning_rate": 3.928430239747246e-06, + "loss": 0.5692, + "step": 746 + }, + { + "epoch": 0.62, + "grad_norm": 1.9611773239667012, + "learning_rate": 3.925745549548687e-06, + "loss": 0.5092, + "step": 747 + }, + { + "epoch": 0.62, + "grad_norm": 1.8440088039871827, + "learning_rate": 3.923058420440534e-06, + "loss": 0.5369, + "step": 748 + }, + { + "epoch": 0.62, + "grad_norm": 1.9272316571307881, + "learning_rate": 3.920368857019447e-06, + "loss": 0.5798, + "step": 749 + }, + { + "epoch": 0.62, + "grad_norm": 1.8248503445199376, + "learning_rate": 3.917676863886246e-06, + "loss": 0.5479, + "step": 750 + }, + { + "epoch": 0.62, + "grad_norm": 1.9200626612083824, + "learning_rate": 3.914982445645912e-06, + "loss": 0.549, + "step": 751 + }, + { + "epoch": 0.62, + "grad_norm": 1.8585556832275227, + "learning_rate": 3.91228560690757e-06, + "loss": 0.5283, + "step": 752 + }, + { + "epoch": 0.62, + "grad_norm": 1.819239895382093, + "learning_rate": 3.90958635228449e-06, + "loss": 0.535, + "step": 753 + }, + { + "epoch": 0.63, + "grad_norm": 1.7810389942543545, + "learning_rate": 3.90688468639407e-06, + "loss": 0.5125, + "step": 754 + }, + { + "epoch": 0.63, + "grad_norm": 1.9614453700373935, + "learning_rate": 3.904180613857837e-06, + "loss": 0.5406, + "step": 755 + }, + { + "epoch": 0.63, + "grad_norm": 1.805104940263808, + "learning_rate": 3.901474139301433e-06, + "loss": 0.5794, + "step": 756 + }, + { + "epoch": 0.63, + "grad_norm": 1.78756289235025, + "learning_rate": 3.898765267354607e-06, + "loss": 0.569, + "step": 757 + }, + { + "epoch": 0.63, + "grad_norm": 1.912300438003516, + "learning_rate": 3.896054002651213e-06, + "loss": 0.5565, + "step": 758 + }, + { + "epoch": 0.63, + "grad_norm": 1.8148356694353722, + "learning_rate": 3.893340349829195e-06, + "loss": 0.5471, + "step": 759 + }, + { + "epoch": 0.63, + "grad_norm": 1.6836223387492706, + "learning_rate": 3.890624313530583e-06, + "loss": 0.5145, + "step": 760 + }, + { + "epoch": 0.63, + "grad_norm": 1.8389298216964765, + "learning_rate": 3.887905898401485e-06, + "loss": 0.5441, + "step": 761 + }, + { + "epoch": 0.63, + "grad_norm": 1.7845754057436856, + "learning_rate": 3.885185109092078e-06, + "loss": 0.5478, + "step": 762 + }, + { + "epoch": 0.63, + "grad_norm": 1.77076035925993, + "learning_rate": 3.882461950256598e-06, + "loss": 0.5497, + "step": 763 + }, + { + "epoch": 0.63, + "grad_norm": 1.8011284465286703, + "learning_rate": 3.87973642655334e-06, + "loss": 0.5039, + "step": 764 + }, + { + "epoch": 0.63, + "grad_norm": 1.7400129481667248, + "learning_rate": 3.877008542644637e-06, + "loss": 0.5243, + "step": 765 + }, + { + "epoch": 0.64, + "grad_norm": 1.9899565111682327, + "learning_rate": 3.874278303196866e-06, + "loss": 0.5767, + "step": 766 + }, + { + "epoch": 0.64, + "grad_norm": 1.8345576263874734, + "learning_rate": 3.871545712880429e-06, + "loss": 0.5262, + "step": 767 + }, + { + "epoch": 0.64, + "grad_norm": 1.8375211207672395, + "learning_rate": 3.8688107763697505e-06, + "loss": 0.5467, + "step": 768 + }, + { + "epoch": 0.64, + "grad_norm": 1.8068462280574835, + "learning_rate": 3.8660734983432715e-06, + "loss": 0.5256, + "step": 769 + }, + { + "epoch": 0.64, + "grad_norm": 1.7823522202158735, + "learning_rate": 3.863333883483433e-06, + "loss": 0.5419, + "step": 770 + }, + { + "epoch": 0.64, + "grad_norm": 1.8881514180214427, + "learning_rate": 3.86059193647668e-06, + "loss": 0.541, + "step": 771 + }, + { + "epoch": 0.64, + "grad_norm": 1.8311064595650786, + "learning_rate": 3.85784766201344e-06, + "loss": 0.5455, + "step": 772 + }, + { + "epoch": 0.64, + "grad_norm": 1.9833459774866717, + "learning_rate": 3.855101064788126e-06, + "loss": 0.5723, + "step": 773 + }, + { + "epoch": 0.64, + "grad_norm": 1.7968096633022903, + "learning_rate": 3.852352149499125e-06, + "loss": 0.5153, + "step": 774 + }, + { + "epoch": 0.64, + "grad_norm": 1.775423895652992, + "learning_rate": 3.849600920848787e-06, + "loss": 0.5134, + "step": 775 + }, + { + "epoch": 0.64, + "grad_norm": 1.7262892998825556, + "learning_rate": 3.84684738354342e-06, + "loss": 0.5287, + "step": 776 + }, + { + "epoch": 0.64, + "grad_norm": 1.7866135638778051, + "learning_rate": 3.84409154229328e-06, + "loss": 0.57, + "step": 777 + }, + { + "epoch": 0.64, + "grad_norm": 1.787377916112687, + "learning_rate": 3.841333401812569e-06, + "loss": 0.5312, + "step": 778 + }, + { + "epoch": 0.65, + "grad_norm": 1.684801862246949, + "learning_rate": 3.838572966819416e-06, + "loss": 0.5822, + "step": 779 + }, + { + "epoch": 0.65, + "grad_norm": 1.79074773131748, + "learning_rate": 3.835810242035879e-06, + "loss": 0.5651, + "step": 780 + }, + { + "epoch": 0.65, + "grad_norm": 1.9234904827178134, + "learning_rate": 3.8330452321879305e-06, + "loss": 0.5527, + "step": 781 + }, + { + "epoch": 0.65, + "grad_norm": 2.1733402579018186, + "learning_rate": 3.830277942005455e-06, + "loss": 0.5545, + "step": 782 + }, + { + "epoch": 0.65, + "grad_norm": 2.112229504682016, + "learning_rate": 3.827508376222233e-06, + "loss": 0.5766, + "step": 783 + }, + { + "epoch": 0.65, + "grad_norm": 2.087174122744587, + "learning_rate": 3.824736539575944e-06, + "loss": 0.549, + "step": 784 + }, + { + "epoch": 0.65, + "grad_norm": 1.9570382810890106, + "learning_rate": 3.821962436808145e-06, + "loss": 0.4984, + "step": 785 + }, + { + "epoch": 0.65, + "grad_norm": 1.94720853153738, + "learning_rate": 3.819186072664277e-06, + "loss": 0.5303, + "step": 786 + }, + { + "epoch": 0.65, + "grad_norm": 2.21095404069362, + "learning_rate": 3.816407451893643e-06, + "loss": 0.5674, + "step": 787 + }, + { + "epoch": 0.65, + "grad_norm": 1.7284336698899117, + "learning_rate": 3.8136265792494094e-06, + "loss": 0.5952, + "step": 788 + }, + { + "epoch": 0.65, + "grad_norm": 1.940869697529687, + "learning_rate": 3.8108434594885934e-06, + "loss": 0.5198, + "step": 789 + }, + { + "epoch": 0.65, + "grad_norm": 1.9282749931884566, + "learning_rate": 3.808058097372057e-06, + "loss": 0.5499, + "step": 790 + }, + { + "epoch": 0.66, + "grad_norm": 2.0180195532646983, + "learning_rate": 3.8052704976644984e-06, + "loss": 0.5117, + "step": 791 + }, + { + "epoch": 0.66, + "grad_norm": 1.8303561179366206, + "learning_rate": 3.8024806651344424e-06, + "loss": 0.5034, + "step": 792 + }, + { + "epoch": 0.66, + "grad_norm": 2.0584295539484754, + "learning_rate": 3.7996886045542335e-06, + "loss": 0.5391, + "step": 793 + }, + { + "epoch": 0.66, + "grad_norm": 1.7736893833047733, + "learning_rate": 3.7968943207000284e-06, + "loss": 0.5378, + "step": 794 + }, + { + "epoch": 0.66, + "grad_norm": 1.7840353008162277, + "learning_rate": 3.794097818351786e-06, + "loss": 0.5091, + "step": 795 + }, + { + "epoch": 0.66, + "grad_norm": 2.0949100717616225, + "learning_rate": 3.791299102293261e-06, + "loss": 0.5731, + "step": 796 + }, + { + "epoch": 0.66, + "grad_norm": 2.048353193294094, + "learning_rate": 3.7884981773119943e-06, + "loss": 0.5576, + "step": 797 + }, + { + "epoch": 0.66, + "grad_norm": 1.9990070284918733, + "learning_rate": 3.7856950481993054e-06, + "loss": 0.5297, + "step": 798 + }, + { + "epoch": 0.66, + "grad_norm": 1.859560152641746, + "learning_rate": 3.7828897197502856e-06, + "loss": 0.5131, + "step": 799 + }, + { + "epoch": 0.66, + "grad_norm": 2.0054802770873916, + "learning_rate": 3.780082196763785e-06, + "loss": 0.5428, + "step": 800 + }, + { + "epoch": 0.66, + "grad_norm": 1.8985367093585213, + "learning_rate": 3.7772724840424126e-06, + "loss": 0.5206, + "step": 801 + }, + { + "epoch": 0.66, + "grad_norm": 1.9964704653764362, + "learning_rate": 3.774460586392519e-06, + "loss": 0.5929, + "step": 802 + }, + { + "epoch": 0.67, + "grad_norm": 1.7572936836574113, + "learning_rate": 3.771646508624194e-06, + "loss": 0.5428, + "step": 803 + }, + { + "epoch": 0.67, + "grad_norm": 1.9623695483620975, + "learning_rate": 3.768830255551258e-06, + "loss": 0.5685, + "step": 804 + }, + { + "epoch": 0.67, + "grad_norm": 1.9663290616402378, + "learning_rate": 3.76601183199125e-06, + "loss": 0.5351, + "step": 805 + }, + { + "epoch": 0.67, + "grad_norm": 1.7876590847889615, + "learning_rate": 3.763191242765424e-06, + "loss": 0.567, + "step": 806 + }, + { + "epoch": 0.67, + "grad_norm": 1.8500820456277005, + "learning_rate": 3.7603684926987383e-06, + "loss": 0.523, + "step": 807 + }, + { + "epoch": 0.67, + "grad_norm": 2.041973125533567, + "learning_rate": 3.757543586619845e-06, + "loss": 0.5531, + "step": 808 + }, + { + "epoch": 0.67, + "grad_norm": 1.7440376746222928, + "learning_rate": 3.754716529361089e-06, + "loss": 0.4913, + "step": 809 + }, + { + "epoch": 0.67, + "grad_norm": 1.7910937306897654, + "learning_rate": 3.7518873257584897e-06, + "loss": 0.5128, + "step": 810 + }, + { + "epoch": 0.67, + "grad_norm": 1.9334392608388238, + "learning_rate": 3.7490559806517434e-06, + "loss": 0.5861, + "step": 811 + }, + { + "epoch": 0.67, + "grad_norm": 2.0003597857127673, + "learning_rate": 3.746222498884206e-06, + "loss": 0.5535, + "step": 812 + }, + { + "epoch": 0.67, + "grad_norm": 1.7964615198133413, + "learning_rate": 3.74338688530289e-06, + "loss": 0.5409, + "step": 813 + }, + { + "epoch": 0.67, + "grad_norm": 1.7726488990007383, + "learning_rate": 3.740549144758453e-06, + "loss": 0.5714, + "step": 814 + }, + { + "epoch": 0.68, + "grad_norm": 1.9080323144095523, + "learning_rate": 3.737709282105193e-06, + "loss": 0.5534, + "step": 815 + }, + { + "epoch": 0.68, + "grad_norm": 1.9612361354867969, + "learning_rate": 3.734867302201038e-06, + "loss": 0.5282, + "step": 816 + }, + { + "epoch": 0.68, + "grad_norm": 1.873254058551618, + "learning_rate": 3.7320232099075363e-06, + "loss": 0.5422, + "step": 817 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383882069199007, + "learning_rate": 3.7291770100898508e-06, + "loss": 0.5588, + "step": 818 + }, + { + "epoch": 0.68, + "grad_norm": 2.0137053963220835, + "learning_rate": 3.726328707616749e-06, + "loss": 0.5895, + "step": 819 + }, + { + "epoch": 0.68, + "grad_norm": 1.8207549211692964, + "learning_rate": 3.7234783073605957e-06, + "loss": 0.5428, + "step": 820 + }, + { + "epoch": 0.68, + "grad_norm": 1.7929761418069659, + "learning_rate": 3.7206258141973445e-06, + "loss": 0.555, + "step": 821 + }, + { + "epoch": 0.68, + "grad_norm": 1.8863691259545465, + "learning_rate": 3.7177712330065285e-06, + "loss": 0.5802, + "step": 822 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383911000943605, + "learning_rate": 3.714914568671252e-06, + "loss": 0.4986, + "step": 823 + }, + { + "epoch": 0.68, + "grad_norm": 2.0032777947804044, + "learning_rate": 3.7120558260781846e-06, + "loss": 0.6456, + "step": 824 + }, + { + "epoch": 0.68, + "grad_norm": 1.733320874844507, + "learning_rate": 3.709195010117551e-06, + "loss": 0.5146, + "step": 825 + }, + { + "epoch": 0.68, + "grad_norm": 1.7411187007421471, + "learning_rate": 3.7063321256831193e-06, + "loss": 0.5297, + "step": 826 + }, + { + "epoch": 0.69, + "grad_norm": 1.8334107493901353, + "learning_rate": 3.7034671776722003e-06, + "loss": 0.545, + "step": 827 + }, + { + "epoch": 0.69, + "grad_norm": 1.931467221651553, + "learning_rate": 3.7006001709856314e-06, + "loss": 0.579, + "step": 828 + }, + { + "epoch": 0.69, + "grad_norm": 1.799522216655623, + "learning_rate": 3.697731110527774e-06, + "loss": 0.5453, + "step": 829 + }, + { + "epoch": 0.69, + "grad_norm": 1.8098119388805842, + "learning_rate": 3.6948600012065016e-06, + "loss": 0.5186, + "step": 830 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419013342395714, + "learning_rate": 3.6919868479331934e-06, + "loss": 0.4833, + "step": 831 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419148322752323, + "learning_rate": 3.6891116556227234e-06, + "loss": 0.5479, + "step": 832 + }, + { + "epoch": 0.69, + "grad_norm": 1.7858200344474908, + "learning_rate": 3.6862344291934545e-06, + "loss": 0.5264, + "step": 833 + }, + { + "epoch": 0.69, + "grad_norm": 1.8057437623830686, + "learning_rate": 3.6833551735672293e-06, + "loss": 0.5208, + "step": 834 + }, + { + "epoch": 0.69, + "grad_norm": 1.8570584000334132, + "learning_rate": 3.6804738936693617e-06, + "loss": 0.5652, + "step": 835 + }, + { + "epoch": 0.69, + "grad_norm": 1.7961732805960369, + "learning_rate": 3.677590594428629e-06, + "loss": 0.5693, + "step": 836 + }, + { + "epoch": 0.69, + "grad_norm": 1.954108513879844, + "learning_rate": 3.6747052807772614e-06, + "loss": 0.5673, + "step": 837 + }, + { + "epoch": 0.69, + "grad_norm": 1.834152772161213, + "learning_rate": 3.671817957650936e-06, + "loss": 0.5118, + "step": 838 + }, + { + "epoch": 0.7, + "grad_norm": 1.8035026424969205, + "learning_rate": 3.6689286299887663e-06, + "loss": 0.5778, + "step": 839 + }, + { + "epoch": 0.7, + "grad_norm": 1.7862771700309947, + "learning_rate": 3.666037302733295e-06, + "loss": 0.5575, + "step": 840 + }, + { + "epoch": 0.7, + "grad_norm": 1.7398650592861555, + "learning_rate": 3.6631439808304874e-06, + "loss": 0.5323, + "step": 841 + }, + { + "epoch": 0.7, + "grad_norm": 1.7082885736006344, + "learning_rate": 3.6602486692297183e-06, + "loss": 0.543, + "step": 842 + }, + { + "epoch": 0.7, + "grad_norm": 1.8242434568233548, + "learning_rate": 3.6573513728837685e-06, + "loss": 0.5579, + "step": 843 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305967806472925, + "learning_rate": 3.6544520967488108e-06, + "loss": 0.5425, + "step": 844 + }, + { + "epoch": 0.7, + "grad_norm": 1.7126995402462595, + "learning_rate": 3.651550845784407e-06, + "loss": 0.5399, + "step": 845 + }, + { + "epoch": 0.7, + "grad_norm": 1.992190051239983, + "learning_rate": 3.648647624953496e-06, + "loss": 0.5951, + "step": 846 + }, + { + "epoch": 0.7, + "grad_norm": 1.9362402903409848, + "learning_rate": 3.6457424392223885e-06, + "loss": 0.5427, + "step": 847 + }, + { + "epoch": 0.7, + "grad_norm": 1.7390586845081806, + "learning_rate": 3.642835293560754e-06, + "loss": 0.5269, + "step": 848 + }, + { + "epoch": 0.7, + "grad_norm": 1.8601747321693383, + "learning_rate": 3.639926192941615e-06, + "loss": 0.5246, + "step": 849 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305054240762129, + "learning_rate": 3.6370151423413396e-06, + "loss": 0.562, + "step": 850 + }, + { + "epoch": 0.71, + "grad_norm": 1.8361711553327809, + "learning_rate": 3.6341021467396296e-06, + "loss": 0.5066, + "step": 851 + }, + { + "epoch": 0.71, + "grad_norm": 1.9202617492772214, + "learning_rate": 3.6311872111195163e-06, + "loss": 0.5755, + "step": 852 + }, + { + "epoch": 0.71, + "grad_norm": 1.9056266366653432, + "learning_rate": 3.628270340467348e-06, + "loss": 0.5193, + "step": 853 + }, + { + "epoch": 0.71, + "grad_norm": 1.9700971504271882, + "learning_rate": 3.625351539772783e-06, + "loss": 0.5499, + "step": 854 + }, + { + "epoch": 0.71, + "grad_norm": 1.7142305580780086, + "learning_rate": 3.6224308140287818e-06, + "loss": 0.5597, + "step": 855 + }, + { + "epoch": 0.71, + "grad_norm": 1.7897876492593174, + "learning_rate": 3.6195081682315972e-06, + "loss": 0.5347, + "step": 856 + }, + { + "epoch": 0.71, + "grad_norm": 2.191923699092432, + "learning_rate": 3.616583607380769e-06, + "loss": 0.5251, + "step": 857 + }, + { + "epoch": 0.71, + "grad_norm": 1.8582876176666503, + "learning_rate": 3.61365713647911e-06, + "loss": 0.5067, + "step": 858 + }, + { + "epoch": 0.71, + "grad_norm": 1.991617360171558, + "learning_rate": 3.610728760532701e-06, + "loss": 0.6464, + "step": 859 + }, + { + "epoch": 0.71, + "grad_norm": 1.892621069660817, + "learning_rate": 3.607798484550881e-06, + "loss": 0.5145, + "step": 860 + }, + { + "epoch": 0.71, + "grad_norm": 1.7592963181570629, + "learning_rate": 3.6048663135462423e-06, + "loss": 0.5297, + "step": 861 + }, + { + "epoch": 0.71, + "grad_norm": 2.020192040751123, + "learning_rate": 3.6019322525346157e-06, + "loss": 0.5709, + "step": 862 + }, + { + "epoch": 0.72, + "grad_norm": 1.8575959680616767, + "learning_rate": 3.598996306535067e-06, + "loss": 0.5946, + "step": 863 + }, + { + "epoch": 0.72, + "grad_norm": 1.9638758131071599, + "learning_rate": 3.5960584805698845e-06, + "loss": 0.4833, + "step": 864 + }, + { + "epoch": 0.72, + "grad_norm": 1.7517341191956926, + "learning_rate": 3.593118779664574e-06, + "loss": 0.5439, + "step": 865 + }, + { + "epoch": 0.72, + "grad_norm": 1.7637144330636925, + "learning_rate": 3.590177208847848e-06, + "loss": 0.4898, + "step": 866 + }, + { + "epoch": 0.72, + "grad_norm": 2.107899096934758, + "learning_rate": 3.5872337731516186e-06, + "loss": 0.5332, + "step": 867 + }, + { + "epoch": 0.72, + "grad_norm": 2.016493645108941, + "learning_rate": 3.5842884776109875e-06, + "loss": 0.5313, + "step": 868 + }, + { + "epoch": 0.72, + "grad_norm": 1.8758602544873038, + "learning_rate": 3.581341327264236e-06, + "loss": 0.554, + "step": 869 + }, + { + "epoch": 0.72, + "grad_norm": 1.8566881639083022, + "learning_rate": 3.5783923271528222e-06, + "loss": 0.5322, + "step": 870 + }, + { + "epoch": 0.72, + "grad_norm": 1.9151838907738468, + "learning_rate": 3.5754414823213647e-06, + "loss": 0.5306, + "step": 871 + }, + { + "epoch": 0.72, + "grad_norm": 1.7893407766785276, + "learning_rate": 3.572488797817639e-06, + "loss": 0.5226, + "step": 872 + }, + { + "epoch": 0.72, + "grad_norm": 1.908122661974681, + "learning_rate": 3.569534278692569e-06, + "loss": 0.5132, + "step": 873 + }, + { + "epoch": 0.72, + "grad_norm": 1.9052513037253582, + "learning_rate": 3.5665779300002144e-06, + "loss": 0.513, + "step": 874 + }, + { + "epoch": 0.73, + "grad_norm": 1.7876914527016339, + "learning_rate": 3.563619756797767e-06, + "loss": 0.5627, + "step": 875 + }, + { + "epoch": 0.73, + "grad_norm": 1.9607045801516068, + "learning_rate": 3.5606597641455387e-06, + "loss": 0.4986, + "step": 876 + }, + { + "epoch": 0.73, + "grad_norm": 1.701462749441997, + "learning_rate": 3.5576979571069527e-06, + "loss": 0.5306, + "step": 877 + }, + { + "epoch": 0.73, + "grad_norm": 1.8413701238351416, + "learning_rate": 3.554734340748538e-06, + "loss": 0.5602, + "step": 878 + }, + { + "epoch": 0.73, + "grad_norm": 1.8762306249541667, + "learning_rate": 3.5517689201399162e-06, + "loss": 0.5663, + "step": 879 + }, + { + "epoch": 0.73, + "grad_norm": 1.833164968453507, + "learning_rate": 3.5488017003537977e-06, + "loss": 0.5264, + "step": 880 + }, + { + "epoch": 0.73, + "grad_norm": 1.766302763247428, + "learning_rate": 3.5458326864659687e-06, + "loss": 0.5498, + "step": 881 + }, + { + "epoch": 0.73, + "grad_norm": 1.821883208129187, + "learning_rate": 3.5428618835552867e-06, + "loss": 0.5468, + "step": 882 + }, + { + "epoch": 0.73, + "grad_norm": 1.7773758034614335, + "learning_rate": 3.5398892967036674e-06, + "loss": 0.505, + "step": 883 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248820711070537, + "learning_rate": 3.5369149309960783e-06, + "loss": 0.5679, + "step": 884 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248114104788378, + "learning_rate": 3.5339387915205305e-06, + "loss": 0.5351, + "step": 885 + }, + { + "epoch": 0.73, + "grad_norm": 2.00472132505421, + "learning_rate": 3.53096088336807e-06, + "loss": 0.5637, + "step": 886 + }, + { + "epoch": 0.74, + "grad_norm": 2.0594957277906656, + "learning_rate": 3.5279812116327667e-06, + "loss": 0.567, + "step": 887 + }, + { + "epoch": 0.74, + "grad_norm": 1.916227169502353, + "learning_rate": 3.5249997814117098e-06, + "loss": 0.5733, + "step": 888 + }, + { + "epoch": 0.74, + "grad_norm": 1.7595020268824906, + "learning_rate": 3.5220165978049937e-06, + "loss": 0.5512, + "step": 889 + }, + { + "epoch": 0.74, + "grad_norm": 1.8259487385184114, + "learning_rate": 3.5190316659157126e-06, + "loss": 0.5332, + "step": 890 + }, + { + "epoch": 0.74, + "grad_norm": 1.8216813752485344, + "learning_rate": 3.5160449908499538e-06, + "loss": 0.5718, + "step": 891 + }, + { + "epoch": 0.74, + "grad_norm": 1.8497964997952454, + "learning_rate": 3.5130565777167845e-06, + "loss": 0.5179, + "step": 892 + }, + { + "epoch": 0.74, + "grad_norm": 1.8242356367817554, + "learning_rate": 3.5100664316282464e-06, + "loss": 0.5587, + "step": 893 + }, + { + "epoch": 0.74, + "grad_norm": 1.7793507179190546, + "learning_rate": 3.5070745576993428e-06, + "loss": 0.5924, + "step": 894 + }, + { + "epoch": 0.74, + "grad_norm": 1.920176905610262, + "learning_rate": 3.5040809610480364e-06, + "loss": 0.5579, + "step": 895 + }, + { + "epoch": 0.74, + "grad_norm": 1.954421523744336, + "learning_rate": 3.5010856467952335e-06, + "loss": 0.5496, + "step": 896 + }, + { + "epoch": 0.74, + "grad_norm": 1.7785169911731862, + "learning_rate": 3.4980886200647817e-06, + "loss": 0.5383, + "step": 897 + }, + { + "epoch": 0.74, + "grad_norm": 1.853827977546151, + "learning_rate": 3.4950898859834555e-06, + "loss": 0.5501, + "step": 898 + }, + { + "epoch": 0.75, + "grad_norm": 1.9882198198152168, + "learning_rate": 3.4920894496809515e-06, + "loss": 0.5557, + "step": 899 + }, + { + "epoch": 0.75, + "grad_norm": 1.98090605107646, + "learning_rate": 3.489087316289877e-06, + "loss": 0.5661, + "step": 900 + }, + { + "epoch": 0.75, + "grad_norm": 2.0027723691714785, + "learning_rate": 3.486083490945743e-06, + "loss": 0.4791, + "step": 901 + }, + { + "epoch": 0.75, + "grad_norm": 2.0183911897675015, + "learning_rate": 3.4830779787869555e-06, + "loss": 0.5386, + "step": 902 + }, + { + "epoch": 0.75, + "grad_norm": 1.9385976919386894, + "learning_rate": 3.480070784954805e-06, + "loss": 0.5351, + "step": 903 + }, + { + "epoch": 0.75, + "grad_norm": 1.7612550957325825, + "learning_rate": 3.4770619145934586e-06, + "loss": 0.511, + "step": 904 + }, + { + "epoch": 0.75, + "grad_norm": 1.8677538420589843, + "learning_rate": 3.4740513728499515e-06, + "loss": 0.5942, + "step": 905 + }, + { + "epoch": 0.75, + "grad_norm": 1.9208446249900946, + "learning_rate": 3.4710391648741787e-06, + "loss": 0.5146, + "step": 906 + }, + { + "epoch": 0.75, + "grad_norm": 1.8008673055527855, + "learning_rate": 3.468025295818885e-06, + "loss": 0.5909, + "step": 907 + }, + { + "epoch": 0.75, + "grad_norm": 1.891052390507894, + "learning_rate": 3.465009770839657e-06, + "loss": 0.5527, + "step": 908 + }, + { + "epoch": 0.75, + "grad_norm": 2.0521048489395435, + "learning_rate": 3.4619925950949126e-06, + "loss": 0.5756, + "step": 909 + }, + { + "epoch": 0.75, + "grad_norm": 2.003295441830653, + "learning_rate": 3.4589737737458946e-06, + "loss": 0.5299, + "step": 910 + }, + { + "epoch": 0.76, + "grad_norm": 1.7635851435542724, + "learning_rate": 3.4559533119566612e-06, + "loss": 0.5338, + "step": 911 + }, + { + "epoch": 0.76, + "grad_norm": 1.834326490517508, + "learning_rate": 3.4529312148940763e-06, + "loss": 0.56, + "step": 912 + }, + { + "epoch": 0.76, + "grad_norm": 1.8618427761057224, + "learning_rate": 3.4499074877278016e-06, + "loss": 0.5189, + "step": 913 + }, + { + "epoch": 0.76, + "grad_norm": 2.04459004844406, + "learning_rate": 3.446882135630286e-06, + "loss": 0.5765, + "step": 914 + }, + { + "epoch": 0.76, + "grad_norm": 1.7467595732765806, + "learning_rate": 3.4438551637767604e-06, + "loss": 0.5512, + "step": 915 + }, + { + "epoch": 0.76, + "grad_norm": 1.7952035114217406, + "learning_rate": 3.4408265773452226e-06, + "loss": 0.5348, + "step": 916 + }, + { + "epoch": 0.76, + "grad_norm": 1.8448198186244822, + "learning_rate": 3.4377963815164362e-06, + "loss": 0.5187, + "step": 917 + }, + { + "epoch": 0.76, + "grad_norm": 1.7738820116169103, + "learning_rate": 3.4347645814739156e-06, + "loss": 0.507, + "step": 918 + }, + { + "epoch": 0.76, + "grad_norm": 1.9699054774415494, + "learning_rate": 3.4317311824039216e-06, + "loss": 0.5175, + "step": 919 + }, + { + "epoch": 0.76, + "grad_norm": 1.7482905457169124, + "learning_rate": 3.4286961894954473e-06, + "loss": 0.5188, + "step": 920 + }, + { + "epoch": 0.76, + "grad_norm": 1.8012194296110113, + "learning_rate": 3.425659607940215e-06, + "loss": 0.5465, + "step": 921 + }, + { + "epoch": 0.76, + "grad_norm": 1.7978097428012587, + "learning_rate": 3.422621442932662e-06, + "loss": 0.5257, + "step": 922 + }, + { + "epoch": 0.77, + "grad_norm": 1.8534167116514217, + "learning_rate": 3.419581699669937e-06, + "loss": 0.536, + "step": 923 + }, + { + "epoch": 0.77, + "grad_norm": 1.7733377878036733, + "learning_rate": 3.416540383351888e-06, + "loss": 0.5632, + "step": 924 + }, + { + "epoch": 0.77, + "grad_norm": 1.8124786776539388, + "learning_rate": 3.4134974991810503e-06, + "loss": 0.5471, + "step": 925 + }, + { + "epoch": 0.77, + "grad_norm": 1.8553271859579439, + "learning_rate": 3.4104530523626463e-06, + "loss": 0.538, + "step": 926 + }, + { + "epoch": 0.77, + "grad_norm": 1.8888926038913822, + "learning_rate": 3.4074070481045683e-06, + "loss": 0.4868, + "step": 927 + }, + { + "epoch": 0.77, + "grad_norm": 2.0158609319355505, + "learning_rate": 3.404359491617374e-06, + "loss": 0.5757, + "step": 928 + }, + { + "epoch": 0.77, + "grad_norm": 1.8376639720078027, + "learning_rate": 3.401310388114276e-06, + "loss": 0.5377, + "step": 929 + }, + { + "epoch": 0.77, + "grad_norm": 2.3651883595335232, + "learning_rate": 3.3982597428111336e-06, + "loss": 0.5536, + "step": 930 + }, + { + "epoch": 0.77, + "grad_norm": 1.908409388949023, + "learning_rate": 3.3952075609264423e-06, + "loss": 0.5349, + "step": 931 + }, + { + "epoch": 0.77, + "grad_norm": 1.8261622890952995, + "learning_rate": 3.3921538476813278e-06, + "loss": 0.4991, + "step": 932 + }, + { + "epoch": 0.77, + "grad_norm": 1.924034720876031, + "learning_rate": 3.3890986082995353e-06, + "loss": 0.536, + "step": 933 + }, + { + "epoch": 0.77, + "grad_norm": 1.829615974230478, + "learning_rate": 3.3860418480074188e-06, + "loss": 0.5163, + "step": 934 + }, + { + "epoch": 0.78, + "grad_norm": 1.7812992854973535, + "learning_rate": 3.3829835720339353e-06, + "loss": 0.5412, + "step": 935 + }, + { + "epoch": 0.78, + "grad_norm": 1.8270515542068861, + "learning_rate": 3.3799237856106348e-06, + "loss": 0.5459, + "step": 936 + }, + { + "epoch": 0.78, + "grad_norm": 1.8336967909163833, + "learning_rate": 3.3768624939716506e-06, + "loss": 0.5074, + "step": 937 + }, + { + "epoch": 0.78, + "grad_norm": 1.773892866992307, + "learning_rate": 3.373799702353691e-06, + "loss": 0.5457, + "step": 938 + }, + { + "epoch": 0.78, + "grad_norm": 1.8605607499004266, + "learning_rate": 3.370735415996031e-06, + "loss": 0.5691, + "step": 939 + }, + { + "epoch": 0.78, + "grad_norm": 1.7961529805945686, + "learning_rate": 3.3676696401405007e-06, + "loss": 0.5406, + "step": 940 + }, + { + "epoch": 0.78, + "grad_norm": 1.7406787561376078, + "learning_rate": 3.3646023800314792e-06, + "loss": 0.5297, + "step": 941 + }, + { + "epoch": 0.78, + "grad_norm": 1.9794693468141764, + "learning_rate": 3.361533640915885e-06, + "loss": 0.4765, + "step": 942 + }, + { + "epoch": 0.78, + "grad_norm": 1.820632707720892, + "learning_rate": 3.3584634280431657e-06, + "loss": 0.5395, + "step": 943 + }, + { + "epoch": 0.78, + "grad_norm": 1.8478126164835549, + "learning_rate": 3.3553917466652915e-06, + "loss": 0.5288, + "step": 944 + }, + { + "epoch": 0.78, + "grad_norm": 1.749509825583459, + "learning_rate": 3.352318602036742e-06, + "loss": 0.5343, + "step": 945 + }, + { + "epoch": 0.78, + "grad_norm": 1.8034305951190157, + "learning_rate": 3.3492439994145033e-06, + "loss": 0.5536, + "step": 946 + }, + { + "epoch": 0.79, + "grad_norm": 1.8172591817519397, + "learning_rate": 3.346167944058052e-06, + "loss": 0.5844, + "step": 947 + }, + { + "epoch": 0.79, + "grad_norm": 1.749562414198837, + "learning_rate": 3.3430904412293526e-06, + "loss": 0.4833, + "step": 948 + }, + { + "epoch": 0.79, + "grad_norm": 1.7243742428927225, + "learning_rate": 3.3400114961928444e-06, + "loss": 0.4828, + "step": 949 + }, + { + "epoch": 0.79, + "grad_norm": 1.757242299744874, + "learning_rate": 3.3369311142154337e-06, + "loss": 0.5282, + "step": 950 + }, + { + "epoch": 0.79, + "grad_norm": 2.036302581700697, + "learning_rate": 3.3338493005664853e-06, + "loss": 0.5315, + "step": 951 + }, + { + "epoch": 0.79, + "grad_norm": 1.886299636672335, + "learning_rate": 3.330766060517812e-06, + "loss": 0.5244, + "step": 952 + }, + { + "epoch": 0.79, + "grad_norm": 1.898853787733011, + "learning_rate": 3.3276813993436695e-06, + "loss": 0.5914, + "step": 953 + }, + { + "epoch": 0.79, + "grad_norm": 1.8359472984671243, + "learning_rate": 3.324595322320741e-06, + "loss": 0.5488, + "step": 954 + }, + { + "epoch": 0.79, + "grad_norm": 1.8768955168510497, + "learning_rate": 3.321507834728134e-06, + "loss": 0.5871, + "step": 955 + }, + { + "epoch": 0.79, + "grad_norm": 1.8358033818112791, + "learning_rate": 3.3184189418473674e-06, + "loss": 0.5632, + "step": 956 + }, + { + "epoch": 0.79, + "grad_norm": 1.792562502385941, + "learning_rate": 3.315328648962364e-06, + "loss": 0.4887, + "step": 957 + }, + { + "epoch": 0.79, + "grad_norm": 1.8732702930932368, + "learning_rate": 3.312236961359444e-06, + "loss": 0.5313, + "step": 958 + }, + { + "epoch": 0.8, + "grad_norm": 1.7708047128885986, + "learning_rate": 3.3091438843273115e-06, + "loss": 0.5348, + "step": 959 + }, + { + "epoch": 0.8, + "grad_norm": 1.9094434763935804, + "learning_rate": 3.3060494231570463e-06, + "loss": 0.5027, + "step": 960 + }, + { + "epoch": 0.8, + "grad_norm": 1.87927564418864, + "learning_rate": 3.3029535831420977e-06, + "loss": 0.511, + "step": 961 + }, + { + "epoch": 0.8, + "grad_norm": 1.717365559903535, + "learning_rate": 3.299856369578273e-06, + "loss": 0.5203, + "step": 962 + }, + { + "epoch": 0.8, + "grad_norm": 1.770779257052532, + "learning_rate": 3.2967577877637296e-06, + "loss": 0.5233, + "step": 963 + }, + { + "epoch": 0.8, + "grad_norm": 1.7541392466004568, + "learning_rate": 3.2936578429989653e-06, + "loss": 0.5013, + "step": 964 + }, + { + "epoch": 0.8, + "grad_norm": 1.7840578280891832, + "learning_rate": 3.290556540586809e-06, + "loss": 0.4844, + "step": 965 + }, + { + "epoch": 0.8, + "grad_norm": 1.7184305413001233, + "learning_rate": 3.287453885832413e-06, + "loss": 0.4694, + "step": 966 + }, + { + "epoch": 0.8, + "grad_norm": 1.8671517036325307, + "learning_rate": 3.2843498840432403e-06, + "loss": 0.4652, + "step": 967 + }, + { + "epoch": 0.8, + "grad_norm": 1.9960847871768508, + "learning_rate": 3.2812445405290612e-06, + "loss": 0.5906, + "step": 968 + }, + { + "epoch": 0.8, + "grad_norm": 1.7535227575839891, + "learning_rate": 3.27813786060194e-06, + "loss": 0.5482, + "step": 969 + }, + { + "epoch": 0.8, + "grad_norm": 1.929231862440999, + "learning_rate": 3.2750298495762278e-06, + "loss": 0.5334, + "step": 970 + }, + { + "epoch": 0.8, + "grad_norm": 1.7879676366114814, + "learning_rate": 3.2719205127685505e-06, + "loss": 0.515, + "step": 971 + }, + { + "epoch": 0.81, + "grad_norm": 1.7817120865072218, + "learning_rate": 3.2688098554978053e-06, + "loss": 0.5045, + "step": 972 + }, + { + "epoch": 0.81, + "grad_norm": 1.8725673808714274, + "learning_rate": 3.265697883085145e-06, + "loss": 0.5557, + "step": 973 + }, + { + "epoch": 0.81, + "grad_norm": 1.8554796275037901, + "learning_rate": 3.262584600853973e-06, + "loss": 0.5785, + "step": 974 + }, + { + "epoch": 0.81, + "grad_norm": 1.77078783324655, + "learning_rate": 3.259470014129936e-06, + "loss": 0.524, + "step": 975 + }, + { + "epoch": 0.81, + "grad_norm": 1.820843626030818, + "learning_rate": 3.256354128240907e-06, + "loss": 0.5144, + "step": 976 + }, + { + "epoch": 0.81, + "grad_norm": 1.9330495063889956, + "learning_rate": 3.253236948516987e-06, + "loss": 0.5405, + "step": 977 + }, + { + "epoch": 0.81, + "grad_norm": 1.9113413794485425, + "learning_rate": 3.2501184802904867e-06, + "loss": 0.5212, + "step": 978 + }, + { + "epoch": 0.81, + "grad_norm": 1.799188386703558, + "learning_rate": 3.2469987288959208e-06, + "loss": 0.5148, + "step": 979 + }, + { + "epoch": 0.81, + "grad_norm": 1.8610914183588203, + "learning_rate": 3.2438776996700023e-06, + "loss": 0.5363, + "step": 980 + }, + { + "epoch": 0.81, + "grad_norm": 1.8245263524947073, + "learning_rate": 3.240755397951625e-06, + "loss": 0.5216, + "step": 981 + }, + { + "epoch": 0.81, + "grad_norm": 1.7863270641417597, + "learning_rate": 3.2376318290818643e-06, + "loss": 0.5581, + "step": 982 + }, + { + "epoch": 0.81, + "grad_norm": 1.9266115141469626, + "learning_rate": 3.23450699840396e-06, + "loss": 0.5178, + "step": 983 + }, + { + "epoch": 0.82, + "grad_norm": 1.8044458399187253, + "learning_rate": 3.2313809112633133e-06, + "loss": 0.5252, + "step": 984 + }, + { + "epoch": 0.82, + "grad_norm": 1.8809392949423562, + "learning_rate": 3.2282535730074714e-06, + "loss": 0.486, + "step": 985 + }, + { + "epoch": 0.82, + "grad_norm": 1.9487997548787144, + "learning_rate": 3.2251249889861237e-06, + "loss": 0.5272, + "step": 986 + }, + { + "epoch": 0.82, + "grad_norm": 2.088279538426057, + "learning_rate": 3.2219951645510907e-06, + "loss": 0.5426, + "step": 987 + }, + { + "epoch": 0.82, + "grad_norm": 1.8280370745964312, + "learning_rate": 3.218864105056313e-06, + "loss": 0.5545, + "step": 988 + }, + { + "epoch": 0.82, + "grad_norm": 1.7678201455723743, + "learning_rate": 3.2157318158578473e-06, + "loss": 0.5476, + "step": 989 + }, + { + "epoch": 0.82, + "grad_norm": 1.708170466024094, + "learning_rate": 3.21259830231385e-06, + "loss": 0.5442, + "step": 990 + }, + { + "epoch": 0.82, + "grad_norm": 2.0427224573251483, + "learning_rate": 3.209463569784575e-06, + "loss": 0.5501, + "step": 991 + }, + { + "epoch": 0.82, + "grad_norm": 1.8557413526282036, + "learning_rate": 3.206327623632359e-06, + "loss": 0.5573, + "step": 992 + }, + { + "epoch": 0.82, + "grad_norm": 1.7138810851622357, + "learning_rate": 3.2031904692216153e-06, + "loss": 0.5267, + "step": 993 + }, + { + "epoch": 0.82, + "grad_norm": 1.9034028799031073, + "learning_rate": 3.2000521119188267e-06, + "loss": 0.5605, + "step": 994 + }, + { + "epoch": 0.82, + "grad_norm": 1.994571492675121, + "learning_rate": 3.1969125570925303e-06, + "loss": 0.53, + "step": 995 + }, + { + "epoch": 0.83, + "grad_norm": 1.771581881704634, + "learning_rate": 3.193771810113313e-06, + "loss": 0.6177, + "step": 996 + }, + { + "epoch": 0.83, + "grad_norm": 1.7808220445921694, + "learning_rate": 3.1906298763538005e-06, + "loss": 0.5215, + "step": 997 + }, + { + "epoch": 0.83, + "grad_norm": 1.8069794706642701, + "learning_rate": 3.1874867611886513e-06, + "loss": 0.5444, + "step": 998 + }, + { + "epoch": 0.83, + "grad_norm": 1.7806867210889854, + "learning_rate": 3.1843424699945403e-06, + "loss": 0.5471, + "step": 999 + }, + { + "epoch": 0.83, + "grad_norm": 1.7481554024627886, + "learning_rate": 3.1811970081501576e-06, + "loss": 0.5159, + "step": 1000 + }, + { + "epoch": 0.83, + "grad_norm": 1.8105318680671914, + "learning_rate": 3.1780503810361946e-06, + "loss": 0.4985, + "step": 1001 + }, + { + "epoch": 0.83, + "grad_norm": 1.7033701950072382, + "learning_rate": 3.1749025940353363e-06, + "loss": 0.5594, + "step": 1002 + }, + { + "epoch": 0.83, + "grad_norm": 2.3799847532384515, + "learning_rate": 3.1717536525322512e-06, + "loss": 0.5978, + "step": 1003 + }, + { + "epoch": 0.83, + "grad_norm": 1.7427559432173463, + "learning_rate": 3.1686035619135845e-06, + "loss": 0.5299, + "step": 1004 + }, + { + "epoch": 0.83, + "grad_norm": 1.7454547855925509, + "learning_rate": 3.1654523275679453e-06, + "loss": 0.5439, + "step": 1005 + }, + { + "epoch": 0.83, + "grad_norm": 1.7130931472340127, + "learning_rate": 3.162299954885899e-06, + "loss": 0.5379, + "step": 1006 + }, + { + "epoch": 0.83, + "grad_norm": 1.6940357366272063, + "learning_rate": 3.15914644925996e-06, + "loss": 0.5694, + "step": 1007 + }, + { + "epoch": 0.84, + "grad_norm": 1.8544220651543013, + "learning_rate": 3.1559918160845787e-06, + "loss": 0.5285, + "step": 1008 + }, + { + "epoch": 0.84, + "grad_norm": 1.8481774433371347, + "learning_rate": 3.1528360607561358e-06, + "loss": 0.5384, + "step": 1009 + }, + { + "epoch": 0.84, + "grad_norm": 1.8256828659009958, + "learning_rate": 3.149679188672932e-06, + "loss": 0.4806, + "step": 1010 + }, + { + "epoch": 0.84, + "grad_norm": 1.9380282822721238, + "learning_rate": 3.1465212052351766e-06, + "loss": 0.543, + "step": 1011 + }, + { + "epoch": 0.84, + "grad_norm": 1.985943690469791, + "learning_rate": 3.1433621158449807e-06, + "loss": 0.5549, + "step": 1012 + }, + { + "epoch": 0.84, + "grad_norm": 1.7038398790061953, + "learning_rate": 3.140201925906348e-06, + "loss": 0.4682, + "step": 1013 + }, + { + "epoch": 0.84, + "grad_norm": 1.8748481620529394, + "learning_rate": 3.1370406408251632e-06, + "loss": 0.5046, + "step": 1014 + }, + { + "epoch": 0.84, + "grad_norm": 1.7587036990451181, + "learning_rate": 3.133878266009186e-06, + "loss": 0.5203, + "step": 1015 + }, + { + "epoch": 0.84, + "grad_norm": 1.7503537433041947, + "learning_rate": 3.130714806868041e-06, + "loss": 0.5546, + "step": 1016 + }, + { + "epoch": 0.84, + "grad_norm": 1.7701505667314001, + "learning_rate": 3.127550268813205e-06, + "loss": 0.531, + "step": 1017 + }, + { + "epoch": 0.84, + "grad_norm": 1.771371589393474, + "learning_rate": 3.124384657258001e-06, + "loss": 0.5424, + "step": 1018 + }, + { + "epoch": 0.84, + "grad_norm": 1.8016015279719124, + "learning_rate": 3.1212179776175905e-06, + "loss": 0.5706, + "step": 1019 + }, + { + "epoch": 0.85, + "grad_norm": 1.810944889002695, + "learning_rate": 3.1180502353089598e-06, + "loss": 0.5502, + "step": 1020 + }, + { + "epoch": 0.85, + "grad_norm": 1.8062084514449492, + "learning_rate": 3.1148814357509147e-06, + "loss": 0.5337, + "step": 1021 + }, + { + "epoch": 0.85, + "grad_norm": 1.669643406466654, + "learning_rate": 3.111711584364068e-06, + "loss": 0.4802, + "step": 1022 + }, + { + "epoch": 0.85, + "grad_norm": 1.6852245083058144, + "learning_rate": 3.1085406865708333e-06, + "loss": 0.532, + "step": 1023 + }, + { + "epoch": 0.85, + "grad_norm": 1.8463748056800222, + "learning_rate": 3.1053687477954124e-06, + "loss": 0.5112, + "step": 1024 + }, + { + "epoch": 0.85, + "grad_norm": 1.7302148909577209, + "learning_rate": 3.10219577346379e-06, + "loss": 0.5549, + "step": 1025 + }, + { + "epoch": 0.85, + "grad_norm": 1.7752983463714818, + "learning_rate": 3.0990217690037206e-06, + "loss": 0.5606, + "step": 1026 + }, + { + "epoch": 0.85, + "grad_norm": 1.695119975844164, + "learning_rate": 3.09584673984472e-06, + "loss": 0.486, + "step": 1027 + }, + { + "epoch": 0.85, + "grad_norm": 1.793543444803663, + "learning_rate": 3.0926706914180605e-06, + "loss": 0.6474, + "step": 1028 + }, + { + "epoch": 0.85, + "grad_norm": 1.6954588940750932, + "learning_rate": 3.089493629156755e-06, + "loss": 0.5208, + "step": 1029 + }, + { + "epoch": 0.85, + "grad_norm": 1.9045089074493644, + "learning_rate": 3.08631555849555e-06, + "loss": 0.5291, + "step": 1030 + }, + { + "epoch": 0.85, + "grad_norm": 1.8481217904786489, + "learning_rate": 3.083136484870921e-06, + "loss": 0.5212, + "step": 1031 + }, + { + "epoch": 0.86, + "grad_norm": 1.6729420221561044, + "learning_rate": 3.0799564137210536e-06, + "loss": 0.5024, + "step": 1032 + }, + { + "epoch": 0.86, + "grad_norm": 1.8821832248249077, + "learning_rate": 3.076775350485845e-06, + "loss": 0.5459, + "step": 1033 + }, + { + "epoch": 0.86, + "grad_norm": 1.762473350167322, + "learning_rate": 3.0735933006068863e-06, + "loss": 0.4938, + "step": 1034 + }, + { + "epoch": 0.86, + "grad_norm": 1.7950707678098703, + "learning_rate": 3.0704102695274573e-06, + "loss": 0.4922, + "step": 1035 + }, + { + "epoch": 0.86, + "grad_norm": 1.6853644769275375, + "learning_rate": 3.0672262626925174e-06, + "loss": 0.47, + "step": 1036 + }, + { + "epoch": 0.86, + "grad_norm": 1.809909106997157, + "learning_rate": 3.0640412855486922e-06, + "loss": 0.5545, + "step": 1037 + }, + { + "epoch": 0.86, + "grad_norm": 2.019472393876661, + "learning_rate": 3.06085534354427e-06, + "loss": 0.5616, + "step": 1038 + }, + { + "epoch": 0.86, + "grad_norm": 1.7972785887075076, + "learning_rate": 3.057668442129188e-06, + "loss": 0.5269, + "step": 1039 + }, + { + "epoch": 0.86, + "grad_norm": 1.865555820217107, + "learning_rate": 3.054480586755026e-06, + "loss": 0.5752, + "step": 1040 + }, + { + "epoch": 0.86, + "grad_norm": 1.792147096098412, + "learning_rate": 3.051291782874995e-06, + "loss": 0.54, + "step": 1041 + }, + { + "epoch": 0.86, + "grad_norm": 1.8108893550848508, + "learning_rate": 3.048102035943927e-06, + "loss": 0.5367, + "step": 1042 + }, + { + "epoch": 0.86, + "grad_norm": 2.0966646553454793, + "learning_rate": 3.04491135141827e-06, + "loss": 0.5455, + "step": 1043 + }, + { + "epoch": 0.87, + "grad_norm": 1.7357403687049695, + "learning_rate": 3.041719734756073e-06, + "loss": 0.502, + "step": 1044 + }, + { + "epoch": 0.87, + "grad_norm": 1.8033826162723872, + "learning_rate": 3.038527191416982e-06, + "loss": 0.5644, + "step": 1045 + }, + { + "epoch": 0.87, + "grad_norm": 1.7822928111630525, + "learning_rate": 3.0353337268622267e-06, + "loss": 0.4938, + "step": 1046 + }, + { + "epoch": 0.87, + "grad_norm": 1.7910319343463081, + "learning_rate": 3.0321393465546134e-06, + "loss": 0.5889, + "step": 1047 + }, + { + "epoch": 0.87, + "grad_norm": 1.7457160087273953, + "learning_rate": 3.028944055958514e-06, + "loss": 0.5022, + "step": 1048 + }, + { + "epoch": 0.87, + "grad_norm": 1.691379648176161, + "learning_rate": 3.0257478605398595e-06, + "loss": 0.4841, + "step": 1049 + }, + { + "epoch": 0.87, + "grad_norm": 1.7452186987943483, + "learning_rate": 3.0225507657661257e-06, + "loss": 0.5626, + "step": 1050 + }, + { + "epoch": 0.87, + "grad_norm": 1.7578678635930594, + "learning_rate": 3.0193527771063297e-06, + "loss": 0.5115, + "step": 1051 + }, + { + "epoch": 0.87, + "grad_norm": 1.7879798898209605, + "learning_rate": 3.016153900031016e-06, + "loss": 0.5296, + "step": 1052 + }, + { + "epoch": 0.87, + "grad_norm": 1.6745604796677231, + "learning_rate": 3.0129541400122492e-06, + "loss": 0.5089, + "step": 1053 + }, + { + "epoch": 0.87, + "grad_norm": 1.8484438696306678, + "learning_rate": 3.0097535025236045e-06, + "loss": 0.6124, + "step": 1054 + }, + { + "epoch": 0.87, + "grad_norm": 1.8023880068850882, + "learning_rate": 3.0065519930401595e-06, + "loss": 0.4983, + "step": 1055 + }, + { + "epoch": 0.88, + "grad_norm": 1.743901583565096, + "learning_rate": 3.0033496170384803e-06, + "loss": 0.4998, + "step": 1056 + }, + { + "epoch": 0.88, + "grad_norm": 1.9494472820876043, + "learning_rate": 3.000146379996617e-06, + "loss": 0.537, + "step": 1057 + }, + { + "epoch": 0.88, + "grad_norm": 1.6992995489648048, + "learning_rate": 2.996942287394093e-06, + "loss": 0.5822, + "step": 1058 + }, + { + "epoch": 0.88, + "grad_norm": 1.8498288139189643, + "learning_rate": 2.993737344711895e-06, + "loss": 0.5651, + "step": 1059 + }, + { + "epoch": 0.88, + "grad_norm": 1.755920633785882, + "learning_rate": 2.990531557432464e-06, + "loss": 0.496, + "step": 1060 + }, + { + "epoch": 0.88, + "grad_norm": 1.7876484928074277, + "learning_rate": 2.9873249310396853e-06, + "loss": 0.5224, + "step": 1061 + }, + { + "epoch": 0.88, + "grad_norm": 1.7573987279473129, + "learning_rate": 2.98411747101888e-06, + "loss": 0.5228, + "step": 1062 + }, + { + "epoch": 0.88, + "grad_norm": 1.6995721104857204, + "learning_rate": 2.980909182856794e-06, + "loss": 0.4758, + "step": 1063 + }, + { + "epoch": 0.88, + "grad_norm": 1.907464743607936, + "learning_rate": 2.9777000720415916e-06, + "loss": 0.5254, + "step": 1064 + }, + { + "epoch": 0.88, + "grad_norm": 1.7921365259203703, + "learning_rate": 2.974490144062844e-06, + "loss": 0.5116, + "step": 1065 + }, + { + "epoch": 0.88, + "grad_norm": 1.9010192849593792, + "learning_rate": 2.9712794044115196e-06, + "loss": 0.5136, + "step": 1066 + }, + { + "epoch": 0.88, + "grad_norm": 1.742881813035793, + "learning_rate": 2.968067858579975e-06, + "loss": 0.5436, + "step": 1067 + }, + { + "epoch": 0.89, + "grad_norm": 1.7135933558215708, + "learning_rate": 2.964855512061947e-06, + "loss": 0.5268, + "step": 1068 + }, + { + "epoch": 0.89, + "grad_norm": 1.8360025545734582, + "learning_rate": 2.9616423703525414e-06, + "loss": 0.5238, + "step": 1069 + }, + { + "epoch": 0.89, + "grad_norm": 1.7090421713960848, + "learning_rate": 2.9584284389482237e-06, + "loss": 0.5051, + "step": 1070 + }, + { + "epoch": 0.89, + "grad_norm": 1.7462732547158757, + "learning_rate": 2.9552137233468113e-06, + "loss": 0.4838, + "step": 1071 + }, + { + "epoch": 0.89, + "grad_norm": 1.9336108910937513, + "learning_rate": 2.951998229047464e-06, + "loss": 0.5576, + "step": 1072 + }, + { + "epoch": 0.89, + "grad_norm": 1.784092660568157, + "learning_rate": 2.9487819615506702e-06, + "loss": 0.5349, + "step": 1073 + }, + { + "epoch": 0.89, + "grad_norm": 1.772640354616067, + "learning_rate": 2.945564926358245e-06, + "loss": 0.5423, + "step": 1074 + }, + { + "epoch": 0.89, + "grad_norm": 1.8491968859591044, + "learning_rate": 2.9423471289733125e-06, + "loss": 0.5453, + "step": 1075 + }, + { + "epoch": 0.89, + "grad_norm": 1.8283172103770493, + "learning_rate": 2.9391285749003046e-06, + "loss": 0.5318, + "step": 1076 + }, + { + "epoch": 0.89, + "grad_norm": 1.7802483696828226, + "learning_rate": 2.935909269644946e-06, + "loss": 0.4954, + "step": 1077 + }, + { + "epoch": 0.89, + "grad_norm": 1.8687809173149, + "learning_rate": 2.9326892187142457e-06, + "loss": 0.5428, + "step": 1078 + }, + { + "epoch": 0.89, + "grad_norm": 1.9218917868616974, + "learning_rate": 2.9294684276164888e-06, + "loss": 0.5125, + "step": 1079 + }, + { + "epoch": 0.9, + "grad_norm": 1.8406300824318225, + "learning_rate": 2.9262469018612278e-06, + "loss": 0.5186, + "step": 1080 + }, + { + "epoch": 0.9, + "grad_norm": 1.8153319034513924, + "learning_rate": 2.9230246469592695e-06, + "loss": 0.4878, + "step": 1081 + }, + { + "epoch": 0.9, + "grad_norm": 1.8381190525343576, + "learning_rate": 2.91980166842267e-06, + "loss": 0.5455, + "step": 1082 + }, + { + "epoch": 0.9, + "grad_norm": 1.7941629060330144, + "learning_rate": 2.9165779717647212e-06, + "loss": 0.5425, + "step": 1083 + }, + { + "epoch": 0.9, + "grad_norm": 1.755950985861856, + "learning_rate": 2.9133535624999466e-06, + "loss": 0.4992, + "step": 1084 + }, + { + "epoch": 0.9, + "grad_norm": 1.8065716401418646, + "learning_rate": 2.9101284461440853e-06, + "loss": 0.5569, + "step": 1085 + }, + { + "epoch": 0.9, + "grad_norm": 1.8487073865649808, + "learning_rate": 2.9069026282140887e-06, + "loss": 0.5352, + "step": 1086 + }, + { + "epoch": 0.9, + "grad_norm": 1.877024524581134, + "learning_rate": 2.903676114228107e-06, + "loss": 0.5584, + "step": 1087 + }, + { + "epoch": 0.9, + "grad_norm": 1.812931375367902, + "learning_rate": 2.9004489097054807e-06, + "loss": 0.5154, + "step": 1088 + }, + { + "epoch": 0.9, + "grad_norm": 1.7729938020658174, + "learning_rate": 2.897221020166732e-06, + "loss": 0.5386, + "step": 1089 + }, + { + "epoch": 0.9, + "grad_norm": 1.6991898958250629, + "learning_rate": 2.8939924511335555e-06, + "loss": 0.5467, + "step": 1090 + }, + { + "epoch": 0.9, + "grad_norm": 1.7298323860671052, + "learning_rate": 2.890763208128807e-06, + "loss": 0.5506, + "step": 1091 + }, + { + "epoch": 0.91, + "grad_norm": 1.9718362378496106, + "learning_rate": 2.887533296676497e-06, + "loss": 0.5453, + "step": 1092 + }, + { + "epoch": 0.91, + "grad_norm": 1.7003897379752575, + "learning_rate": 2.8843027223017767e-06, + "loss": 0.5016, + "step": 1093 + }, + { + "epoch": 0.91, + "grad_norm": 1.7604846690613096, + "learning_rate": 2.8810714905309346e-06, + "loss": 0.5206, + "step": 1094 + }, + { + "epoch": 0.91, + "grad_norm": 1.868522047775135, + "learning_rate": 2.8778396068913807e-06, + "loss": 0.5152, + "step": 1095 + }, + { + "epoch": 0.91, + "grad_norm": 1.8080911269766844, + "learning_rate": 2.874607076911642e-06, + "loss": 0.4966, + "step": 1096 + }, + { + "epoch": 0.91, + "grad_norm": 1.7767037245003534, + "learning_rate": 2.871373906121351e-06, + "loss": 0.5081, + "step": 1097 + }, + { + "epoch": 0.91, + "grad_norm": 1.733045586658075, + "learning_rate": 2.8681401000512356e-06, + "loss": 0.5031, + "step": 1098 + }, + { + "epoch": 0.91, + "grad_norm": 1.6767478479637847, + "learning_rate": 2.8649056642331103e-06, + "loss": 0.4856, + "step": 1099 + }, + { + "epoch": 0.91, + "grad_norm": 1.6820690185704608, + "learning_rate": 2.8616706041998686e-06, + "loss": 0.5151, + "step": 1100 + }, + { + "epoch": 0.91, + "grad_norm": 1.840181264549285, + "learning_rate": 2.8584349254854693e-06, + "loss": 0.5393, + "step": 1101 + }, + { + "epoch": 0.91, + "grad_norm": 1.827807570004724, + "learning_rate": 2.8551986336249322e-06, + "loss": 0.5572, + "step": 1102 + }, + { + "epoch": 0.91, + "grad_norm": 1.711815265099016, + "learning_rate": 2.8519617341543233e-06, + "loss": 0.5184, + "step": 1103 + }, + { + "epoch": 0.92, + "grad_norm": 1.7460018389221874, + "learning_rate": 2.8487242326107495e-06, + "loss": 0.5374, + "step": 1104 + }, + { + "epoch": 0.92, + "grad_norm": 1.985067366728648, + "learning_rate": 2.8454861345323475e-06, + "loss": 0.538, + "step": 1105 + }, + { + "epoch": 0.92, + "grad_norm": 1.8044567576569952, + "learning_rate": 2.8422474454582754e-06, + "loss": 0.4947, + "step": 1106 + }, + { + "epoch": 0.92, + "grad_norm": 1.7648712890692506, + "learning_rate": 2.8390081709286997e-06, + "loss": 0.5584, + "step": 1107 + }, + { + "epoch": 0.92, + "grad_norm": 1.7544905722043518, + "learning_rate": 2.8357683164847903e-06, + "loss": 0.5696, + "step": 1108 + }, + { + "epoch": 0.92, + "grad_norm": 1.7923136846837993, + "learning_rate": 2.8325278876687084e-06, + "loss": 0.5502, + "step": 1109 + }, + { + "epoch": 0.92, + "grad_norm": 2.077195937792951, + "learning_rate": 2.8292868900235986e-06, + "loss": 0.543, + "step": 1110 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675854046933754, + "learning_rate": 2.826045329093578e-06, + "loss": 0.5422, + "step": 1111 + }, + { + "epoch": 0.92, + "grad_norm": 1.8457239401392898, + "learning_rate": 2.822803210423727e-06, + "loss": 0.5334, + "step": 1112 + }, + { + "epoch": 0.92, + "grad_norm": 1.7426929121470698, + "learning_rate": 2.8195605395600804e-06, + "loss": 0.4972, + "step": 1113 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675216264197045, + "learning_rate": 2.8163173220496175e-06, + "loss": 0.5442, + "step": 1114 + }, + { + "epoch": 0.92, + "grad_norm": 1.7483102565661375, + "learning_rate": 2.8130735634402527e-06, + "loss": 0.5425, + "step": 1115 + }, + { + "epoch": 0.93, + "grad_norm": 1.692036399159914, + "learning_rate": 2.8098292692808253e-06, + "loss": 0.521, + "step": 1116 + }, + { + "epoch": 0.93, + "grad_norm": 1.799980213437577, + "learning_rate": 2.8065844451210933e-06, + "loss": 0.5597, + "step": 1117 + }, + { + "epoch": 0.93, + "grad_norm": 1.7666190830884467, + "learning_rate": 2.803339096511718e-06, + "loss": 0.5612, + "step": 1118 + }, + { + "epoch": 0.93, + "grad_norm": 1.792129515845057, + "learning_rate": 2.8000932290042597e-06, + "loss": 0.5334, + "step": 1119 + }, + { + "epoch": 0.93, + "grad_norm": 1.7395715578516604, + "learning_rate": 2.7968468481511663e-06, + "loss": 0.5545, + "step": 1120 + }, + { + "epoch": 0.93, + "grad_norm": 1.6843830287676704, + "learning_rate": 2.7935999595057623e-06, + "loss": 0.5659, + "step": 1121 + }, + { + "epoch": 0.93, + "grad_norm": 1.6432688824199502, + "learning_rate": 2.790352568622244e-06, + "loss": 0.4926, + "step": 1122 + }, + { + "epoch": 0.93, + "grad_norm": 1.7430642435954644, + "learning_rate": 2.787104681055663e-06, + "loss": 0.4666, + "step": 1123 + }, + { + "epoch": 0.93, + "grad_norm": 1.8067789882264202, + "learning_rate": 2.783856302361923e-06, + "loss": 0.5233, + "step": 1124 + }, + { + "epoch": 0.93, + "grad_norm": 1.7685143281757654, + "learning_rate": 2.780607438097769e-06, + "loss": 0.5506, + "step": 1125 + }, + { + "epoch": 0.93, + "grad_norm": 1.7163110868931304, + "learning_rate": 2.7773580938207717e-06, + "loss": 0.5044, + "step": 1126 + }, + { + "epoch": 0.93, + "grad_norm": 1.809036270322799, + "learning_rate": 2.7741082750893284e-06, + "loss": 0.5206, + "step": 1127 + }, + { + "epoch": 0.94, + "grad_norm": 1.8193898978325846, + "learning_rate": 2.770857987462645e-06, + "loss": 0.6064, + "step": 1128 + }, + { + "epoch": 0.94, + "grad_norm": 1.765826426309075, + "learning_rate": 2.76760723650073e-06, + "loss": 0.4914, + "step": 1129 + }, + { + "epoch": 0.94, + "grad_norm": 2.046345230237298, + "learning_rate": 2.764356027764385e-06, + "loss": 0.5938, + "step": 1130 + }, + { + "epoch": 0.94, + "grad_norm": 1.8264697696225647, + "learning_rate": 2.7611043668151948e-06, + "loss": 0.5476, + "step": 1131 + }, + { + "epoch": 0.94, + "grad_norm": 1.7776043318415495, + "learning_rate": 2.7578522592155166e-06, + "loss": 0.5318, + "step": 1132 + }, + { + "epoch": 0.94, + "grad_norm": 1.767284538432005, + "learning_rate": 2.7545997105284735e-06, + "loss": 0.5197, + "step": 1133 + }, + { + "epoch": 0.94, + "grad_norm": 1.831190014066027, + "learning_rate": 2.75134672631794e-06, + "loss": 0.4939, + "step": 1134 + }, + { + "epoch": 0.94, + "grad_norm": 1.7727769641989948, + "learning_rate": 2.7480933121485394e-06, + "loss": 0.5542, + "step": 1135 + }, + { + "epoch": 0.94, + "grad_norm": 1.7599576706599651, + "learning_rate": 2.7448394735856275e-06, + "loss": 0.5102, + "step": 1136 + }, + { + "epoch": 0.94, + "grad_norm": 1.7526987759875383, + "learning_rate": 2.7415852161952893e-06, + "loss": 0.5357, + "step": 1137 + }, + { + "epoch": 0.94, + "grad_norm": 1.7478180377944075, + "learning_rate": 2.7383305455443223e-06, + "loss": 0.552, + "step": 1138 + }, + { + "epoch": 0.94, + "grad_norm": 1.8026983878339322, + "learning_rate": 2.7350754672002334e-06, + "loss": 0.5324, + "step": 1139 + }, + { + "epoch": 0.95, + "grad_norm": 1.7539604119960455, + "learning_rate": 2.7318199867312267e-06, + "loss": 0.4951, + "step": 1140 + }, + { + "epoch": 0.95, + "grad_norm": 1.7060714376533908, + "learning_rate": 2.728564109706193e-06, + "loss": 0.5044, + "step": 1141 + }, + { + "epoch": 0.95, + "grad_norm": 1.896732668736906, + "learning_rate": 2.725307841694704e-06, + "loss": 0.5272, + "step": 1142 + }, + { + "epoch": 0.95, + "grad_norm": 1.9094037542829962, + "learning_rate": 2.722051188266998e-06, + "loss": 0.5036, + "step": 1143 + }, + { + "epoch": 0.95, + "grad_norm": 1.7529900591353695, + "learning_rate": 2.7187941549939723e-06, + "loss": 0.4962, + "step": 1144 + }, + { + "epoch": 0.95, + "grad_norm": 1.7652784724721573, + "learning_rate": 2.7155367474471763e-06, + "loss": 0.5159, + "step": 1145 + }, + { + "epoch": 0.95, + "grad_norm": 1.9070275680276054, + "learning_rate": 2.7122789711987964e-06, + "loss": 0.5269, + "step": 1146 + }, + { + "epoch": 0.95, + "grad_norm": 1.7630505518040367, + "learning_rate": 2.709020831821652e-06, + "loss": 0.5286, + "step": 1147 + }, + { + "epoch": 0.95, + "grad_norm": 1.7410138974922291, + "learning_rate": 2.7057623348891846e-06, + "loss": 0.4902, + "step": 1148 + }, + { + "epoch": 0.95, + "grad_norm": 1.745842560539345, + "learning_rate": 2.7025034859754446e-06, + "loss": 0.5178, + "step": 1149 + }, + { + "epoch": 0.95, + "grad_norm": 1.8498982578771728, + "learning_rate": 2.699244290655086e-06, + "loss": 0.55, + "step": 1150 + }, + { + "epoch": 0.95, + "grad_norm": 1.6360369924184164, + "learning_rate": 2.6959847545033558e-06, + "loss": 0.4988, + "step": 1151 + }, + { + "epoch": 0.96, + "grad_norm": 1.6784833460211517, + "learning_rate": 2.692724883096082e-06, + "loss": 0.5303, + "step": 1152 + }, + { + "epoch": 0.96, + "grad_norm": 1.7888637226825195, + "learning_rate": 2.68946468200967e-06, + "loss": 0.542, + "step": 1153 + }, + { + "epoch": 0.96, + "grad_norm": 1.7156031503954616, + "learning_rate": 2.686204156821084e-06, + "loss": 0.499, + "step": 1154 + }, + { + "epoch": 0.96, + "grad_norm": 1.802618839032982, + "learning_rate": 2.6829433131078464e-06, + "loss": 0.5095, + "step": 1155 + }, + { + "epoch": 0.96, + "grad_norm": 1.7018673816457677, + "learning_rate": 2.6796821564480237e-06, + "loss": 0.4911, + "step": 1156 + }, + { + "epoch": 0.96, + "grad_norm": 1.939833859373507, + "learning_rate": 2.6764206924202173e-06, + "loss": 0.5965, + "step": 1157 + }, + { + "epoch": 0.96, + "grad_norm": 1.757462214596805, + "learning_rate": 2.673158926603554e-06, + "loss": 0.5119, + "step": 1158 + }, + { + "epoch": 0.96, + "grad_norm": 1.824906787992325, + "learning_rate": 2.669896864577678e-06, + "loss": 0.4995, + "step": 1159 + }, + { + "epoch": 0.96, + "grad_norm": 1.6963319988581682, + "learning_rate": 2.666634511922739e-06, + "loss": 0.499, + "step": 1160 + }, + { + "epoch": 0.96, + "grad_norm": 1.7490967555131538, + "learning_rate": 2.6633718742193837e-06, + "loss": 0.5045, + "step": 1161 + }, + { + "epoch": 0.96, + "grad_norm": 1.7295387040616608, + "learning_rate": 2.660108957048749e-06, + "loss": 0.48, + "step": 1162 + }, + { + "epoch": 0.96, + "grad_norm": 1.7062936128447537, + "learning_rate": 2.656845765992447e-06, + "loss": 0.5024, + "step": 1163 + }, + { + "epoch": 0.96, + "grad_norm": 1.7291223687738257, + "learning_rate": 2.6535823066325594e-06, + "loss": 0.4965, + "step": 1164 + }, + { + "epoch": 0.97, + "grad_norm": 1.7660018876230184, + "learning_rate": 2.650318584551626e-06, + "loss": 0.6289, + "step": 1165 + }, + { + "epoch": 0.97, + "grad_norm": 1.6875948695046943, + "learning_rate": 2.6470546053326375e-06, + "loss": 0.5099, + "step": 1166 + }, + { + "epoch": 0.97, + "grad_norm": 1.7055862895950586, + "learning_rate": 2.643790374559023e-06, + "loss": 0.4748, + "step": 1167 + }, + { + "epoch": 0.97, + "grad_norm": 1.8397810404769834, + "learning_rate": 2.6405258978146443e-06, + "loss": 0.5547, + "step": 1168 + }, + { + "epoch": 0.97, + "grad_norm": 1.6780759297615608, + "learning_rate": 2.6372611806837804e-06, + "loss": 0.4696, + "step": 1169 + }, + { + "epoch": 0.97, + "grad_norm": 1.7463193906158438, + "learning_rate": 2.633996228751125e-06, + "loss": 0.5167, + "step": 1170 + }, + { + "epoch": 0.97, + "grad_norm": 1.7682737157303552, + "learning_rate": 2.6307310476017705e-06, + "loss": 0.5178, + "step": 1171 + }, + { + "epoch": 0.97, + "grad_norm": 1.7759532350573655, + "learning_rate": 2.627465642821203e-06, + "loss": 0.5411, + "step": 1172 + }, + { + "epoch": 0.97, + "grad_norm": 1.741742707150691, + "learning_rate": 2.624200019995293e-06, + "loss": 0.5357, + "step": 1173 + }, + { + "epoch": 0.97, + "grad_norm": 1.7638181255611864, + "learning_rate": 2.6209341847102787e-06, + "loss": 0.5598, + "step": 1174 + }, + { + "epoch": 0.97, + "grad_norm": 1.6585763596592404, + "learning_rate": 2.6176681425527663e-06, + "loss": 0.4891, + "step": 1175 + }, + { + "epoch": 0.97, + "grad_norm": 1.7652514703885578, + "learning_rate": 2.614401899109716e-06, + "loss": 0.5412, + "step": 1176 + }, + { + "epoch": 0.98, + "grad_norm": 1.7646286601286296, + "learning_rate": 2.6111354599684287e-06, + "loss": 0.4753, + "step": 1177 + }, + { + "epoch": 0.98, + "grad_norm": 1.7933546923906454, + "learning_rate": 2.6078688307165436e-06, + "loss": 0.5159, + "step": 1178 + }, + { + "epoch": 0.98, + "grad_norm": 1.8474498352431208, + "learning_rate": 2.6046020169420223e-06, + "loss": 0.4786, + "step": 1179 + }, + { + "epoch": 0.98, + "grad_norm": 1.816609500392057, + "learning_rate": 2.601335024233145e-06, + "loss": 0.5821, + "step": 1180 + }, + { + "epoch": 0.98, + "grad_norm": 1.7603922858788037, + "learning_rate": 2.598067858178495e-06, + "loss": 0.4749, + "step": 1181 + }, + { + "epoch": 0.98, + "grad_norm": 1.771168764538133, + "learning_rate": 2.594800524366956e-06, + "loss": 0.5221, + "step": 1182 + }, + { + "epoch": 0.98, + "grad_norm": 1.7428386931770696, + "learning_rate": 2.591533028387694e-06, + "loss": 0.5243, + "step": 1183 + }, + { + "epoch": 0.98, + "grad_norm": 1.7354647623517858, + "learning_rate": 2.588265375830155e-06, + "loss": 0.4665, + "step": 1184 + }, + { + "epoch": 0.98, + "grad_norm": 1.7757829783254058, + "learning_rate": 2.5849975722840537e-06, + "loss": 0.4713, + "step": 1185 + }, + { + "epoch": 0.98, + "grad_norm": 1.7660698291034924, + "learning_rate": 2.58172962333936e-06, + "loss": 0.5198, + "step": 1186 + }, + { + "epoch": 0.98, + "grad_norm": 1.7071465020770178, + "learning_rate": 2.5784615345862963e-06, + "loss": 0.5355, + "step": 1187 + }, + { + "epoch": 0.98, + "grad_norm": 1.6994920599655763, + "learning_rate": 2.5751933116153215e-06, + "loss": 0.4867, + "step": 1188 + }, + { + "epoch": 0.99, + "grad_norm": 1.7891977115774562, + "learning_rate": 2.5719249600171247e-06, + "loss": 0.5071, + "step": 1189 + }, + { + "epoch": 0.99, + "grad_norm": 1.6866451169084888, + "learning_rate": 2.568656485382616e-06, + "loss": 0.4767, + "step": 1190 + }, + { + "epoch": 0.99, + "grad_norm": 1.9106444693405875, + "learning_rate": 2.5653878933029134e-06, + "loss": 0.5063, + "step": 1191 + }, + { + "epoch": 0.99, + "grad_norm": 1.7546015951107552, + "learning_rate": 2.56211918936934e-06, + "loss": 0.5536, + "step": 1192 + }, + { + "epoch": 0.99, + "grad_norm": 1.7866083346923656, + "learning_rate": 2.5588503791734053e-06, + "loss": 0.4738, + "step": 1193 + }, + { + "epoch": 0.99, + "grad_norm": 1.6678313975517949, + "learning_rate": 2.5555814683068058e-06, + "loss": 0.5095, + "step": 1194 + }, + { + "epoch": 0.99, + "grad_norm": 1.694690087625629, + "learning_rate": 2.552312462361405e-06, + "loss": 0.5711, + "step": 1195 + }, + { + "epoch": 0.99, + "grad_norm": 1.7583066556547233, + "learning_rate": 2.5490433669292337e-06, + "loss": 0.5183, + "step": 1196 + }, + { + "epoch": 0.99, + "grad_norm": 1.8259327544569408, + "learning_rate": 2.5457741876024716e-06, + "loss": 0.5129, + "step": 1197 + }, + { + "epoch": 0.99, + "grad_norm": 1.743709458286742, + "learning_rate": 2.542504929973445e-06, + "loss": 0.509, + "step": 1198 + }, + { + "epoch": 0.99, + "grad_norm": 1.8551037168096902, + "learning_rate": 2.5392355996346134e-06, + "loss": 0.4874, + "step": 1199 + }, + { + "epoch": 0.99, + "grad_norm": 1.7705896553689628, + "learning_rate": 2.5359662021785596e-06, + "loss": 0.5102, + "step": 1200 + }, + { + "epoch": 1.0, + "grad_norm": 1.8456154073029885, + "learning_rate": 2.532696743197982e-06, + "loss": 0.5363, + "step": 1201 + }, + { + "epoch": 1.0, + "grad_norm": 1.7341454202963031, + "learning_rate": 2.529427228285686e-06, + "loss": 0.5013, + "step": 1202 + }, + { + "epoch": 1.0, + "grad_norm": 1.7923147732329405, + "learning_rate": 2.526157663034568e-06, + "loss": 0.5191, + "step": 1203 + }, + { + "epoch": 1.0, + "grad_norm": 1.731262319220837, + "learning_rate": 2.522888053037616e-06, + "loss": 0.4889, + "step": 1204 + }, + { + "epoch": 1.0, + "grad_norm": 1.797800368847369, + "learning_rate": 2.5196184038878895e-06, + "loss": 0.4868, + "step": 1205 + }, + { + "epoch": 1.0, + "grad_norm": 1.8182272292135089, + "learning_rate": 2.5163487211785194e-06, + "loss": 0.5159, + "step": 1206 + }, + { + "epoch": 1.0, + "grad_norm": 1.9699143840893472, + "learning_rate": 2.5130790105026908e-06, + "loss": 0.543, + "step": 1207 + }, + { + "epoch": 1.0, + "grad_norm": 1.805587879000798, + "learning_rate": 2.5098092774536397e-06, + "loss": 0.5162, + "step": 1208 + }, + { + "epoch": 1.0, + "grad_norm": 1.966538834153111, + "learning_rate": 2.506539527624637e-06, + "loss": 0.4973, + "step": 1209 + }, + { + "epoch": 1.0, + "grad_norm": 1.7007116827865891, + "learning_rate": 2.5032697666089833e-06, + "loss": 0.5337, + "step": 1210 + }, + { + "epoch": 1.0, + "grad_norm": 1.8200190388383481, + "learning_rate": 2.5e-06, + "loss": 0.492, + "step": 1211 + }, + { + "epoch": 1.0, + "grad_norm": 1.7811733389101785, + "learning_rate": 2.496730233391017e-06, + "loss": 0.533, + "step": 1212 + }, + { + "epoch": 1.01, + "grad_norm": 1.7692852455085013, + "learning_rate": 2.4934604723753636e-06, + "loss": 0.5151, + "step": 1213 + }, + { + "epoch": 1.01, + "grad_norm": 2.0118407638136726, + "learning_rate": 2.4901907225463607e-06, + "loss": 0.566, + "step": 1214 + }, + { + "epoch": 1.01, + "grad_norm": 1.9919699597672162, + "learning_rate": 2.486920989497309e-06, + "loss": 0.5296, + "step": 1215 + }, + { + "epoch": 1.01, + "grad_norm": 1.7399123797451834, + "learning_rate": 2.483651278821481e-06, + "loss": 0.5535, + "step": 1216 + }, + { + "epoch": 1.01, + "grad_norm": 2.0162050634113617, + "learning_rate": 2.4803815961121117e-06, + "loss": 0.5105, + "step": 1217 + }, + { + "epoch": 1.01, + "grad_norm": 1.9472302767468135, + "learning_rate": 2.4771119469623856e-06, + "loss": 0.4829, + "step": 1218 + }, + { + "epoch": 1.01, + "grad_norm": 1.9358326178363474, + "learning_rate": 2.4738423369654327e-06, + "loss": 0.5895, + "step": 1219 + }, + { + "epoch": 1.01, + "grad_norm": 1.8202396491898063, + "learning_rate": 2.470572771714315e-06, + "loss": 0.5159, + "step": 1220 + }, + { + "epoch": 1.01, + "grad_norm": 2.0705540084815652, + "learning_rate": 2.4673032568020183e-06, + "loss": 0.5375, + "step": 1221 + }, + { + "epoch": 1.01, + "grad_norm": 1.9290016818033147, + "learning_rate": 2.464033797821441e-06, + "loss": 0.5328, + "step": 1222 + }, + { + "epoch": 1.01, + "grad_norm": 1.858876842427081, + "learning_rate": 2.460764400365387e-06, + "loss": 0.5246, + "step": 1223 + }, + { + "epoch": 1.01, + "grad_norm": 1.7372257522644121, + "learning_rate": 2.457495070026555e-06, + "loss": 0.5557, + "step": 1224 + }, + { + "epoch": 1.02, + "grad_norm": 2.042578607858068, + "learning_rate": 2.454225812397529e-06, + "loss": 0.5493, + "step": 1225 + }, + { + "epoch": 1.02, + "grad_norm": 1.80578953353184, + "learning_rate": 2.450956633070767e-06, + "loss": 0.4722, + "step": 1226 + }, + { + "epoch": 1.02, + "grad_norm": 1.6245117501883604, + "learning_rate": 2.4476875376385954e-06, + "loss": 0.4861, + "step": 1227 + }, + { + "epoch": 1.0, + "grad_norm": 2.3717275673814986, + "learning_rate": 2.4444185316931955e-06, + "loss": 0.4955, + "step": 1228 + }, + { + "epoch": 1.0, + "grad_norm": 2.789230426976571, + "learning_rate": 2.441149620826595e-06, + "loss": 0.401, + "step": 1229 + }, + { + "epoch": 1.0, + "grad_norm": 2.3165196574538163, + "learning_rate": 2.437880810630661e-06, + "loss": 0.391, + "step": 1230 + }, + { + "epoch": 1.0, + "grad_norm": 3.7748119497874244, + "learning_rate": 2.434612106697087e-06, + "loss": 0.3971, + "step": 1231 + }, + { + "epoch": 1.0, + "grad_norm": 2.516708769328096, + "learning_rate": 2.4313435146173845e-06, + "loss": 0.3677, + "step": 1232 + }, + { + "epoch": 1.0, + "grad_norm": 2.0383812730416593, + "learning_rate": 2.4280750399828757e-06, + "loss": 0.3834, + "step": 1233 + }, + { + "epoch": 1.01, + "grad_norm": 2.388274870254754, + "learning_rate": 2.424806688384679e-06, + "loss": 0.38, + "step": 1234 + }, + { + "epoch": 1.01, + "grad_norm": 2.428758767469847, + "learning_rate": 2.4215384654137037e-06, + "loss": 0.3557, + "step": 1235 + }, + { + "epoch": 1.01, + "grad_norm": 1.9871015940327752, + "learning_rate": 2.41827037666064e-06, + "loss": 0.3742, + "step": 1236 + }, + { + "epoch": 1.01, + "grad_norm": 2.0490853630896595, + "learning_rate": 2.415002427715948e-06, + "loss": 0.4077, + "step": 1237 + }, + { + "epoch": 1.01, + "grad_norm": 2.36022057857035, + "learning_rate": 2.4117346241698457e-06, + "loss": 0.4079, + "step": 1238 + }, + { + "epoch": 1.01, + "grad_norm": 2.4014397498962974, + "learning_rate": 2.408466971612307e-06, + "loss": 0.3783, + "step": 1239 + }, + { + "epoch": 1.01, + "grad_norm": 2.1970209263326246, + "learning_rate": 2.405199475633045e-06, + "loss": 0.4019, + "step": 1240 + }, + { + "epoch": 1.01, + "grad_norm": 1.8747804397851657, + "learning_rate": 2.4019321418215053e-06, + "loss": 0.3657, + "step": 1241 + }, + { + "epoch": 1.01, + "grad_norm": 2.0377029592503666, + "learning_rate": 2.398664975766856e-06, + "loss": 0.3575, + "step": 1242 + }, + { + "epoch": 1.01, + "grad_norm": 2.2162687478729133, + "learning_rate": 2.3953979830579785e-06, + "loss": 0.3891, + "step": 1243 + }, + { + "epoch": 1.01, + "grad_norm": 2.0736112974636605, + "learning_rate": 2.3921311692834577e-06, + "loss": 0.3872, + "step": 1244 + }, + { + "epoch": 1.01, + "grad_norm": 1.8065329023464558, + "learning_rate": 2.3888645400315717e-06, + "loss": 0.3684, + "step": 1245 + }, + { + "epoch": 1.02, + "grad_norm": 2.144863722944226, + "learning_rate": 2.385598100890285e-06, + "loss": 0.3781, + "step": 1246 + }, + { + "epoch": 1.02, + "grad_norm": 2.245173550848138, + "learning_rate": 2.382331857447234e-06, + "loss": 0.3906, + "step": 1247 + }, + { + "epoch": 1.02, + "grad_norm": 2.0580037557233806, + "learning_rate": 2.379065815289723e-06, + "loss": 0.3461, + "step": 1248 + }, + { + "epoch": 1.02, + "grad_norm": 1.754328637936701, + "learning_rate": 2.3757999800047088e-06, + "loss": 0.3626, + "step": 1249 + }, + { + "epoch": 1.02, + "grad_norm": 1.8749369460952616, + "learning_rate": 2.3725343571787974e-06, + "loss": 0.3723, + "step": 1250 + }, + { + "epoch": 1.02, + "grad_norm": 1.9635590762348785, + "learning_rate": 2.36926895239823e-06, + "loss": 0.3506, + "step": 1251 + }, + { + "epoch": 1.02, + "grad_norm": 1.9091295881177242, + "learning_rate": 2.3660037712488758e-06, + "loss": 0.3705, + "step": 1252 + }, + { + "epoch": 1.02, + "grad_norm": 2.0807822077632445, + "learning_rate": 2.36273881931622e-06, + "loss": 0.4083, + "step": 1253 + }, + { + "epoch": 1.02, + "grad_norm": 1.9247801946548893, + "learning_rate": 2.3594741021853565e-06, + "loss": 0.3896, + "step": 1254 + }, + { + "epoch": 1.02, + "grad_norm": 2.003234826375957, + "learning_rate": 2.356209625440977e-06, + "loss": 0.3928, + "step": 1255 + }, + { + "epoch": 1.02, + "grad_norm": 1.9601094488156638, + "learning_rate": 2.352945394667363e-06, + "loss": 0.346, + "step": 1256 + }, + { + "epoch": 1.02, + "grad_norm": 1.835912356231795, + "learning_rate": 2.3496814154483754e-06, + "loss": 0.3268, + "step": 1257 + }, + { + "epoch": 1.03, + "grad_norm": 1.851616138864044, + "learning_rate": 2.346417693367442e-06, + "loss": 0.395, + "step": 1258 + }, + { + "epoch": 1.03, + "grad_norm": 2.017511453982363, + "learning_rate": 2.3431542340075535e-06, + "loss": 0.3989, + "step": 1259 + }, + { + "epoch": 1.03, + "grad_norm": 1.9337327085061278, + "learning_rate": 2.3398910429512516e-06, + "loss": 0.4168, + "step": 1260 + }, + { + "epoch": 1.03, + "grad_norm": 1.8957440589808827, + "learning_rate": 2.3366281257806167e-06, + "loss": 0.3626, + "step": 1261 + }, + { + "epoch": 1.03, + "grad_norm": 1.819897111464585, + "learning_rate": 2.3333654880772622e-06, + "loss": 0.3737, + "step": 1262 + }, + { + "epoch": 1.03, + "grad_norm": 1.9283607336926767, + "learning_rate": 2.3301031354223226e-06, + "loss": 0.3595, + "step": 1263 + }, + { + "epoch": 1.03, + "grad_norm": 1.8049670593502345, + "learning_rate": 2.3268410733964463e-06, + "loss": 0.3645, + "step": 1264 + }, + { + "epoch": 1.03, + "grad_norm": 1.866103990559354, + "learning_rate": 2.3235793075797835e-06, + "loss": 0.391, + "step": 1265 + }, + { + "epoch": 1.03, + "grad_norm": 1.774992664072412, + "learning_rate": 2.3203178435519767e-06, + "loss": 0.3863, + "step": 1266 + }, + { + "epoch": 1.03, + "grad_norm": 1.8431093658964484, + "learning_rate": 2.3170566868921553e-06, + "loss": 0.4175, + "step": 1267 + }, + { + "epoch": 1.03, + "grad_norm": 1.7731154009482526, + "learning_rate": 2.3137958431789175e-06, + "loss": 0.3651, + "step": 1268 + }, + { + "epoch": 1.03, + "grad_norm": 1.980392583405916, + "learning_rate": 2.3105353179903313e-06, + "loss": 0.3919, + "step": 1269 + }, + { + "epoch": 1.04, + "grad_norm": 1.8435910751312221, + "learning_rate": 2.3072751169039183e-06, + "loss": 0.3466, + "step": 1270 + }, + { + "epoch": 1.04, + "grad_norm": 1.88150621693115, + "learning_rate": 2.304015245496645e-06, + "loss": 0.3991, + "step": 1271 + }, + { + "epoch": 1.04, + "grad_norm": 1.9365960105712363, + "learning_rate": 2.300755709344915e-06, + "loss": 0.3675, + "step": 1272 + }, + { + "epoch": 1.04, + "grad_norm": 1.8120924423380202, + "learning_rate": 2.297496514024556e-06, + "loss": 0.389, + "step": 1273 + }, + { + "epoch": 1.04, + "grad_norm": 1.822066570446833, + "learning_rate": 2.2942376651108158e-06, + "loss": 0.3355, + "step": 1274 + }, + { + "epoch": 1.04, + "grad_norm": 1.968043494993567, + "learning_rate": 2.290979168178348e-06, + "loss": 0.3909, + "step": 1275 + }, + { + "epoch": 1.04, + "grad_norm": 1.8571689944285859, + "learning_rate": 2.287721028801204e-06, + "loss": 0.376, + "step": 1276 + }, + { + "epoch": 1.04, + "grad_norm": 2.003415605331929, + "learning_rate": 2.2844632525528245e-06, + "loss": 0.3439, + "step": 1277 + }, + { + "epoch": 1.04, + "grad_norm": 2.248040597881556, + "learning_rate": 2.2812058450060285e-06, + "loss": 0.3789, + "step": 1278 + }, + { + "epoch": 1.04, + "grad_norm": 1.8018969815730068, + "learning_rate": 2.2779488117330032e-06, + "loss": 0.3756, + "step": 1279 + }, + { + "epoch": 1.04, + "grad_norm": 1.90374397055853, + "learning_rate": 2.2746921583052967e-06, + "loss": 0.4126, + "step": 1280 + }, + { + "epoch": 1.04, + "grad_norm": 1.8558365521624263, + "learning_rate": 2.2714358902938073e-06, + "loss": 0.3959, + "step": 1281 + }, + { + "epoch": 1.05, + "grad_norm": 1.8375175796231433, + "learning_rate": 2.268180013268774e-06, + "loss": 0.4048, + "step": 1282 + }, + { + "epoch": 1.05, + "grad_norm": 1.984205865069469, + "learning_rate": 2.2649245327997674e-06, + "loss": 0.4039, + "step": 1283 + }, + { + "epoch": 1.05, + "grad_norm": 1.8933532928718015, + "learning_rate": 2.261669454455679e-06, + "loss": 0.3781, + "step": 1284 + }, + { + "epoch": 1.05, + "grad_norm": 1.9740915743952114, + "learning_rate": 2.2584147838047116e-06, + "loss": 0.4003, + "step": 1285 + }, + { + "epoch": 1.05, + "grad_norm": 1.8808844925592019, + "learning_rate": 2.2551605264143725e-06, + "loss": 0.3449, + "step": 1286 + }, + { + "epoch": 1.05, + "grad_norm": 1.9307797122579196, + "learning_rate": 2.251906687851461e-06, + "loss": 0.4182, + "step": 1287 + }, + { + "epoch": 1.05, + "grad_norm": 1.8492505145939904, + "learning_rate": 2.2486532736820614e-06, + "loss": 0.3736, + "step": 1288 + }, + { + "epoch": 1.05, + "grad_norm": 1.8826597143825838, + "learning_rate": 2.245400289471528e-06, + "loss": 0.3987, + "step": 1289 + }, + { + "epoch": 1.05, + "grad_norm": 1.8696499317715565, + "learning_rate": 2.242147740784484e-06, + "loss": 0.3725, + "step": 1290 + }, + { + "epoch": 1.05, + "grad_norm": 2.0572316139676463, + "learning_rate": 2.2388956331848057e-06, + "loss": 0.3777, + "step": 1291 + }, + { + "epoch": 1.05, + "grad_norm": 1.9916048666817696, + "learning_rate": 2.2356439722356154e-06, + "loss": 0.3435, + "step": 1292 + }, + { + "epoch": 1.05, + "grad_norm": 1.7903849297787813, + "learning_rate": 2.2323927634992706e-06, + "loss": 0.3691, + "step": 1293 + }, + { + "epoch": 1.06, + "grad_norm": 1.8840722711485807, + "learning_rate": 2.2291420125373555e-06, + "loss": 0.3619, + "step": 1294 + }, + { + "epoch": 1.06, + "grad_norm": 1.853222255447046, + "learning_rate": 2.225891724910672e-06, + "loss": 0.3406, + "step": 1295 + }, + { + "epoch": 1.06, + "grad_norm": 1.8075515802139996, + "learning_rate": 2.2226419061792282e-06, + "loss": 0.3775, + "step": 1296 + }, + { + "epoch": 1.06, + "grad_norm": 1.8220733253527324, + "learning_rate": 2.2193925619022323e-06, + "loss": 0.3652, + "step": 1297 + }, + { + "epoch": 1.06, + "grad_norm": 1.9758397782161456, + "learning_rate": 2.2161436976380774e-06, + "loss": 0.3825, + "step": 1298 + }, + { + "epoch": 1.06, + "grad_norm": 2.0469053125573202, + "learning_rate": 2.212895318944338e-06, + "loss": 0.4162, + "step": 1299 + }, + { + "epoch": 1.06, + "grad_norm": 1.8037669439194224, + "learning_rate": 2.2096474313777574e-06, + "loss": 0.3584, + "step": 1300 + }, + { + "epoch": 1.06, + "grad_norm": 1.8852980241376032, + "learning_rate": 2.206400040494238e-06, + "loss": 0.3786, + "step": 1301 + }, + { + "epoch": 1.06, + "grad_norm": 1.8014277477129081, + "learning_rate": 2.2031531518488345e-06, + "loss": 0.4126, + "step": 1302 + }, + { + "epoch": 1.06, + "grad_norm": 1.844230526856602, + "learning_rate": 2.1999067709957407e-06, + "loss": 0.4005, + "step": 1303 + }, + { + "epoch": 1.06, + "grad_norm": 1.9775624321749639, + "learning_rate": 2.1966609034882825e-06, + "loss": 0.4279, + "step": 1304 + }, + { + "epoch": 1.06, + "grad_norm": 1.7752280618538778, + "learning_rate": 2.193415554878907e-06, + "loss": 0.3512, + "step": 1305 + }, + { + "epoch": 1.07, + "grad_norm": 1.8490455260047038, + "learning_rate": 2.1901707307191743e-06, + "loss": 0.3828, + "step": 1306 + }, + { + "epoch": 1.07, + "grad_norm": 5.328150832014928, + "learning_rate": 2.1869264365597477e-06, + "loss": 0.3909, + "step": 1307 + }, + { + "epoch": 1.07, + "grad_norm": 1.8437062886123319, + "learning_rate": 2.1836826779503838e-06, + "loss": 0.37, + "step": 1308 + }, + { + "epoch": 1.07, + "grad_norm": 2.008796830412121, + "learning_rate": 2.1804394604399204e-06, + "loss": 0.4077, + "step": 1309 + }, + { + "epoch": 1.07, + "grad_norm": 1.800679268264127, + "learning_rate": 2.1771967895762736e-06, + "loss": 0.3679, + "step": 1310 + }, + { + "epoch": 1.07, + "grad_norm": 1.8462133413299637, + "learning_rate": 2.173954670906423e-06, + "loss": 0.3602, + "step": 1311 + }, + { + "epoch": 1.07, + "grad_norm": 1.809976917930169, + "learning_rate": 2.1707131099764022e-06, + "loss": 0.3899, + "step": 1312 + }, + { + "epoch": 1.07, + "grad_norm": 1.8544861012991105, + "learning_rate": 2.1674721123312924e-06, + "loss": 0.3747, + "step": 1313 + }, + { + "epoch": 1.07, + "grad_norm": 1.8852269898368, + "learning_rate": 2.1642316835152106e-06, + "loss": 0.4467, + "step": 1314 + }, + { + "epoch": 1.07, + "grad_norm": 1.9122728391881445, + "learning_rate": 2.1609918290713007e-06, + "loss": 0.3402, + "step": 1315 + }, + { + "epoch": 1.07, + "grad_norm": 1.9590310432156601, + "learning_rate": 2.1577525545417254e-06, + "loss": 0.3732, + "step": 1316 + }, + { + "epoch": 1.07, + "grad_norm": 1.8276147883157745, + "learning_rate": 2.1545138654676525e-06, + "loss": 0.3953, + "step": 1317 + }, + { + "epoch": 1.08, + "grad_norm": 1.8133703409989375, + "learning_rate": 2.151275767389252e-06, + "loss": 0.3539, + "step": 1318 + }, + { + "epoch": 1.08, + "grad_norm": 1.8006183709975836, + "learning_rate": 2.148038265845678e-06, + "loss": 0.4006, + "step": 1319 + }, + { + "epoch": 1.08, + "grad_norm": 1.8947220090164194, + "learning_rate": 2.144801366375069e-06, + "loss": 0.4406, + "step": 1320 + }, + { + "epoch": 1.08, + "grad_norm": 1.8280103512099313, + "learning_rate": 2.141565074514531e-06, + "loss": 0.3815, + "step": 1321 + }, + { + "epoch": 1.08, + "grad_norm": 1.8706012819390525, + "learning_rate": 2.138329395800132e-06, + "loss": 0.3445, + "step": 1322 + }, + { + "epoch": 1.08, + "grad_norm": 1.9063701163877025, + "learning_rate": 2.1350943357668905e-06, + "loss": 0.3983, + "step": 1323 + }, + { + "epoch": 1.08, + "grad_norm": 2.033333592395131, + "learning_rate": 2.131859899948765e-06, + "loss": 0.3686, + "step": 1324 + }, + { + "epoch": 1.08, + "grad_norm": 2.0894724502176425, + "learning_rate": 2.1286260938786497e-06, + "loss": 0.3811, + "step": 1325 + }, + { + "epoch": 1.08, + "grad_norm": 1.9145691870270913, + "learning_rate": 2.125392923088358e-06, + "loss": 0.3783, + "step": 1326 + }, + { + "epoch": 1.08, + "grad_norm": 1.941699323344672, + "learning_rate": 2.1221603931086193e-06, + "loss": 0.3842, + "step": 1327 + }, + { + "epoch": 1.08, + "grad_norm": 2.0079800551627565, + "learning_rate": 2.118928509469066e-06, + "loss": 0.3885, + "step": 1328 + }, + { + "epoch": 1.08, + "grad_norm": 1.851351482771633, + "learning_rate": 2.1156972776982238e-06, + "loss": 0.3281, + "step": 1329 + }, + { + "epoch": 1.08, + "grad_norm": 1.9104937018736412, + "learning_rate": 2.112466703323504e-06, + "loss": 0.4231, + "step": 1330 + }, + { + "epoch": 1.09, + "grad_norm": 1.92374307717419, + "learning_rate": 2.1092367918711935e-06, + "loss": 0.3702, + "step": 1331 + }, + { + "epoch": 1.09, + "grad_norm": 1.8725737952655952, + "learning_rate": 2.1060075488664453e-06, + "loss": 0.3591, + "step": 1332 + }, + { + "epoch": 1.09, + "grad_norm": 1.850042908610832, + "learning_rate": 2.1027789798332688e-06, + "loss": 0.3368, + "step": 1333 + }, + { + "epoch": 1.09, + "grad_norm": 1.9324592525287807, + "learning_rate": 2.0995510902945197e-06, + "loss": 0.3676, + "step": 1334 + }, + { + "epoch": 1.09, + "grad_norm": 1.9116116557564555, + "learning_rate": 2.0963238857718934e-06, + "loss": 0.3817, + "step": 1335 + }, + { + "epoch": 1.09, + "grad_norm": 1.9148726445140338, + "learning_rate": 2.0930973717859117e-06, + "loss": 0.3704, + "step": 1336 + }, + { + "epoch": 1.09, + "grad_norm": 1.8376871831619126, + "learning_rate": 2.089871553855915e-06, + "loss": 0.3521, + "step": 1337 + }, + { + "epoch": 1.09, + "grad_norm": 2.069303925978208, + "learning_rate": 2.086646437500054e-06, + "loss": 0.3848, + "step": 1338 + }, + { + "epoch": 1.09, + "grad_norm": 1.876178784774616, + "learning_rate": 2.08342202823528e-06, + "loss": 0.3697, + "step": 1339 + }, + { + "epoch": 1.09, + "grad_norm": 1.8981757166548485, + "learning_rate": 2.0801983315773317e-06, + "loss": 0.3864, + "step": 1340 + }, + { + "epoch": 1.09, + "grad_norm": 1.8313223303972075, + "learning_rate": 2.0769753530407317e-06, + "loss": 0.3768, + "step": 1341 + }, + { + "epoch": 1.09, + "grad_norm": 1.9073767874852925, + "learning_rate": 2.073753098138773e-06, + "loss": 0.3991, + "step": 1342 + }, + { + "epoch": 1.1, + "grad_norm": 1.837313805268737, + "learning_rate": 2.0705315723835116e-06, + "loss": 0.3959, + "step": 1343 + }, + { + "epoch": 1.1, + "grad_norm": 1.9539946764244502, + "learning_rate": 2.067310781285755e-06, + "loss": 0.4305, + "step": 1344 + }, + { + "epoch": 1.1, + "grad_norm": 2.019270181770809, + "learning_rate": 2.0640907303550545e-06, + "loss": 0.3601, + "step": 1345 + }, + { + "epoch": 1.1, + "grad_norm": 2.406213238917182, + "learning_rate": 2.0608714250996954e-06, + "loss": 0.4426, + "step": 1346 + }, + { + "epoch": 1.1, + "grad_norm": 1.9236578073704644, + "learning_rate": 2.0576528710266875e-06, + "loss": 0.4038, + "step": 1347 + }, + { + "epoch": 1.1, + "grad_norm": 2.048182172212149, + "learning_rate": 2.054435073641756e-06, + "loss": 0.3746, + "step": 1348 + }, + { + "epoch": 1.1, + "grad_norm": 1.928863945427719, + "learning_rate": 2.0512180384493306e-06, + "loss": 0.3894, + "step": 1349 + }, + { + "epoch": 1.1, + "grad_norm": 1.8335551339682872, + "learning_rate": 2.0480017709525372e-06, + "loss": 0.3693, + "step": 1350 + }, + { + "epoch": 1.1, + "grad_norm": 1.9647819756067608, + "learning_rate": 2.044786276653189e-06, + "loss": 0.3781, + "step": 1351 + }, + { + "epoch": 1.1, + "grad_norm": 2.12907859222308, + "learning_rate": 2.041571561051777e-06, + "loss": 0.4171, + "step": 1352 + }, + { + "epoch": 1.1, + "grad_norm": 1.9030554994611362, + "learning_rate": 2.0383576296474595e-06, + "loss": 0.3871, + "step": 1353 + }, + { + "epoch": 1.1, + "grad_norm": 1.8482128197200014, + "learning_rate": 2.0351444879380533e-06, + "loss": 0.3801, + "step": 1354 + }, + { + "epoch": 1.11, + "grad_norm": 1.9237098856083394, + "learning_rate": 2.031932141420026e-06, + "loss": 0.397, + "step": 1355 + }, + { + "epoch": 1.11, + "grad_norm": 1.9292461604759314, + "learning_rate": 2.0287205955884812e-06, + "loss": 0.3808, + "step": 1356 + }, + { + "epoch": 1.11, + "grad_norm": 1.905891034454967, + "learning_rate": 2.025509855937156e-06, + "loss": 0.3991, + "step": 1357 + }, + { + "epoch": 1.11, + "grad_norm": 1.8451385574242787, + "learning_rate": 2.0222999279584084e-06, + "loss": 0.3801, + "step": 1358 + }, + { + "epoch": 1.11, + "grad_norm": 1.949400009057099, + "learning_rate": 2.0190908171432073e-06, + "loss": 0.3892, + "step": 1359 + }, + { + "epoch": 1.11, + "grad_norm": 1.9605363810464835, + "learning_rate": 2.0158825289811214e-06, + "loss": 0.3965, + "step": 1360 + }, + { + "epoch": 1.11, + "grad_norm": 1.8606173348780064, + "learning_rate": 2.012675068960315e-06, + "loss": 0.3954, + "step": 1361 + }, + { + "epoch": 1.11, + "grad_norm": 1.894555038278285, + "learning_rate": 2.009468442567537e-06, + "loss": 0.3872, + "step": 1362 + }, + { + "epoch": 1.11, + "grad_norm": 1.8879641436732342, + "learning_rate": 2.006262655288106e-06, + "loss": 0.381, + "step": 1363 + }, + { + "epoch": 1.11, + "grad_norm": 6.804463123370788, + "learning_rate": 2.003057712605908e-06, + "loss": 0.3598, + "step": 1364 + }, + { + "epoch": 1.11, + "grad_norm": 1.9484231062475323, + "learning_rate": 1.9998536200033843e-06, + "loss": 0.387, + "step": 1365 + }, + { + "epoch": 1.11, + "grad_norm": 1.9430636182866459, + "learning_rate": 1.996650382961521e-06, + "loss": 0.3815, + "step": 1366 + }, + { + "epoch": 1.12, + "grad_norm": 1.8099872908810362, + "learning_rate": 1.9934480069598418e-06, + "loss": 0.3931, + "step": 1367 + }, + { + "epoch": 1.12, + "grad_norm": 2.0871498559503583, + "learning_rate": 1.990246497476396e-06, + "loss": 0.3946, + "step": 1368 + }, + { + "epoch": 1.12, + "grad_norm": 1.9534152521538926, + "learning_rate": 1.9870458599877524e-06, + "loss": 0.3998, + "step": 1369 + }, + { + "epoch": 1.12, + "grad_norm": 1.9712355359168434, + "learning_rate": 1.9838460999689854e-06, + "loss": 0.3741, + "step": 1370 + }, + { + "epoch": 1.12, + "grad_norm": 1.8831191819719022, + "learning_rate": 1.980647222893671e-06, + "loss": 0.3758, + "step": 1371 + }, + { + "epoch": 1.12, + "grad_norm": 2.03493312021646, + "learning_rate": 1.977449234233875e-06, + "loss": 0.4066, + "step": 1372 + }, + { + "epoch": 1.12, + "grad_norm": 1.9837157371609282, + "learning_rate": 1.9742521394601413e-06, + "loss": 0.3757, + "step": 1373 + }, + { + "epoch": 1.12, + "grad_norm": 1.9871704920253919, + "learning_rate": 1.9710559440414867e-06, + "loss": 0.3811, + "step": 1374 + }, + { + "epoch": 1.12, + "grad_norm": 1.8609975534569105, + "learning_rate": 1.9678606534453874e-06, + "loss": 0.3709, + "step": 1375 + }, + { + "epoch": 1.12, + "grad_norm": 1.8599855946550903, + "learning_rate": 1.9646662731377737e-06, + "loss": 0.3589, + "step": 1376 + }, + { + "epoch": 1.12, + "grad_norm": 2.0183183444158224, + "learning_rate": 1.9614728085830185e-06, + "loss": 0.3521, + "step": 1377 + }, + { + "epoch": 1.12, + "grad_norm": 1.9976152320569405, + "learning_rate": 1.958280265243927e-06, + "loss": 0.3757, + "step": 1378 + }, + { + "epoch": 1.13, + "grad_norm": 1.9951401325370672, + "learning_rate": 1.9550886485817313e-06, + "loss": 0.3947, + "step": 1379 + }, + { + "epoch": 1.13, + "grad_norm": 1.9553672687038417, + "learning_rate": 1.9518979640560737e-06, + "loss": 0.3473, + "step": 1380 + }, + { + "epoch": 1.13, + "grad_norm": 1.9340367763443969, + "learning_rate": 1.9487082171250057e-06, + "loss": 0.37, + "step": 1381 + }, + { + "epoch": 1.13, + "grad_norm": 1.8996712185125788, + "learning_rate": 1.9455194132449745e-06, + "loss": 0.3924, + "step": 1382 + }, + { + "epoch": 1.13, + "grad_norm": 1.9351658663427442, + "learning_rate": 1.9423315578708126e-06, + "loss": 0.3959, + "step": 1383 + }, + { + "epoch": 1.13, + "grad_norm": 2.0174109611058504, + "learning_rate": 1.939144656455731e-06, + "loss": 0.3987, + "step": 1384 + }, + { + "epoch": 1.13, + "grad_norm": 1.76886531168205, + "learning_rate": 1.9359587144513086e-06, + "loss": 0.4277, + "step": 1385 + }, + { + "epoch": 1.13, + "grad_norm": 2.1774228741508455, + "learning_rate": 1.9327737373074834e-06, + "loss": 0.4474, + "step": 1386 + }, + { + "epoch": 1.13, + "grad_norm": 1.8335022286037221, + "learning_rate": 1.929589730472543e-06, + "loss": 0.3586, + "step": 1387 + }, + { + "epoch": 1.13, + "grad_norm": 1.944762597816562, + "learning_rate": 1.926406699393114e-06, + "loss": 0.3916, + "step": 1388 + }, + { + "epoch": 1.13, + "grad_norm": 1.9158836718088024, + "learning_rate": 1.9232246495141554e-06, + "loss": 0.3471, + "step": 1389 + }, + { + "epoch": 1.13, + "grad_norm": 1.9546368466405357, + "learning_rate": 1.920043586278947e-06, + "loss": 0.3747, + "step": 1390 + }, + { + "epoch": 1.14, + "grad_norm": 1.9070019014660136, + "learning_rate": 1.9168635151290803e-06, + "loss": 0.3524, + "step": 1391 + }, + { + "epoch": 1.14, + "grad_norm": 2.023146490194608, + "learning_rate": 1.9136844415044502e-06, + "loss": 0.3707, + "step": 1392 + }, + { + "epoch": 1.14, + "grad_norm": 1.8809251159178713, + "learning_rate": 1.910506370843246e-06, + "loss": 0.3801, + "step": 1393 + }, + { + "epoch": 1.14, + "grad_norm": 2.0409011175956784, + "learning_rate": 1.9073293085819402e-06, + "loss": 0.373, + "step": 1394 + }, + { + "epoch": 1.14, + "grad_norm": 2.0117643519136315, + "learning_rate": 1.9041532601552804e-06, + "loss": 0.3645, + "step": 1395 + }, + { + "epoch": 1.14, + "grad_norm": 1.9716378326274158, + "learning_rate": 1.9009782309962805e-06, + "loss": 0.3614, + "step": 1396 + }, + { + "epoch": 1.14, + "grad_norm": 1.9329872273189466, + "learning_rate": 1.8978042265362103e-06, + "loss": 0.3551, + "step": 1397 + }, + { + "epoch": 1.14, + "grad_norm": 1.9199554634763143, + "learning_rate": 1.8946312522045874e-06, + "loss": 0.3902, + "step": 1398 + }, + { + "epoch": 1.14, + "grad_norm": 1.9590655710866773, + "learning_rate": 1.891459313429167e-06, + "loss": 0.4142, + "step": 1399 + }, + { + "epoch": 1.14, + "grad_norm": 2.0331664011816972, + "learning_rate": 1.8882884156359324e-06, + "loss": 0.3656, + "step": 1400 + }, + { + "epoch": 1.14, + "grad_norm": 2.0472909494424583, + "learning_rate": 1.8851185642490863e-06, + "loss": 0.3886, + "step": 1401 + }, + { + "epoch": 1.14, + "grad_norm": 1.9929489595454677, + "learning_rate": 1.8819497646910408e-06, + "loss": 0.3672, + "step": 1402 + }, + { + "epoch": 1.15, + "grad_norm": 1.9438211462442658, + "learning_rate": 1.87878202238241e-06, + "loss": 0.3713, + "step": 1403 + }, + { + "epoch": 1.15, + "grad_norm": 1.9090031612890588, + "learning_rate": 1.8756153427419996e-06, + "loss": 0.3806, + "step": 1404 + }, + { + "epoch": 1.15, + "grad_norm": 1.8225379267675694, + "learning_rate": 1.872449731186796e-06, + "loss": 0.3412, + "step": 1405 + }, + { + "epoch": 1.15, + "grad_norm": 1.7944071121109437, + "learning_rate": 1.86928519313196e-06, + "loss": 0.3642, + "step": 1406 + }, + { + "epoch": 1.15, + "grad_norm": 1.9414616279338623, + "learning_rate": 1.8661217339908142e-06, + "loss": 0.3806, + "step": 1407 + }, + { + "epoch": 1.15, + "grad_norm": 1.944356212181711, + "learning_rate": 1.8629593591748374e-06, + "loss": 0.3987, + "step": 1408 + }, + { + "epoch": 1.15, + "grad_norm": 1.857841085738498, + "learning_rate": 1.8597980740936528e-06, + "loss": 0.3899, + "step": 1409 + }, + { + "epoch": 1.15, + "grad_norm": 1.8710356295384132, + "learning_rate": 1.8566378841550205e-06, + "loss": 0.3784, + "step": 1410 + }, + { + "epoch": 1.15, + "grad_norm": 1.8728296119496737, + "learning_rate": 1.8534787947648247e-06, + "loss": 0.3867, + "step": 1411 + }, + { + "epoch": 1.15, + "grad_norm": 1.8738844694805654, + "learning_rate": 1.8503208113270687e-06, + "loss": 0.3696, + "step": 1412 + }, + { + "epoch": 1.15, + "grad_norm": 1.9649370685779552, + "learning_rate": 1.8471639392438648e-06, + "loss": 0.3986, + "step": 1413 + }, + { + "epoch": 1.15, + "grad_norm": 1.7859555369523812, + "learning_rate": 1.8440081839154222e-06, + "loss": 0.3871, + "step": 1414 + }, + { + "epoch": 1.16, + "grad_norm": 1.8610430021362592, + "learning_rate": 1.840853550740041e-06, + "loss": 0.333, + "step": 1415 + }, + { + "epoch": 1.16, + "grad_norm": 1.9871037672382785, + "learning_rate": 1.8377000451141013e-06, + "loss": 0.3655, + "step": 1416 + }, + { + "epoch": 1.16, + "grad_norm": 2.0510993717790544, + "learning_rate": 1.8345476724320549e-06, + "loss": 0.3345, + "step": 1417 + }, + { + "epoch": 1.16, + "grad_norm": 2.022865297999793, + "learning_rate": 1.8313964380864157e-06, + "loss": 0.4238, + "step": 1418 + }, + { + "epoch": 1.16, + "grad_norm": 2.0272213314003786, + "learning_rate": 1.8282463474677485e-06, + "loss": 0.3775, + "step": 1419 + }, + { + "epoch": 1.16, + "grad_norm": 2.006744012043913, + "learning_rate": 1.825097405964665e-06, + "loss": 0.3886, + "step": 1420 + }, + { + "epoch": 1.16, + "grad_norm": 2.0596399522136406, + "learning_rate": 1.8219496189638065e-06, + "loss": 0.4091, + "step": 1421 + }, + { + "epoch": 1.16, + "grad_norm": 1.8816895162930982, + "learning_rate": 1.8188029918498434e-06, + "loss": 0.4065, + "step": 1422 + }, + { + "epoch": 1.16, + "grad_norm": 1.9988370328142775, + "learning_rate": 1.8156575300054607e-06, + "loss": 0.3968, + "step": 1423 + }, + { + "epoch": 1.16, + "grad_norm": 2.0379288149529216, + "learning_rate": 1.8125132388113497e-06, + "loss": 0.3893, + "step": 1424 + }, + { + "epoch": 1.16, + "grad_norm": 1.8764951987892278, + "learning_rate": 1.8093701236461999e-06, + "loss": 0.3757, + "step": 1425 + }, + { + "epoch": 1.16, + "grad_norm": 1.9911843473469748, + "learning_rate": 1.806228189886688e-06, + "loss": 0.3891, + "step": 1426 + }, + { + "epoch": 1.17, + "grad_norm": 1.9631453513585595, + "learning_rate": 1.8030874429074701e-06, + "loss": 0.3969, + "step": 1427 + }, + { + "epoch": 1.17, + "grad_norm": 1.8998526626952037, + "learning_rate": 1.7999478880811735e-06, + "loss": 0.3919, + "step": 1428 + }, + { + "epoch": 1.17, + "grad_norm": 1.8805553933080315, + "learning_rate": 1.7968095307783845e-06, + "loss": 0.3767, + "step": 1429 + }, + { + "epoch": 1.17, + "grad_norm": 1.9958093732421776, + "learning_rate": 1.7936723763676426e-06, + "loss": 0.3861, + "step": 1430 + }, + { + "epoch": 1.17, + "grad_norm": 1.8587137598489651, + "learning_rate": 1.7905364302154264e-06, + "loss": 0.3289, + "step": 1431 + }, + { + "epoch": 1.17, + "grad_norm": 2.0380004642313785, + "learning_rate": 1.7874016976861504e-06, + "loss": 0.3531, + "step": 1432 + }, + { + "epoch": 1.17, + "grad_norm": 1.9171820086465794, + "learning_rate": 1.784268184142154e-06, + "loss": 0.3986, + "step": 1433 + }, + { + "epoch": 1.17, + "grad_norm": 1.95855879390137, + "learning_rate": 1.7811358949436874e-06, + "loss": 0.3402, + "step": 1434 + }, + { + "epoch": 1.17, + "grad_norm": 1.9995990338040457, + "learning_rate": 1.7780048354489101e-06, + "loss": 0.3599, + "step": 1435 + }, + { + "epoch": 1.17, + "grad_norm": 1.9243145774410442, + "learning_rate": 1.7748750110138768e-06, + "loss": 0.4399, + "step": 1436 + }, + { + "epoch": 1.17, + "grad_norm": 2.279285862974166, + "learning_rate": 1.7717464269925288e-06, + "loss": 0.3614, + "step": 1437 + }, + { + "epoch": 1.17, + "grad_norm": 1.9005095716347011, + "learning_rate": 1.7686190887366875e-06, + "loss": 0.3665, + "step": 1438 + }, + { + "epoch": 1.18, + "grad_norm": 1.8076423185524721, + "learning_rate": 1.7654930015960401e-06, + "loss": 0.3408, + "step": 1439 + }, + { + "epoch": 1.18, + "grad_norm": 1.8762893879880087, + "learning_rate": 1.762368170918136e-06, + "loss": 0.39, + "step": 1440 + }, + { + "epoch": 1.18, + "grad_norm": 2.0153368993119556, + "learning_rate": 1.7592446020483762e-06, + "loss": 0.3539, + "step": 1441 + }, + { + "epoch": 1.18, + "grad_norm": 1.9585515808006808, + "learning_rate": 1.7561223003299994e-06, + "loss": 0.3956, + "step": 1442 + }, + { + "epoch": 1.18, + "grad_norm": 2.124848103864915, + "learning_rate": 1.7530012711040794e-06, + "loss": 0.4119, + "step": 1443 + }, + { + "epoch": 1.18, + "grad_norm": 2.012402459921111, + "learning_rate": 1.749881519709514e-06, + "loss": 0.408, + "step": 1444 + }, + { + "epoch": 1.18, + "grad_norm": 1.9649268732755643, + "learning_rate": 1.7467630514830136e-06, + "loss": 0.3283, + "step": 1445 + }, + { + "epoch": 1.18, + "grad_norm": 1.8596310758669552, + "learning_rate": 1.7436458717590931e-06, + "loss": 0.4354, + "step": 1446 + }, + { + "epoch": 1.18, + "grad_norm": 1.9102148486337966, + "learning_rate": 1.7405299858700648e-06, + "loss": 0.3954, + "step": 1447 + }, + { + "epoch": 1.18, + "grad_norm": 1.8553487771224224, + "learning_rate": 1.737415399146027e-06, + "loss": 0.3668, + "step": 1448 + }, + { + "epoch": 1.18, + "grad_norm": 2.1142472778200756, + "learning_rate": 1.7343021169148554e-06, + "loss": 0.3745, + "step": 1449 + }, + { + "epoch": 1.18, + "grad_norm": 1.9058887276269199, + "learning_rate": 1.7311901445021955e-06, + "loss": 0.3818, + "step": 1450 + }, + { + "epoch": 1.19, + "grad_norm": 2.0622661899571666, + "learning_rate": 1.7280794872314499e-06, + "loss": 0.3961, + "step": 1451 + }, + { + "epoch": 1.19, + "grad_norm": 1.8962754770592172, + "learning_rate": 1.7249701504237737e-06, + "loss": 0.3586, + "step": 1452 + }, + { + "epoch": 1.19, + "grad_norm": 1.8165490259194481, + "learning_rate": 1.7218621393980606e-06, + "loss": 0.3311, + "step": 1453 + }, + { + "epoch": 1.19, + "grad_norm": 1.9977375977133494, + "learning_rate": 1.7187554594709396e-06, + "loss": 0.3674, + "step": 1454 + }, + { + "epoch": 1.19, + "grad_norm": 1.8504323227168384, + "learning_rate": 1.7156501159567607e-06, + "loss": 0.3743, + "step": 1455 + }, + { + "epoch": 1.19, + "grad_norm": 1.9541250949627105, + "learning_rate": 1.7125461141675881e-06, + "loss": 0.3812, + "step": 1456 + }, + { + "epoch": 1.19, + "grad_norm": 1.993766367538168, + "learning_rate": 1.7094434594131914e-06, + "loss": 0.355, + "step": 1457 + }, + { + "epoch": 1.19, + "grad_norm": 1.851815452351873, + "learning_rate": 1.7063421570010349e-06, + "loss": 0.3792, + "step": 1458 + }, + { + "epoch": 1.19, + "grad_norm": 1.8699896985814497, + "learning_rate": 1.7032422122362704e-06, + "loss": 0.345, + "step": 1459 + }, + { + "epoch": 1.19, + "grad_norm": 1.941362367589001, + "learning_rate": 1.700143630421727e-06, + "loss": 0.3735, + "step": 1460 + }, + { + "epoch": 1.19, + "grad_norm": 1.844833441576945, + "learning_rate": 1.6970464168579034e-06, + "loss": 0.3883, + "step": 1461 + }, + { + "epoch": 1.19, + "grad_norm": 1.9382330200940399, + "learning_rate": 1.6939505768429548e-06, + "loss": 0.3451, + "step": 1462 + }, + { + "epoch": 1.2, + "grad_norm": 1.9404379114850492, + "learning_rate": 1.6908561156726894e-06, + "loss": 0.3886, + "step": 1463 + }, + { + "epoch": 1.2, + "grad_norm": 1.89967752240511, + "learning_rate": 1.6877630386405567e-06, + "loss": 0.4322, + "step": 1464 + }, + { + "epoch": 1.2, + "grad_norm": 1.9542258627644085, + "learning_rate": 1.6846713510376363e-06, + "loss": 0.4143, + "step": 1465 + }, + { + "epoch": 1.2, + "grad_norm": 2.0224476812069305, + "learning_rate": 1.6815810581526337e-06, + "loss": 0.3885, + "step": 1466 + }, + { + "epoch": 1.2, + "grad_norm": 1.9984358815769925, + "learning_rate": 1.6784921652718666e-06, + "loss": 0.326, + "step": 1467 + }, + { + "epoch": 1.2, + "grad_norm": 1.9112545672749313, + "learning_rate": 1.675404677679259e-06, + "loss": 0.3818, + "step": 1468 + }, + { + "epoch": 1.2, + "grad_norm": 1.8535662369823578, + "learning_rate": 1.6723186006563309e-06, + "loss": 0.348, + "step": 1469 + }, + { + "epoch": 1.2, + "grad_norm": 1.9484817526163822, + "learning_rate": 1.6692339394821877e-06, + "loss": 0.3357, + "step": 1470 + }, + { + "epoch": 1.2, + "grad_norm": 1.898163029912662, + "learning_rate": 1.6661506994335164e-06, + "loss": 0.3755, + "step": 1471 + }, + { + "epoch": 1.2, + "grad_norm": 1.8795795559493234, + "learning_rate": 1.6630688857845678e-06, + "loss": 0.3616, + "step": 1472 + }, + { + "epoch": 1.2, + "grad_norm": 1.9167503410588418, + "learning_rate": 1.6599885038071566e-06, + "loss": 0.3592, + "step": 1473 + }, + { + "epoch": 1.2, + "grad_norm": 1.9765253259894953, + "learning_rate": 1.6569095587706485e-06, + "loss": 0.3953, + "step": 1474 + }, + { + "epoch": 1.21, + "grad_norm": 1.9352433621405845, + "learning_rate": 1.6538320559419488e-06, + "loss": 0.3528, + "step": 1475 + }, + { + "epoch": 1.21, + "grad_norm": 2.0111021011512125, + "learning_rate": 1.6507560005854977e-06, + "loss": 0.407, + "step": 1476 + }, + { + "epoch": 1.21, + "grad_norm": 1.8339393905209536, + "learning_rate": 1.6476813979632589e-06, + "loss": 0.3668, + "step": 1477 + }, + { + "epoch": 1.21, + "grad_norm": 1.9309495145983575, + "learning_rate": 1.6446082533347096e-06, + "loss": 0.4106, + "step": 1478 + }, + { + "epoch": 1.21, + "grad_norm": 1.8708341753950297, + "learning_rate": 1.641536571956835e-06, + "loss": 0.3749, + "step": 1479 + }, + { + "epoch": 1.21, + "grad_norm": 1.8244009733234272, + "learning_rate": 1.6384663590841154e-06, + "loss": 0.3832, + "step": 1480 + }, + { + "epoch": 1.21, + "grad_norm": 1.8878853394194013, + "learning_rate": 1.6353976199685222e-06, + "loss": 0.3539, + "step": 1481 + }, + { + "epoch": 1.21, + "grad_norm": 1.8830734244466278, + "learning_rate": 1.6323303598595006e-06, + "loss": 0.3852, + "step": 1482 + }, + { + "epoch": 1.21, + "grad_norm": 1.866253132730359, + "learning_rate": 1.6292645840039697e-06, + "loss": 0.364, + "step": 1483 + }, + { + "epoch": 1.21, + "grad_norm": 1.977321954101075, + "learning_rate": 1.6262002976463098e-06, + "loss": 0.3866, + "step": 1484 + }, + { + "epoch": 1.21, + "grad_norm": 1.9753878011905568, + "learning_rate": 1.62313750602835e-06, + "loss": 0.3999, + "step": 1485 + }, + { + "epoch": 1.21, + "grad_norm": 1.9461948334927384, + "learning_rate": 1.6200762143893659e-06, + "loss": 0.3769, + "step": 1486 + }, + { + "epoch": 1.22, + "grad_norm": 1.9597078370114984, + "learning_rate": 1.6170164279660656e-06, + "loss": 0.3546, + "step": 1487 + }, + { + "epoch": 1.22, + "grad_norm": 2.0333727955548735, + "learning_rate": 1.6139581519925818e-06, + "loss": 0.3631, + "step": 1488 + }, + { + "epoch": 1.22, + "grad_norm": 1.8957200128798963, + "learning_rate": 1.6109013917004657e-06, + "loss": 0.3738, + "step": 1489 + }, + { + "epoch": 1.22, + "grad_norm": 1.8758015207075704, + "learning_rate": 1.6078461523186722e-06, + "loss": 0.3511, + "step": 1490 + }, + { + "epoch": 1.22, + "grad_norm": 1.9539261883496823, + "learning_rate": 1.6047924390735587e-06, + "loss": 0.4074, + "step": 1491 + }, + { + "epoch": 1.22, + "grad_norm": 2.046216911945662, + "learning_rate": 1.6017402571888677e-06, + "loss": 0.3729, + "step": 1492 + }, + { + "epoch": 1.22, + "grad_norm": 2.0334239477316194, + "learning_rate": 1.5986896118857247e-06, + "loss": 0.3999, + "step": 1493 + }, + { + "epoch": 1.22, + "grad_norm": 2.0768274033669556, + "learning_rate": 1.5956405083826266e-06, + "loss": 0.3982, + "step": 1494 + }, + { + "epoch": 1.22, + "grad_norm": 1.9997134218487143, + "learning_rate": 1.592592951895432e-06, + "loss": 0.4319, + "step": 1495 + }, + { + "epoch": 1.22, + "grad_norm": 1.9000589337955354, + "learning_rate": 1.5895469476373545e-06, + "loss": 0.3813, + "step": 1496 + }, + { + "epoch": 1.22, + "grad_norm": 1.8787692854188953, + "learning_rate": 1.5865025008189501e-06, + "loss": 0.3801, + "step": 1497 + }, + { + "epoch": 1.22, + "grad_norm": 1.8346902202639779, + "learning_rate": 1.5834596166481132e-06, + "loss": 0.3533, + "step": 1498 + }, + { + "epoch": 1.23, + "grad_norm": 1.8993496821666367, + "learning_rate": 1.5804183003300627e-06, + "loss": 0.429, + "step": 1499 + }, + { + "epoch": 1.23, + "grad_norm": 2.342530229905022, + "learning_rate": 1.5773785570673378e-06, + "loss": 0.3356, + "step": 1500 + }, + { + "epoch": 1.23, + "grad_norm": 2.1048882391009127, + "learning_rate": 1.5743403920597856e-06, + "loss": 0.3896, + "step": 1501 + }, + { + "epoch": 1.23, + "grad_norm": 1.8528209728378324, + "learning_rate": 1.5713038105045535e-06, + "loss": 0.3307, + "step": 1502 + }, + { + "epoch": 1.23, + "grad_norm": 1.9057632190431548, + "learning_rate": 1.5682688175960797e-06, + "loss": 0.3806, + "step": 1503 + }, + { + "epoch": 1.23, + "grad_norm": 1.8724905465304538, + "learning_rate": 1.5652354185260848e-06, + "loss": 0.3637, + "step": 1504 + }, + { + "epoch": 1.23, + "grad_norm": 1.8484069152287292, + "learning_rate": 1.5622036184835648e-06, + "loss": 0.3161, + "step": 1505 + }, + { + "epoch": 1.23, + "grad_norm": 1.8399814687678377, + "learning_rate": 1.559173422654778e-06, + "loss": 0.3745, + "step": 1506 + }, + { + "epoch": 1.23, + "grad_norm": 1.8838641942793775, + "learning_rate": 1.5561448362232404e-06, + "loss": 0.3537, + "step": 1507 + }, + { + "epoch": 1.23, + "grad_norm": 1.8623848433104377, + "learning_rate": 1.5531178643697142e-06, + "loss": 0.3624, + "step": 1508 + }, + { + "epoch": 1.23, + "grad_norm": 1.8997144759052735, + "learning_rate": 1.5500925122721988e-06, + "loss": 0.3679, + "step": 1509 + }, + { + "epoch": 1.23, + "grad_norm": 1.8976582272389906, + "learning_rate": 1.5470687851059235e-06, + "loss": 0.3736, + "step": 1510 + }, + { + "epoch": 1.24, + "grad_norm": 1.8750760623537808, + "learning_rate": 1.5440466880433388e-06, + "loss": 0.3735, + "step": 1511 + }, + { + "epoch": 1.24, + "grad_norm": 1.990180186983658, + "learning_rate": 1.5410262262541065e-06, + "loss": 0.3797, + "step": 1512 + }, + { + "epoch": 1.24, + "grad_norm": 1.8820633605632435, + "learning_rate": 1.538007404905089e-06, + "loss": 0.3659, + "step": 1513 + }, + { + "epoch": 1.24, + "grad_norm": 1.9458293982836543, + "learning_rate": 1.5349902291603441e-06, + "loss": 0.4092, + "step": 1514 + }, + { + "epoch": 1.24, + "grad_norm": 1.822097097325058, + "learning_rate": 1.5319747041811158e-06, + "loss": 0.3276, + "step": 1515 + }, + { + "epoch": 1.24, + "grad_norm": 2.0516824372881457, + "learning_rate": 1.528960835125822e-06, + "loss": 0.4232, + "step": 1516 + }, + { + "epoch": 1.24, + "grad_norm": 2.0624060387577816, + "learning_rate": 1.5259486271500489e-06, + "loss": 0.3996, + "step": 1517 + }, + { + "epoch": 1.24, + "grad_norm": 1.9158764361943028, + "learning_rate": 1.522938085406542e-06, + "loss": 0.3728, + "step": 1518 + }, + { + "epoch": 1.24, + "grad_norm": 1.9071590654189663, + "learning_rate": 1.5199292150451956e-06, + "loss": 0.3459, + "step": 1519 + }, + { + "epoch": 1.24, + "grad_norm": 1.9532115896688163, + "learning_rate": 1.5169220212130449e-06, + "loss": 0.3513, + "step": 1520 + }, + { + "epoch": 1.24, + "grad_norm": 1.9901825773245059, + "learning_rate": 1.5139165090542574e-06, + "loss": 0.3468, + "step": 1521 + }, + { + "epoch": 1.24, + "grad_norm": 1.7913388603914477, + "learning_rate": 1.510912683710124e-06, + "loss": 0.3381, + "step": 1522 + }, + { + "epoch": 1.24, + "grad_norm": 1.8270379040698477, + "learning_rate": 1.5079105503190497e-06, + "loss": 0.3873, + "step": 1523 + }, + { + "epoch": 1.25, + "grad_norm": 1.9259224146444094, + "learning_rate": 1.5049101140165453e-06, + "loss": 0.3553, + "step": 1524 + }, + { + "epoch": 1.25, + "grad_norm": 1.7933642267566716, + "learning_rate": 1.501911379935219e-06, + "loss": 0.3928, + "step": 1525 + }, + { + "epoch": 1.25, + "grad_norm": 1.859002957520952, + "learning_rate": 1.498914353204767e-06, + "loss": 0.3331, + "step": 1526 + }, + { + "epoch": 1.25, + "grad_norm": 1.9280095918192017, + "learning_rate": 1.4959190389519646e-06, + "loss": 0.3902, + "step": 1527 + }, + { + "epoch": 1.25, + "grad_norm": 1.9929705610530277, + "learning_rate": 1.492925442300658e-06, + "loss": 0.3765, + "step": 1528 + }, + { + "epoch": 1.25, + "grad_norm": 2.02617558936789, + "learning_rate": 1.4899335683717546e-06, + "loss": 0.3815, + "step": 1529 + }, + { + "epoch": 1.25, + "grad_norm": 1.8532248246777345, + "learning_rate": 1.4869434222832157e-06, + "loss": 0.3998, + "step": 1530 + }, + { + "epoch": 1.25, + "grad_norm": 1.8616511215661515, + "learning_rate": 1.4839550091500464e-06, + "loss": 0.4005, + "step": 1531 + }, + { + "epoch": 1.25, + "grad_norm": 1.9696593290003677, + "learning_rate": 1.4809683340842885e-06, + "loss": 0.4136, + "step": 1532 + }, + { + "epoch": 1.25, + "grad_norm": 1.9439323576237217, + "learning_rate": 1.477983402195008e-06, + "loss": 0.3674, + "step": 1533 + }, + { + "epoch": 1.25, + "grad_norm": 1.8858064066643994, + "learning_rate": 1.475000218588291e-06, + "loss": 0.3505, + "step": 1534 + }, + { + "epoch": 1.25, + "grad_norm": 1.9565923900750009, + "learning_rate": 1.4720187883672337e-06, + "loss": 0.379, + "step": 1535 + }, + { + "epoch": 1.26, + "grad_norm": 1.9482950589580994, + "learning_rate": 1.4690391166319307e-06, + "loss": 0.3962, + "step": 1536 + }, + { + "epoch": 1.26, + "grad_norm": 1.979462387227227, + "learning_rate": 1.4660612084794701e-06, + "loss": 0.3662, + "step": 1537 + }, + { + "epoch": 1.26, + "grad_norm": 1.894203355197371, + "learning_rate": 1.4630850690039221e-06, + "loss": 0.3703, + "step": 1538 + }, + { + "epoch": 1.26, + "grad_norm": 1.8798042105520323, + "learning_rate": 1.460110703296333e-06, + "loss": 0.3631, + "step": 1539 + }, + { + "epoch": 1.26, + "grad_norm": 1.9687008779986372, + "learning_rate": 1.4571381164447137e-06, + "loss": 0.4081, + "step": 1540 + }, + { + "epoch": 1.26, + "grad_norm": 2.043706332156422, + "learning_rate": 1.454167313534031e-06, + "loss": 0.3629, + "step": 1541 + }, + { + "epoch": 1.26, + "grad_norm": 1.9336401989651433, + "learning_rate": 1.4511982996462038e-06, + "loss": 0.4042, + "step": 1542 + }, + { + "epoch": 1.26, + "grad_norm": 1.9550529998108908, + "learning_rate": 1.4482310798600852e-06, + "loss": 0.3768, + "step": 1543 + }, + { + "epoch": 1.26, + "grad_norm": 1.874147928818456, + "learning_rate": 1.4452656592514633e-06, + "loss": 0.4125, + "step": 1544 + }, + { + "epoch": 1.26, + "grad_norm": 1.848295970105597, + "learning_rate": 1.442302042893048e-06, + "loss": 0.3646, + "step": 1545 + }, + { + "epoch": 1.26, + "grad_norm": 1.991422406332833, + "learning_rate": 1.439340235854462e-06, + "loss": 0.3885, + "step": 1546 + }, + { + "epoch": 1.26, + "grad_norm": 1.89855710617557, + "learning_rate": 1.436380243202233e-06, + "loss": 0.3658, + "step": 1547 + }, + { + "epoch": 1.27, + "grad_norm": 1.8657910310229384, + "learning_rate": 1.4334220699997856e-06, + "loss": 0.3659, + "step": 1548 + }, + { + "epoch": 1.27, + "grad_norm": 1.9035891506078888, + "learning_rate": 1.4304657213074314e-06, + "loss": 0.3662, + "step": 1549 + }, + { + "epoch": 1.27, + "grad_norm": 1.9026573701280374, + "learning_rate": 1.4275112021823618e-06, + "loss": 0.3712, + "step": 1550 + }, + { + "epoch": 1.27, + "grad_norm": 1.9342408780305267, + "learning_rate": 1.4245585176786363e-06, + "loss": 0.355, + "step": 1551 + }, + { + "epoch": 1.27, + "grad_norm": 1.8785254217068754, + "learning_rate": 1.4216076728471794e-06, + "loss": 0.3985, + "step": 1552 + }, + { + "epoch": 1.27, + "grad_norm": 1.9602955113202258, + "learning_rate": 1.4186586727357649e-06, + "loss": 0.4063, + "step": 1553 + }, + { + "epoch": 1.27, + "grad_norm": 2.083823151902659, + "learning_rate": 1.4157115223890136e-06, + "loss": 0.4121, + "step": 1554 + }, + { + "epoch": 1.27, + "grad_norm": 1.8676871403375772, + "learning_rate": 1.4127662268483818e-06, + "loss": 0.3912, + "step": 1555 + }, + { + "epoch": 1.27, + "grad_norm": 1.9120128683776039, + "learning_rate": 1.4098227911521523e-06, + "loss": 0.3453, + "step": 1556 + }, + { + "epoch": 1.27, + "grad_norm": 1.844790264464269, + "learning_rate": 1.4068812203354264e-06, + "loss": 0.3666, + "step": 1557 + }, + { + "epoch": 1.27, + "grad_norm": 1.8477236162312085, + "learning_rate": 1.4039415194301159e-06, + "loss": 0.3652, + "step": 1558 + }, + { + "epoch": 1.27, + "grad_norm": 1.9200270211079769, + "learning_rate": 1.4010036934649334e-06, + "loss": 0.3755, + "step": 1559 + }, + { + "epoch": 1.28, + "grad_norm": 1.8353558471804892, + "learning_rate": 1.3980677474653838e-06, + "loss": 0.3653, + "step": 1560 + }, + { + "epoch": 1.28, + "grad_norm": 1.9621989060334357, + "learning_rate": 1.3951336864537572e-06, + "loss": 0.4104, + "step": 1561 + }, + { + "epoch": 1.28, + "grad_norm": 1.8245538722983388, + "learning_rate": 1.3922015154491194e-06, + "loss": 0.3991, + "step": 1562 + }, + { + "epoch": 1.28, + "grad_norm": 1.933539870056334, + "learning_rate": 1.3892712394673002e-06, + "loss": 0.3877, + "step": 1563 + }, + { + "epoch": 1.28, + "grad_norm": 1.8275785324682217, + "learning_rate": 1.3863428635208915e-06, + "loss": 0.3546, + "step": 1564 + }, + { + "epoch": 1.28, + "grad_norm": 2.0450836317829215, + "learning_rate": 1.3834163926192318e-06, + "loss": 0.3847, + "step": 1565 + }, + { + "epoch": 1.28, + "grad_norm": 3.523986698344347, + "learning_rate": 1.380491831768403e-06, + "loss": 0.3502, + "step": 1566 + }, + { + "epoch": 1.28, + "grad_norm": 1.9164812764116064, + "learning_rate": 1.3775691859712193e-06, + "loss": 0.309, + "step": 1567 + }, + { + "epoch": 1.28, + "grad_norm": 2.0951493120042604, + "learning_rate": 1.3746484602272178e-06, + "loss": 0.3678, + "step": 1568 + }, + { + "epoch": 1.28, + "grad_norm": 1.8843177010635455, + "learning_rate": 1.3717296595326527e-06, + "loss": 0.358, + "step": 1569 + }, + { + "epoch": 1.28, + "grad_norm": 1.9562282189438478, + "learning_rate": 1.3688127888804837e-06, + "loss": 0.4021, + "step": 1570 + }, + { + "epoch": 1.28, + "grad_norm": 1.997781626544885, + "learning_rate": 1.36589785326037e-06, + "loss": 0.4158, + "step": 1571 + }, + { + "epoch": 1.29, + "grad_norm": 1.8805954764404564, + "learning_rate": 1.3629848576586604e-06, + "loss": 0.3678, + "step": 1572 + }, + { + "epoch": 1.29, + "grad_norm": 2.037723153555198, + "learning_rate": 1.3600738070583858e-06, + "loss": 0.3611, + "step": 1573 + }, + { + "epoch": 1.29, + "grad_norm": 1.9504207408498462, + "learning_rate": 1.3571647064392467e-06, + "loss": 0.4096, + "step": 1574 + }, + { + "epoch": 1.29, + "grad_norm": 2.0573869926356494, + "learning_rate": 1.3542575607776117e-06, + "loss": 0.3698, + "step": 1575 + }, + { + "epoch": 1.29, + "grad_norm": 1.9648011988919714, + "learning_rate": 1.3513523750465049e-06, + "loss": 0.3557, + "step": 1576 + }, + { + "epoch": 1.29, + "grad_norm": 2.0566628239070077, + "learning_rate": 1.3484491542155941e-06, + "loss": 0.4099, + "step": 1577 + }, + { + "epoch": 1.29, + "grad_norm": 1.840088910062188, + "learning_rate": 1.3455479032511903e-06, + "loss": 0.3759, + "step": 1578 + }, + { + "epoch": 1.29, + "grad_norm": 1.916068103431673, + "learning_rate": 1.3426486271162326e-06, + "loss": 0.36, + "step": 1579 + }, + { + "epoch": 1.29, + "grad_norm": 1.932989091441797, + "learning_rate": 1.3397513307702817e-06, + "loss": 0.3658, + "step": 1580 + }, + { + "epoch": 1.29, + "grad_norm": 1.8629067871512175, + "learning_rate": 1.3368560191695126e-06, + "loss": 0.3562, + "step": 1581 + }, + { + "epoch": 1.29, + "grad_norm": 2.0118302341661307, + "learning_rate": 1.3339626972667048e-06, + "loss": 0.3878, + "step": 1582 + }, + { + "epoch": 1.29, + "grad_norm": 1.9124583307461076, + "learning_rate": 1.3310713700112348e-06, + "loss": 0.3809, + "step": 1583 + }, + { + "epoch": 1.3, + "grad_norm": 1.9774861213509043, + "learning_rate": 1.328182042349065e-06, + "loss": 0.4137, + "step": 1584 + }, + { + "epoch": 1.3, + "grad_norm": 1.9114216906066048, + "learning_rate": 1.3252947192227388e-06, + "loss": 0.3837, + "step": 1585 + }, + { + "epoch": 1.3, + "grad_norm": 1.8560468375199388, + "learning_rate": 1.3224094055713713e-06, + "loss": 0.3603, + "step": 1586 + }, + { + "epoch": 1.3, + "grad_norm": 1.9212128604014926, + "learning_rate": 1.3195261063306381e-06, + "loss": 0.3458, + "step": 1587 + }, + { + "epoch": 1.3, + "grad_norm": 1.9251208352537634, + "learning_rate": 1.316644826432772e-06, + "loss": 0.3844, + "step": 1588 + }, + { + "epoch": 1.3, + "grad_norm": 1.883081065391109, + "learning_rate": 1.313765570806547e-06, + "loss": 0.4208, + "step": 1589 + }, + { + "epoch": 1.3, + "grad_norm": 1.8564972529452957, + "learning_rate": 1.3108883443772779e-06, + "loss": 0.3622, + "step": 1590 + }, + { + "epoch": 1.3, + "grad_norm": 1.9725309818034906, + "learning_rate": 1.3080131520668075e-06, + "loss": 0.3489, + "step": 1591 + }, + { + "epoch": 1.3, + "grad_norm": 1.9747828638689664, + "learning_rate": 1.3051399987934988e-06, + "loss": 0.38, + "step": 1592 + }, + { + "epoch": 1.3, + "grad_norm": 1.8498395134731278, + "learning_rate": 1.3022688894722271e-06, + "loss": 0.3797, + "step": 1593 + }, + { + "epoch": 1.3, + "grad_norm": 1.8845414148933772, + "learning_rate": 1.2993998290143698e-06, + "loss": 0.3335, + "step": 1594 + }, + { + "epoch": 1.3, + "grad_norm": 1.9610318168301932, + "learning_rate": 1.296532822327801e-06, + "loss": 0.3769, + "step": 1595 + }, + { + "epoch": 1.31, + "grad_norm": 1.8917429842068785, + "learning_rate": 1.2936678743168813e-06, + "loss": 0.3981, + "step": 1596 + }, + { + "epoch": 1.31, + "grad_norm": 2.005525949740854, + "learning_rate": 1.29080498988245e-06, + "loss": 0.3789, + "step": 1597 + }, + { + "epoch": 1.31, + "grad_norm": 1.891996578027132, + "learning_rate": 1.2879441739218152e-06, + "loss": 0.3906, + "step": 1598 + }, + { + "epoch": 1.31, + "grad_norm": 2.0224573297517114, + "learning_rate": 1.285085431328748e-06, + "loss": 0.3852, + "step": 1599 + }, + { + "epoch": 1.31, + "grad_norm": 1.9933585122271171, + "learning_rate": 1.282228766993472e-06, + "loss": 0.3811, + "step": 1600 + }, + { + "epoch": 1.31, + "grad_norm": 1.9655771579152717, + "learning_rate": 1.2793741858026565e-06, + "loss": 0.3799, + "step": 1601 + }, + { + "epoch": 1.31, + "grad_norm": 1.8953068551718162, + "learning_rate": 1.2765216926394047e-06, + "loss": 0.3508, + "step": 1602 + }, + { + "epoch": 1.31, + "grad_norm": 1.8702448937265155, + "learning_rate": 1.2736712923832526e-06, + "loss": 0.3427, + "step": 1603 + }, + { + "epoch": 1.31, + "grad_norm": 1.9279047888369216, + "learning_rate": 1.2708229899101505e-06, + "loss": 0.3755, + "step": 1604 + }, + { + "epoch": 1.31, + "grad_norm": 1.8867926377124098, + "learning_rate": 1.2679767900924647e-06, + "loss": 0.3366, + "step": 1605 + }, + { + "epoch": 1.31, + "grad_norm": 1.8256946570291102, + "learning_rate": 1.2651326977989629e-06, + "loss": 0.3419, + "step": 1606 + }, + { + "epoch": 1.31, + "grad_norm": 1.8845001674022432, + "learning_rate": 1.2622907178948074e-06, + "loss": 0.3593, + "step": 1607 + }, + { + "epoch": 1.32, + "grad_norm": 1.8041550297275601, + "learning_rate": 1.2594508552415474e-06, + "loss": 0.3565, + "step": 1608 + }, + { + "epoch": 1.32, + "grad_norm": 1.929162466271085, + "learning_rate": 1.2566131146971105e-06, + "loss": 0.346, + "step": 1609 + }, + { + "epoch": 1.32, + "grad_norm": 1.9783530922620556, + "learning_rate": 1.2537775011157943e-06, + "loss": 0.3655, + "step": 1610 + }, + { + "epoch": 1.32, + "grad_norm": 1.9493980516637623, + "learning_rate": 1.2509440193482564e-06, + "loss": 0.417, + "step": 1611 + }, + { + "epoch": 1.32, + "grad_norm": 1.8895548928491517, + "learning_rate": 1.2481126742415098e-06, + "loss": 0.3731, + "step": 1612 + }, + { + "epoch": 1.32, + "grad_norm": 1.874868433424839, + "learning_rate": 1.2452834706389122e-06, + "loss": 0.3743, + "step": 1613 + }, + { + "epoch": 1.32, + "grad_norm": 1.917114604759422, + "learning_rate": 1.2424564133801553e-06, + "loss": 0.3412, + "step": 1614 + }, + { + "epoch": 1.32, + "grad_norm": 1.9354723425395528, + "learning_rate": 1.2396315073012636e-06, + "loss": 0.3564, + "step": 1615 + }, + { + "epoch": 1.32, + "grad_norm": 1.9621850514310992, + "learning_rate": 1.2368087572345772e-06, + "loss": 0.348, + "step": 1616 + }, + { + "epoch": 1.32, + "grad_norm": 2.058589411316211, + "learning_rate": 1.233988168008751e-06, + "loss": 0.3679, + "step": 1617 + }, + { + "epoch": 1.32, + "grad_norm": 1.9516795286397743, + "learning_rate": 1.2311697444487431e-06, + "loss": 0.3635, + "step": 1618 + }, + { + "epoch": 1.32, + "grad_norm": 1.9233248775745249, + "learning_rate": 1.2283534913758066e-06, + "loss": 0.3957, + "step": 1619 + }, + { + "epoch": 1.33, + "grad_norm": 1.9303786560618386, + "learning_rate": 1.225539413607482e-06, + "loss": 0.3806, + "step": 1620 + }, + { + "epoch": 1.33, + "grad_norm": 2.030744520145863, + "learning_rate": 1.222727515957588e-06, + "loss": 0.4023, + "step": 1621 + }, + { + "epoch": 1.33, + "grad_norm": 1.9537051918570292, + "learning_rate": 1.2199178032362149e-06, + "loss": 0.3808, + "step": 1622 + }, + { + "epoch": 1.33, + "grad_norm": 1.8928085054817043, + "learning_rate": 1.2171102802497148e-06, + "loss": 0.3982, + "step": 1623 + }, + { + "epoch": 1.33, + "grad_norm": 2.0571192296380296, + "learning_rate": 1.2143049518006952e-06, + "loss": 0.4044, + "step": 1624 + }, + { + "epoch": 1.33, + "grad_norm": 1.856402590326006, + "learning_rate": 1.2115018226880063e-06, + "loss": 0.3977, + "step": 1625 + }, + { + "epoch": 1.33, + "grad_norm": 1.927548078890778, + "learning_rate": 1.208700897706739e-06, + "loss": 0.4048, + "step": 1626 + }, + { + "epoch": 1.33, + "grad_norm": 1.9400375481531664, + "learning_rate": 1.205902181648215e-06, + "loss": 0.3605, + "step": 1627 + }, + { + "epoch": 1.33, + "grad_norm": 1.873775466516257, + "learning_rate": 1.2031056792999726e-06, + "loss": 0.3375, + "step": 1628 + }, + { + "epoch": 1.33, + "grad_norm": 1.9913863168589552, + "learning_rate": 1.2003113954457673e-06, + "loss": 0.3964, + "step": 1629 + }, + { + "epoch": 1.33, + "grad_norm": 1.9685736172926571, + "learning_rate": 1.1975193348655584e-06, + "loss": 0.3587, + "step": 1630 + }, + { + "epoch": 1.33, + "grad_norm": 1.8698671252931964, + "learning_rate": 1.1947295023355022e-06, + "loss": 0.3568, + "step": 1631 + }, + { + "epoch": 1.34, + "grad_norm": 1.9615330930141146, + "learning_rate": 1.1919419026279434e-06, + "loss": 0.385, + "step": 1632 + }, + { + "epoch": 1.34, + "grad_norm": 1.8699401980633292, + "learning_rate": 1.189156540511407e-06, + "loss": 0.362, + "step": 1633 + }, + { + "epoch": 1.34, + "grad_norm": 2.054845402143213, + "learning_rate": 1.186373420750592e-06, + "loss": 0.3746, + "step": 1634 + }, + { + "epoch": 1.34, + "grad_norm": 1.828582523525085, + "learning_rate": 1.1835925481063575e-06, + "loss": 0.3915, + "step": 1635 + }, + { + "epoch": 1.34, + "grad_norm": 1.9369510226251998, + "learning_rate": 1.1808139273357232e-06, + "loss": 0.3736, + "step": 1636 + }, + { + "epoch": 1.34, + "grad_norm": 1.9623351823945685, + "learning_rate": 1.1780375631918544e-06, + "loss": 0.3861, + "step": 1637 + }, + { + "epoch": 1.34, + "grad_norm": 2.057951803903781, + "learning_rate": 1.1752634604240565e-06, + "loss": 0.3988, + "step": 1638 + }, + { + "epoch": 1.34, + "grad_norm": 1.926766218075179, + "learning_rate": 1.1724916237777675e-06, + "loss": 0.3526, + "step": 1639 + }, + { + "epoch": 1.34, + "grad_norm": 1.8312750701887877, + "learning_rate": 1.1697220579945466e-06, + "loss": 0.3518, + "step": 1640 + }, + { + "epoch": 1.34, + "grad_norm": 2.025004547929062, + "learning_rate": 1.1669547678120701e-06, + "loss": 0.3651, + "step": 1641 + }, + { + "epoch": 1.34, + "grad_norm": 2.3363123335351874, + "learning_rate": 1.1641897579641221e-06, + "loss": 0.4033, + "step": 1642 + }, + { + "epoch": 1.34, + "grad_norm": 1.8749245234784346, + "learning_rate": 1.1614270331805844e-06, + "loss": 0.3701, + "step": 1643 + }, + { + "epoch": 1.35, + "grad_norm": 1.8332454151429327, + "learning_rate": 1.1586665981874323e-06, + "loss": 0.3911, + "step": 1644 + }, + { + "epoch": 1.35, + "grad_norm": 2.217946505455479, + "learning_rate": 1.1559084577067206e-06, + "loss": 0.3346, + "step": 1645 + }, + { + "epoch": 1.35, + "grad_norm": 1.799776695931742, + "learning_rate": 1.1531526164565816e-06, + "loss": 0.3489, + "step": 1646 + }, + { + "epoch": 1.35, + "grad_norm": 1.9376934559686718, + "learning_rate": 1.150399079151214e-06, + "loss": 0.3721, + "step": 1647 + }, + { + "epoch": 1.35, + "grad_norm": 1.826040524283735, + "learning_rate": 1.1476478505008753e-06, + "loss": 0.3464, + "step": 1648 + }, + { + "epoch": 1.35, + "grad_norm": 1.9007570045973046, + "learning_rate": 1.144898935211874e-06, + "loss": 0.3859, + "step": 1649 + }, + { + "epoch": 1.35, + "grad_norm": 2.1474984005060334, + "learning_rate": 1.1421523379865603e-06, + "loss": 0.3456, + "step": 1650 + }, + { + "epoch": 1.35, + "grad_norm": 1.843189989485683, + "learning_rate": 1.1394080635233204e-06, + "loss": 0.3052, + "step": 1651 + }, + { + "epoch": 1.35, + "grad_norm": 2.009903889503656, + "learning_rate": 1.136666116516567e-06, + "loss": 0.4498, + "step": 1652 + }, + { + "epoch": 1.35, + "grad_norm": 3.0285468769549473, + "learning_rate": 1.1339265016567294e-06, + "loss": 0.3532, + "step": 1653 + }, + { + "epoch": 1.35, + "grad_norm": 1.8725192886740858, + "learning_rate": 1.1311892236302508e-06, + "loss": 0.3685, + "step": 1654 + }, + { + "epoch": 1.35, + "grad_norm": 1.8726862166869487, + "learning_rate": 1.128454287119573e-06, + "loss": 0.3761, + "step": 1655 + }, + { + "epoch": 1.36, + "grad_norm": 1.8883766624211467, + "learning_rate": 1.1257216968031357e-06, + "loss": 0.3574, + "step": 1656 + }, + { + "epoch": 1.36, + "grad_norm": 1.9004020165185547, + "learning_rate": 1.1229914573553641e-06, + "loss": 0.3638, + "step": 1657 + }, + { + "epoch": 1.36, + "grad_norm": 1.8723325311418417, + "learning_rate": 1.1202635734466612e-06, + "loss": 0.3468, + "step": 1658 + }, + { + "epoch": 1.36, + "grad_norm": 1.804021127084218, + "learning_rate": 1.1175380497434022e-06, + "loss": 0.3534, + "step": 1659 + }, + { + "epoch": 1.36, + "grad_norm": 1.8962566852248846, + "learning_rate": 1.1148148909079229e-06, + "loss": 0.3943, + "step": 1660 + }, + { + "epoch": 1.36, + "grad_norm": 1.9982200928541012, + "learning_rate": 1.1120941015985152e-06, + "loss": 0.4224, + "step": 1661 + }, + { + "epoch": 1.36, + "grad_norm": 1.8053179049263286, + "learning_rate": 1.109375686469417e-06, + "loss": 0.3389, + "step": 1662 + }, + { + "epoch": 1.36, + "grad_norm": 1.888467793597335, + "learning_rate": 1.106659650170805e-06, + "loss": 0.387, + "step": 1663 + }, + { + "epoch": 1.36, + "grad_norm": 1.8685159814187862, + "learning_rate": 1.1039459973487876e-06, + "loss": 0.3428, + "step": 1664 + }, + { + "epoch": 1.36, + "grad_norm": 1.972180811818148, + "learning_rate": 1.101234732645393e-06, + "loss": 0.3587, + "step": 1665 + }, + { + "epoch": 1.36, + "grad_norm": 2.252459557872569, + "learning_rate": 1.0985258606985683e-06, + "loss": 0.3684, + "step": 1666 + }, + { + "epoch": 1.36, + "grad_norm": 1.9679034729828595, + "learning_rate": 1.0958193861421634e-06, + "loss": 0.338, + "step": 1667 + }, + { + "epoch": 1.37, + "grad_norm": 1.9117564762331398, + "learning_rate": 1.0931153136059304e-06, + "loss": 0.4016, + "step": 1668 + }, + { + "epoch": 1.37, + "grad_norm": 1.915297031471561, + "learning_rate": 1.0904136477155112e-06, + "loss": 0.3629, + "step": 1669 + }, + { + "epoch": 1.37, + "grad_norm": 1.8376703588677337, + "learning_rate": 1.0877143930924306e-06, + "loss": 0.371, + "step": 1670 + }, + { + "epoch": 1.37, + "grad_norm": 1.9070422380758454, + "learning_rate": 1.085017554354089e-06, + "loss": 0.3533, + "step": 1671 + }, + { + "epoch": 1.37, + "grad_norm": 1.9752631861235486, + "learning_rate": 1.0823231361137543e-06, + "loss": 0.4164, + "step": 1672 + }, + { + "epoch": 1.37, + "grad_norm": 1.885197204563304, + "learning_rate": 1.0796311429805536e-06, + "loss": 0.3929, + "step": 1673 + }, + { + "epoch": 1.37, + "grad_norm": 1.9090106863841916, + "learning_rate": 1.0769415795594659e-06, + "loss": 0.3449, + "step": 1674 + }, + { + "epoch": 1.37, + "grad_norm": 2.022637519082336, + "learning_rate": 1.074254450451314e-06, + "loss": 0.3553, + "step": 1675 + }, + { + "epoch": 1.37, + "grad_norm": 1.942217527277708, + "learning_rate": 1.0715697602527542e-06, + "loss": 0.3936, + "step": 1676 + }, + { + "epoch": 1.37, + "grad_norm": 1.8809306152215932, + "learning_rate": 1.0688875135562738e-06, + "loss": 0.3481, + "step": 1677 + }, + { + "epoch": 1.37, + "grad_norm": 2.0969194462234513, + "learning_rate": 1.0662077149501798e-06, + "loss": 0.3864, + "step": 1678 + }, + { + "epoch": 1.37, + "grad_norm": 1.8365124296835973, + "learning_rate": 1.0635303690185894e-06, + "loss": 0.3778, + "step": 1679 + }, + { + "epoch": 1.38, + "grad_norm": 1.9221630207347382, + "learning_rate": 1.0608554803414256e-06, + "loss": 0.3443, + "step": 1680 + }, + { + "epoch": 1.38, + "grad_norm": 1.9319799829762891, + "learning_rate": 1.0581830534944084e-06, + "loss": 0.3759, + "step": 1681 + }, + { + "epoch": 1.38, + "grad_norm": 2.00532761754314, + "learning_rate": 1.055513093049046e-06, + "loss": 0.373, + "step": 1682 + }, + { + "epoch": 1.38, + "grad_norm": 1.8361577324130107, + "learning_rate": 1.052845603572627e-06, + "loss": 0.3671, + "step": 1683 + }, + { + "epoch": 1.38, + "grad_norm": 1.9246365496147386, + "learning_rate": 1.0501805896282144e-06, + "loss": 0.3888, + "step": 1684 + }, + { + "epoch": 1.38, + "grad_norm": 1.933677406014513, + "learning_rate": 1.047518055774636e-06, + "loss": 0.428, + "step": 1685 + }, + { + "epoch": 1.38, + "grad_norm": 1.8497481971894003, + "learning_rate": 1.0448580065664754e-06, + "loss": 0.339, + "step": 1686 + }, + { + "epoch": 1.38, + "grad_norm": 1.9674163310656592, + "learning_rate": 1.042200446554068e-06, + "loss": 0.3933, + "step": 1687 + }, + { + "epoch": 1.38, + "grad_norm": 1.8703345670634528, + "learning_rate": 1.039545380283491e-06, + "loss": 0.3805, + "step": 1688 + }, + { + "epoch": 1.38, + "grad_norm": 1.8996794102359933, + "learning_rate": 1.0368928122965547e-06, + "loss": 0.3612, + "step": 1689 + }, + { + "epoch": 1.38, + "grad_norm": 1.8163372630466865, + "learning_rate": 1.0342427471307973e-06, + "loss": 0.3631, + "step": 1690 + }, + { + "epoch": 1.38, + "grad_norm": 1.8990581755942872, + "learning_rate": 1.031595189319473e-06, + "loss": 0.4539, + "step": 1691 + }, + { + "epoch": 1.39, + "grad_norm": 1.9101558963616596, + "learning_rate": 1.0289501433915493e-06, + "loss": 0.4649, + "step": 1692 + }, + { + "epoch": 1.39, + "grad_norm": 1.8873611659348446, + "learning_rate": 1.0263076138716962e-06, + "loss": 0.3649, + "step": 1693 + }, + { + "epoch": 1.39, + "grad_norm": 1.823482013352725, + "learning_rate": 1.0236676052802791e-06, + "loss": 0.3648, + "step": 1694 + }, + { + "epoch": 1.39, + "grad_norm": 1.8931382792204232, + "learning_rate": 1.0210301221333512e-06, + "loss": 0.3589, + "step": 1695 + }, + { + "epoch": 1.39, + "grad_norm": 2.0713580311911355, + "learning_rate": 1.0183951689426438e-06, + "loss": 0.3474, + "step": 1696 + }, + { + "epoch": 1.39, + "grad_norm": 1.8607620741027457, + "learning_rate": 1.0157627502155632e-06, + "loss": 0.3773, + "step": 1697 + }, + { + "epoch": 1.39, + "grad_norm": 1.8645944548746636, + "learning_rate": 1.0131328704551782e-06, + "loss": 0.3457, + "step": 1698 + }, + { + "epoch": 1.39, + "grad_norm": 1.852711235772826, + "learning_rate": 1.0105055341602153e-06, + "loss": 0.3559, + "step": 1699 + }, + { + "epoch": 1.39, + "grad_norm": 1.969084245230365, + "learning_rate": 1.00788074582505e-06, + "loss": 0.3786, + "step": 1700 + }, + { + "epoch": 1.39, + "grad_norm": 1.939185809703108, + "learning_rate": 1.005258509939699e-06, + "loss": 0.3649, + "step": 1701 + }, + { + "epoch": 1.39, + "grad_norm": 1.9104875321871906, + "learning_rate": 1.0026388309898132e-06, + "loss": 0.388, + "step": 1702 + }, + { + "epoch": 1.39, + "grad_norm": 2.161662535348609, + "learning_rate": 1.0000217134566694e-06, + "loss": 0.3692, + "step": 1703 + }, + { + "epoch": 1.4, + "grad_norm": 1.8024704245485432, + "learning_rate": 9.974071618171613e-07, + "loss": 0.3751, + "step": 1704 + }, + { + "epoch": 1.4, + "grad_norm": 1.7739470701867779, + "learning_rate": 9.94795180543796e-07, + "loss": 0.3373, + "step": 1705 + }, + { + "epoch": 1.4, + "grad_norm": 1.8188576734630457, + "learning_rate": 9.921857741046806e-07, + "loss": 0.3945, + "step": 1706 + }, + { + "epoch": 1.4, + "grad_norm": 1.9054961265186567, + "learning_rate": 9.895789469635204e-07, + "loss": 0.3518, + "step": 1707 + }, + { + "epoch": 1.4, + "grad_norm": 1.8782724635395873, + "learning_rate": 9.869747035796071e-07, + "loss": 0.37, + "step": 1708 + }, + { + "epoch": 1.4, + "grad_norm": 1.838615529167183, + "learning_rate": 9.843730484078128e-07, + "loss": 0.3376, + "step": 1709 + }, + { + "epoch": 1.4, + "grad_norm": 1.785535753238471, + "learning_rate": 9.817739858985828e-07, + "loss": 0.337, + "step": 1710 + }, + { + "epoch": 1.4, + "grad_norm": 1.8535882977550358, + "learning_rate": 9.791775204979263e-07, + "loss": 0.3391, + "step": 1711 + }, + { + "epoch": 1.4, + "grad_norm": 1.882614515071742, + "learning_rate": 9.765836566474105e-07, + "loss": 0.391, + "step": 1712 + }, + { + "epoch": 1.4, + "grad_norm": 1.8285960302994975, + "learning_rate": 9.739923987841518e-07, + "loss": 0.356, + "step": 1713 + }, + { + "epoch": 1.4, + "grad_norm": 1.8054856197120326, + "learning_rate": 9.714037513408093e-07, + "loss": 0.3623, + "step": 1714 + }, + { + "epoch": 1.4, + "grad_norm": 1.8671208649893825, + "learning_rate": 9.68817718745577e-07, + "loss": 0.3693, + "step": 1715 + }, + { + "epoch": 1.4, + "grad_norm": 1.9004503058230886, + "learning_rate": 9.662343054221743e-07, + "loss": 0.3327, + "step": 1716 + }, + { + "epoch": 1.41, + "grad_norm": 1.9148999919712566, + "learning_rate": 9.636535157898422e-07, + "loss": 0.3618, + "step": 1717 + }, + { + "epoch": 1.41, + "grad_norm": 1.8635582232372712, + "learning_rate": 9.610753542633309e-07, + "loss": 0.3884, + "step": 1718 + }, + { + "epoch": 1.41, + "grad_norm": 1.9383472683274976, + "learning_rate": 9.58499825252897e-07, + "loss": 0.3953, + "step": 1719 + }, + { + "epoch": 1.41, + "grad_norm": 1.946035726357351, + "learning_rate": 9.559269331642937e-07, + "loss": 0.3292, + "step": 1720 + }, + { + "epoch": 1.41, + "grad_norm": 1.8700217872447233, + "learning_rate": 9.533566823987628e-07, + "loss": 0.361, + "step": 1721 + }, + { + "epoch": 1.41, + "grad_norm": 1.8900223904453795, + "learning_rate": 9.507890773530276e-07, + "loss": 0.3349, + "step": 1722 + }, + { + "epoch": 1.41, + "grad_norm": 1.9125828500996216, + "learning_rate": 9.482241224192867e-07, + "loss": 0.3641, + "step": 1723 + }, + { + "epoch": 1.41, + "grad_norm": 1.940533327906808, + "learning_rate": 9.456618219852042e-07, + "loss": 0.4036, + "step": 1724 + }, + { + "epoch": 1.41, + "grad_norm": 2.0712298544333687, + "learning_rate": 9.431021804339047e-07, + "loss": 0.3934, + "step": 1725 + }, + { + "epoch": 1.41, + "grad_norm": 1.8791027421557622, + "learning_rate": 9.40545202143962e-07, + "loss": 0.3507, + "step": 1726 + }, + { + "epoch": 1.41, + "grad_norm": 1.9686923479849525, + "learning_rate": 9.379908914893962e-07, + "loss": 0.3497, + "step": 1727 + }, + { + "epoch": 1.41, + "grad_norm": 2.0437193308441253, + "learning_rate": 9.354392528396638e-07, + "loss": 0.395, + "step": 1728 + }, + { + "epoch": 1.42, + "grad_norm": 1.864988214025856, + "learning_rate": 9.328902905596512e-07, + "loss": 0.379, + "step": 1729 + }, + { + "epoch": 1.42, + "grad_norm": 1.963062444850751, + "learning_rate": 9.303440090096633e-07, + "loss": 0.3565, + "step": 1730 + }, + { + "epoch": 1.42, + "grad_norm": 1.9399170798660286, + "learning_rate": 9.278004125454232e-07, + "loss": 0.415, + "step": 1731 + }, + { + "epoch": 1.42, + "grad_norm": 1.874726297624515, + "learning_rate": 9.252595055180585e-07, + "loss": 0.3606, + "step": 1732 + }, + { + "epoch": 1.42, + "grad_norm": 1.834934005776965, + "learning_rate": 9.227212922740971e-07, + "loss": 0.4104, + "step": 1733 + }, + { + "epoch": 1.42, + "grad_norm": 1.8726418919835732, + "learning_rate": 9.20185777155459e-07, + "loss": 0.3325, + "step": 1734 + }, + { + "epoch": 1.42, + "grad_norm": 1.9432074923657174, + "learning_rate": 9.176529644994481e-07, + "loss": 0.3663, + "step": 1735 + }, + { + "epoch": 1.42, + "grad_norm": 1.7937207452405413, + "learning_rate": 9.151228586387464e-07, + "loss": 0.3225, + "step": 1736 + }, + { + "epoch": 1.42, + "grad_norm": 1.911607719176901, + "learning_rate": 9.125954639014037e-07, + "loss": 0.3491, + "step": 1737 + }, + { + "epoch": 1.42, + "grad_norm": 1.8954594851178048, + "learning_rate": 9.100707846108337e-07, + "loss": 0.3474, + "step": 1738 + }, + { + "epoch": 1.42, + "grad_norm": 1.9081066235083353, + "learning_rate": 9.075488250858047e-07, + "loss": 0.3654, + "step": 1739 + }, + { + "epoch": 1.42, + "grad_norm": 1.9384836973235149, + "learning_rate": 9.050295896404326e-07, + "loss": 0.3519, + "step": 1740 + }, + { + "epoch": 1.43, + "grad_norm": 1.9655302768136176, + "learning_rate": 9.02513082584173e-07, + "loss": 0.3482, + "step": 1741 + }, + { + "epoch": 1.43, + "grad_norm": 1.900218584161994, + "learning_rate": 8.999993082218156e-07, + "loss": 0.3576, + "step": 1742 + }, + { + "epoch": 1.43, + "grad_norm": 2.030742409886431, + "learning_rate": 8.974882708534724e-07, + "loss": 0.3055, + "step": 1743 + }, + { + "epoch": 1.43, + "grad_norm": 1.865959678567607, + "learning_rate": 8.949799747745766e-07, + "loss": 0.3485, + "step": 1744 + }, + { + "epoch": 1.43, + "grad_norm": 1.8300998571759115, + "learning_rate": 8.924744242758707e-07, + "loss": 0.3412, + "step": 1745 + }, + { + "epoch": 1.43, + "grad_norm": 2.3841641123937514, + "learning_rate": 8.899716236434019e-07, + "loss": 0.3484, + "step": 1746 + }, + { + "epoch": 1.43, + "grad_norm": 1.844271076789803, + "learning_rate": 8.874715771585105e-07, + "loss": 0.3762, + "step": 1747 + }, + { + "epoch": 1.43, + "grad_norm": 1.8687696131042617, + "learning_rate": 8.84974289097828e-07, + "loss": 0.402, + "step": 1748 + }, + { + "epoch": 1.43, + "grad_norm": 1.889973499535232, + "learning_rate": 8.824797637332669e-07, + "loss": 0.3566, + "step": 1749 + }, + { + "epoch": 1.43, + "grad_norm": 1.8681107208205963, + "learning_rate": 8.799880053320131e-07, + "loss": 0.4057, + "step": 1750 + }, + { + "epoch": 1.43, + "grad_norm": 1.8928327876139377, + "learning_rate": 8.774990181565201e-07, + "loss": 0.3784, + "step": 1751 + }, + { + "epoch": 1.43, + "grad_norm": 1.931089236577729, + "learning_rate": 8.750128064645002e-07, + "loss": 0.4008, + "step": 1752 + }, + { + "epoch": 1.44, + "grad_norm": 1.9573581859995763, + "learning_rate": 8.725293745089181e-07, + "loss": 0.3486, + "step": 1753 + }, + { + "epoch": 1.44, + "grad_norm": 1.9164746693234396, + "learning_rate": 8.700487265379845e-07, + "loss": 0.3634, + "step": 1754 + }, + { + "epoch": 1.44, + "grad_norm": 1.812159570787973, + "learning_rate": 8.675708667951446e-07, + "loss": 0.3476, + "step": 1755 + }, + { + "epoch": 1.44, + "grad_norm": 2.0355096473340146, + "learning_rate": 8.650957995190784e-07, + "loss": 0.3562, + "step": 1756 + }, + { + "epoch": 1.44, + "grad_norm": 1.8995538618272807, + "learning_rate": 8.626235289436846e-07, + "loss": 0.3767, + "step": 1757 + }, + { + "epoch": 1.44, + "grad_norm": 1.8751894629115184, + "learning_rate": 8.601540592980812e-07, + "loss": 0.3709, + "step": 1758 + }, + { + "epoch": 1.44, + "grad_norm": 1.8772906072081945, + "learning_rate": 8.576873948065931e-07, + "loss": 0.3692, + "step": 1759 + }, + { + "epoch": 1.44, + "grad_norm": 1.855725719743314, + "learning_rate": 8.552235396887479e-07, + "loss": 0.3461, + "step": 1760 + }, + { + "epoch": 1.44, + "grad_norm": 1.9058932387569096, + "learning_rate": 8.52762498159266e-07, + "loss": 0.4035, + "step": 1761 + }, + { + "epoch": 1.44, + "grad_norm": 1.8155999399280405, + "learning_rate": 8.503042744280565e-07, + "loss": 0.3821, + "step": 1762 + }, + { + "epoch": 1.44, + "grad_norm": 1.9191184065214926, + "learning_rate": 8.478488727002062e-07, + "loss": 0.4182, + "step": 1763 + }, + { + "epoch": 1.44, + "grad_norm": 1.8660511914055784, + "learning_rate": 8.453962971759766e-07, + "loss": 0.3936, + "step": 1764 + }, + { + "epoch": 1.45, + "grad_norm": 1.8559359079620885, + "learning_rate": 8.429465520507932e-07, + "loss": 0.3555, + "step": 1765 + }, + { + "epoch": 1.45, + "grad_norm": 1.871625930259135, + "learning_rate": 8.404996415152414e-07, + "loss": 0.3336, + "step": 1766 + }, + { + "epoch": 1.45, + "grad_norm": 1.9146405985810966, + "learning_rate": 8.38055569755055e-07, + "loss": 0.3595, + "step": 1767 + }, + { + "epoch": 1.45, + "grad_norm": 1.8172916285896499, + "learning_rate": 8.356143409511145e-07, + "loss": 0.3763, + "step": 1768 + }, + { + "epoch": 1.45, + "grad_norm": 1.9045338434685268, + "learning_rate": 8.331759592794344e-07, + "loss": 0.3454, + "step": 1769 + }, + { + "epoch": 1.45, + "grad_norm": 1.9019450574908656, + "learning_rate": 8.307404289111618e-07, + "loss": 0.3782, + "step": 1770 + }, + { + "epoch": 1.45, + "grad_norm": 1.8040956687408418, + "learning_rate": 8.283077540125642e-07, + "loss": 0.3397, + "step": 1771 + }, + { + "epoch": 1.45, + "grad_norm": 1.8854623689371994, + "learning_rate": 8.258779387450258e-07, + "loss": 0.3632, + "step": 1772 + }, + { + "epoch": 1.45, + "grad_norm": 1.8703628366355571, + "learning_rate": 8.234509872650381e-07, + "loss": 0.3796, + "step": 1773 + }, + { + "epoch": 1.45, + "grad_norm": 1.8974382562927672, + "learning_rate": 8.210269037241945e-07, + "loss": 0.3577, + "step": 1774 + }, + { + "epoch": 1.45, + "grad_norm": 1.8041564148309792, + "learning_rate": 8.186056922691816e-07, + "loss": 0.3423, + "step": 1775 + }, + { + "epoch": 1.45, + "grad_norm": 1.8871513088592733, + "learning_rate": 8.161873570417742e-07, + "loss": 0.3724, + "step": 1776 + }, + { + "epoch": 1.46, + "grad_norm": 1.7959090299202567, + "learning_rate": 8.137719021788248e-07, + "loss": 0.3514, + "step": 1777 + }, + { + "epoch": 1.46, + "grad_norm": 1.77414937614363, + "learning_rate": 8.113593318122609e-07, + "loss": 0.3655, + "step": 1778 + }, + { + "epoch": 1.46, + "grad_norm": 1.8415138040355723, + "learning_rate": 8.089496500690747e-07, + "loss": 0.3469, + "step": 1779 + }, + { + "epoch": 1.46, + "grad_norm": 1.943916626029921, + "learning_rate": 8.06542861071318e-07, + "loss": 0.3626, + "step": 1780 + }, + { + "epoch": 1.46, + "grad_norm": 1.9699325195709307, + "learning_rate": 8.041389689360921e-07, + "loss": 0.3897, + "step": 1781 + }, + { + "epoch": 1.46, + "grad_norm": 1.8300758832916175, + "learning_rate": 8.01737977775545e-07, + "loss": 0.3528, + "step": 1782 + }, + { + "epoch": 1.46, + "grad_norm": 1.8854405268423242, + "learning_rate": 7.993398916968609e-07, + "loss": 0.3458, + "step": 1783 + }, + { + "epoch": 1.46, + "grad_norm": 1.8610707367327934, + "learning_rate": 7.969447148022555e-07, + "loss": 0.3825, + "step": 1784 + }, + { + "epoch": 1.46, + "grad_norm": 1.8761158349166456, + "learning_rate": 7.945524511889676e-07, + "loss": 0.361, + "step": 1785 + }, + { + "epoch": 1.46, + "grad_norm": 1.8316905966902863, + "learning_rate": 7.921631049492526e-07, + "loss": 0.3791, + "step": 1786 + }, + { + "epoch": 1.46, + "grad_norm": 1.8815617462853849, + "learning_rate": 7.897766801703754e-07, + "loss": 0.3334, + "step": 1787 + }, + { + "epoch": 1.46, + "grad_norm": 1.8069850793814037, + "learning_rate": 7.873931809346022e-07, + "loss": 0.3063, + "step": 1788 + }, + { + "epoch": 1.47, + "grad_norm": 1.877897596569181, + "learning_rate": 7.850126113191961e-07, + "loss": 0.3551, + "step": 1789 + }, + { + "epoch": 1.47, + "grad_norm": 1.933100704380605, + "learning_rate": 7.826349753964083e-07, + "loss": 0.4, + "step": 1790 + }, + { + "epoch": 1.47, + "grad_norm": 1.8588317568608963, + "learning_rate": 7.802602772334719e-07, + "loss": 0.3695, + "step": 1791 + }, + { + "epoch": 1.47, + "grad_norm": 1.75903586927703, + "learning_rate": 7.778885208925943e-07, + "loss": 0.3334, + "step": 1792 + }, + { + "epoch": 1.47, + "grad_norm": 1.847597726088611, + "learning_rate": 7.755197104309512e-07, + "loss": 0.3508, + "step": 1793 + }, + { + "epoch": 1.47, + "grad_norm": 1.8730373365521515, + "learning_rate": 7.731538499006767e-07, + "loss": 0.3727, + "step": 1794 + }, + { + "epoch": 1.47, + "grad_norm": 1.8696875894594878, + "learning_rate": 7.707909433488611e-07, + "loss": 0.3694, + "step": 1795 + }, + { + "epoch": 1.47, + "grad_norm": 1.8224097896476315, + "learning_rate": 7.684309948175414e-07, + "loss": 0.3682, + "step": 1796 + }, + { + "epoch": 1.47, + "grad_norm": 1.8896591788553188, + "learning_rate": 7.660740083436943e-07, + "loss": 0.353, + "step": 1797 + }, + { + "epoch": 1.47, + "grad_norm": 1.8622597363460462, + "learning_rate": 7.637199879592275e-07, + "loss": 0.3835, + "step": 1798 + }, + { + "epoch": 1.47, + "grad_norm": 1.8261440807434144, + "learning_rate": 7.61368937690978e-07, + "loss": 0.3673, + "step": 1799 + }, + { + "epoch": 1.47, + "grad_norm": 1.86324753247062, + "learning_rate": 7.590208615607001e-07, + "loss": 0.3613, + "step": 1800 + }, + { + "epoch": 1.48, + "grad_norm": 1.8704051001710107, + "learning_rate": 7.566757635850608e-07, + "loss": 0.3756, + "step": 1801 + }, + { + "epoch": 1.48, + "grad_norm": 1.8547689419526656, + "learning_rate": 7.543336477756336e-07, + "loss": 0.3557, + "step": 1802 + }, + { + "epoch": 1.48, + "grad_norm": 1.8970591656145008, + "learning_rate": 7.519945181388893e-07, + "loss": 0.3713, + "step": 1803 + }, + { + "epoch": 1.48, + "grad_norm": 2.034710049647413, + "learning_rate": 7.496583786761911e-07, + "loss": 0.379, + "step": 1804 + }, + { + "epoch": 1.48, + "grad_norm": 1.7207339510591724, + "learning_rate": 7.47325233383788e-07, + "loss": 0.324, + "step": 1805 + }, + { + "epoch": 1.48, + "grad_norm": 1.8353430031672993, + "learning_rate": 7.449950862528046e-07, + "loss": 0.3688, + "step": 1806 + }, + { + "epoch": 1.48, + "grad_norm": 1.8248952138910253, + "learning_rate": 7.426679412692403e-07, + "loss": 0.3744, + "step": 1807 + }, + { + "epoch": 1.48, + "grad_norm": 1.8581710166024752, + "learning_rate": 7.403438024139547e-07, + "loss": 0.3591, + "step": 1808 + }, + { + "epoch": 1.48, + "grad_norm": 1.972956887111899, + "learning_rate": 7.380226736626692e-07, + "loss": 0.3786, + "step": 1809 + } + ], + "logging_steps": 1, + "max_steps": 2412, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 603, + "total_flos": 852109767475200.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1809/training_args.bin b/checkpoint-1809/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e --- /dev/null +++ b/checkpoint-1809/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26 +size 7352 diff --git a/checkpoint-1809/zero_to_fp32.py b/checkpoint-1809/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b --- /dev/null +++ b/checkpoint-1809/zero_to_fp32.py @@ -0,0 +1,592 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-2412/config.json b/checkpoint-2412/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76 --- /dev/null +++ b/checkpoint-2412/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "alpindale/Mistral-7B-v0.2-hf", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": false, + "vocab_size": 32002 +} diff --git a/checkpoint-2412/generation_config.json b/checkpoint-2412/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc --- /dev/null +++ b/checkpoint-2412/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "transformers_version": "4.38.2" +} diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a59ff7a72f9df7a68e7248c7399fd213b080dd5 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e111d64b9ed3b4d12e7789f3192543373535aa92ee6c263ea786d7b23522bd49 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ac560f6e5321dbce193cc92e13b9baff4470062 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b4141a3fa6d69f16e220f7b8c5d2cd3082a9523abf78acf9b27bbb7bb2d886 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d193120ef6ed6b5d0eb87e15df87b771e9fb65d --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3501f7efc9a0a2bfa63c0615155be565644fd9d8177e0aa748d17d19df9987f9 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..51d8ee6f63f7ab6751b97b92bbcb81de4edd4d42 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f42a454bc0043a5a21051c0a6cde9be4b129e43271934efe035e421ce7048855 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e208605d73864af1217ae03dd78c332a6a8a26f --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e5af8493f413af7dd5769c90efd786dacb4e6288379692e0e4d8fa8e6069071 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9d41711f76d39f04d8abe041dbc972b32c61ad89 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf6269cf2020dcfcff7333f3d4507e1a924ed80000200f90172b97709be09ed +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f36e24043786da5b21dcf8c77bffd6c143ccbe35 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39d766462269221e2711490a61a779a5b0353d922145a3bad0b53fd31f240a25 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..427c21d64831fcf5abddb0c3b950ea9bef3ce771 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e646565376c91ee882145d965b468d65d730d2d11fef7b4979f6d8b7c66ad29 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..116880a693a1dd676d5291ea62f95e501d65db88 --- /dev/null +++ b/checkpoint-2412/global_step2412/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa1ee0975fb89f5419c41d5bc2e2f62fef1b141132a7125b7a92896641cbb001 +size 4831623435 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd9d3734b8410e9aca4de6be1cfe056321e2fe62 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5721478ea543d8dd78b059def5d48ba9b5a08820b4d524c8bd30ac284e466c4 +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..774949779d756d7ac6644f0438b01207093a5bba --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2004ecf30872828841b44b9cad984d2fb4049e40e90d86f1cdf34c0c8af4cf28 +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e856e3b7c3db51d54629b44a65db43e135baeb42 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92568d2f2270b067395a7744b0e8e59530291155adc571485e510e5881453efb +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c80d18171a71b37e6e8c1632689efe5bd4c4df7 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2cc635532d6fce6a79b4935c5c637cd3a7e22a4a09353821285e20040086b76 +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9297703656fc3df2fb912c3e8364b40f7acb2150 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6937465a4c2af5b1415fd2afcb341257b94593dd351e04d85a15e207912b72b +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ecfe45e1e980d872619605aa7ba4da1edc26509 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a64004407be05cca8398dc72883f365b0947dc9dd2a983e6d943edc998c9e04 +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9553a8d1a362495b7bd575fc191edad0bac73ab2 --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8474ab6e0891601b6f95e61afdd6dcdf1085b7df706655f714825bee2ab72a4b +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdefca753bb52a1a131c05be59fb545b9ed03f2b --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e295c5022a8e8800aa98a2b9745c62d4648292fe4ea02bc5f154090bea6adc38 +size 153829 diff --git a/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b382b7eb0495e302ef9241b3f6e7c519bad3df6e --- /dev/null +++ b/checkpoint-2412/global_step2412/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39fde8a1dcf3d0db192cd199515ea53b7cf0de2864570f8b3e1c599c3b94ed9e +size 153829 diff --git a/checkpoint-2412/latest b/checkpoint-2412/latest new file mode 100644 index 0000000000000000000000000000000000000000..75087eef97c8712d556b81f66e003de493e93c96 --- /dev/null +++ b/checkpoint-2412/latest @@ -0,0 +1 @@ +global_step2412 \ No newline at end of file diff --git a/checkpoint-2412/model-00001-of-00003.safetensors b/checkpoint-2412/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f1c522c741bc956a541d5544734d12ff3a71b33 --- /dev/null +++ b/checkpoint-2412/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89fd0fface188ca3f7988aa53f25e087292d72ca99cd52ef8cb52cf180ad2ff +size 4943178720 diff --git a/checkpoint-2412/model-00002-of-00003.safetensors b/checkpoint-2412/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a1c7f2c1a284a17e9b7a9124040ee4bb6680b67 --- /dev/null +++ b/checkpoint-2412/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49dd97160e0a8ff75303f02969df38307407c8800ce94aaa86611ceb6727bca0 +size 4999819336 diff --git a/checkpoint-2412/model-00003-of-00003.safetensors b/checkpoint-2412/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f8cc928e41a10674f627e9a238420111f974bb7 --- /dev/null +++ b/checkpoint-2412/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03098a839ef612f1efe325b376aa90bc8311a01c1236120d9ca7934eb9b12fed +size 4540532728 diff --git a/checkpoint-2412/model.safetensors.index.json b/checkpoint-2412/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2 --- /dev/null +++ b/checkpoint-2412/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 14483496960 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/checkpoint-2412/rng_state_0.pth b/checkpoint-2412/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed9c956014a637b9d3ccb494c387c7452ae938e0 --- /dev/null +++ b/checkpoint-2412/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b7907b6e8bbc0deaf9b6cadef63205dade64f9fbf74f9a4dca9c34792d7aab +size 16240 diff --git a/checkpoint-2412/rng_state_1.pth b/checkpoint-2412/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a2452cb1ac950d724f0559bab3e53e6a671da5ba --- /dev/null +++ b/checkpoint-2412/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4ca3302c930a1b49ced40d5e2133aedc4c5857930d92deb8c6496a317958d8 +size 16240 diff --git a/checkpoint-2412/rng_state_2.pth b/checkpoint-2412/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..30ca1e0fbf8047c1cd0606a37b02d545623d4a67 --- /dev/null +++ b/checkpoint-2412/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbbf2364108e70a0ac183356d1693182b452bb464271c3d2f4ade972244d710d +size 16240 diff --git a/checkpoint-2412/rng_state_3.pth b/checkpoint-2412/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a342cc40db30db7d18c31cffe2a2e1b1d2f3b084 --- /dev/null +++ b/checkpoint-2412/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9269c171a7948127faa588109a1fb8043194b407d2dfbeda2e25ed8b35126a5 +size 16240 diff --git a/checkpoint-2412/rng_state_4.pth b/checkpoint-2412/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..ca08e0f4a907b0b1649b7bc3537dd48c83723830 --- /dev/null +++ b/checkpoint-2412/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f02625e4547fbacdb164e484867f76d5024a007c22c297f8ecbef13fc6aa3202 +size 16240 diff --git a/checkpoint-2412/rng_state_5.pth b/checkpoint-2412/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..1aeba77fabdef8a232c2785991d798bd3f84afd3 --- /dev/null +++ b/checkpoint-2412/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51eb0286c1f14a2c09c443d8c606951c3debeb25f9ba4f71e0aea90ae2f0786e +size 16240 diff --git a/checkpoint-2412/rng_state_6.pth b/checkpoint-2412/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..499c459dc2af4317a2a23f7877927bf7c586e439 --- /dev/null +++ b/checkpoint-2412/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:080bbd36834b7a1623430efdd9f598b791f466541d25b545ca410ec4a930a0f3 +size 16240 diff --git a/checkpoint-2412/rng_state_7.pth b/checkpoint-2412/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..cdfb9b9f9f3356413f6755deb29a84b7b4e360a2 --- /dev/null +++ b/checkpoint-2412/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54aa959bf290908dfe1fc65c2591b99982e9fdce5caf276626d0084ccffa7e95 +size 16240 diff --git a/checkpoint-2412/rng_state_8.pth b/checkpoint-2412/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..6533db02002842edcb0c9b2a6dd89506e90ac8c8 --- /dev/null +++ b/checkpoint-2412/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f8554f99e72a1c251b463a30088dd49afece6deb61c5ad09834d35ff89308b +size 16240 diff --git a/checkpoint-2412/scheduler.pt b/checkpoint-2412/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb24d02e289560291fce88a5d78a2810c68f08f6 --- /dev/null +++ b/checkpoint-2412/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2380d2748a4c48cacb4bc12df77e29fd92e9aef87c62d8b17fbf348a1afa8525 +size 1064 diff --git a/checkpoint-2412/trainer_state.json b/checkpoint-2412/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a8a3040b58cb80dcff351c71d92691ff1ac9c20 --- /dev/null +++ b/checkpoint-2412/trainer_state.json @@ -0,0 +1,16905 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9819689119170985, + "eval_steps": 500, + "global_step": 2412, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 27.81778461909011, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7993, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 28.63833175363421, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9056, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 25.646828828014854, + "learning_rate": 1.5e-06, + "loss": 0.8473, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.834124771941388, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8192, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 10.558095859980105, + "learning_rate": 2.5e-06, + "loss": 0.7943, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.905789045775758, + "learning_rate": 3e-06, + "loss": 0.7075, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.259519170268483, + "learning_rate": 3.5e-06, + "loss": 0.7537, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 6.639042051048664, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7471, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 8.515070932390074, + "learning_rate": 4.5e-06, + "loss": 0.7689, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 8.916410424632533, + "learning_rate": 5e-06, + "loss": 0.7194, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 4.835046497413255, + "learning_rate": 4.9999978617243506e-06, + "loss": 0.6949, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 10.065648500649479, + "learning_rate": 4.9999914469010585e-06, + "loss": 0.7039, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 5.299372887839679, + "learning_rate": 4.999980755541098e-06, + "loss": 0.7067, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 5.693110837094718, + "learning_rate": 4.999965787662758e-06, + "loss": 0.7126, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 2.983869635716314, + "learning_rate": 4.999946543291642e-06, + "loss": 0.6496, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 4.2561193962441175, + "learning_rate": 4.999923022460671e-06, + "loss": 0.7036, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 3.011772824968437, + "learning_rate": 4.999895225210079e-06, + "loss": 0.7009, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 3.386638415717137, + "learning_rate": 4.9998631515874165e-06, + "loss": 0.6624, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 3.764658092125165, + "learning_rate": 4.999826801647551e-06, + "loss": 0.6687, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 2.3982096117966614, + "learning_rate": 4.999786175452662e-06, + "loss": 0.706, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 2.8051633678260193, + "learning_rate": 4.999741273072246e-06, + "loss": 0.7031, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 3.1177784624332614, + "learning_rate": 4.999692094583114e-06, + "loss": 0.7525, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 2.2533819675617806, + "learning_rate": 4.9996386400693906e-06, + "loss": 0.6767, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 2.61893793162573, + "learning_rate": 4.999580909622518e-06, + "loss": 0.6432, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 2.76057623723569, + "learning_rate": 4.999518903341251e-06, + "loss": 0.6809, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 2.27983032069553, + "learning_rate": 4.999452621331657e-06, + "loss": 0.6798, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 2.501904568120582, + "learning_rate": 4.99938206370712e-06, + "loss": 0.6412, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 2.819229290729669, + "learning_rate": 4.999307230588338e-06, + "loss": 0.6188, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 2.1233212322022212, + "learning_rate": 4.9992281221033224e-06, + "loss": 0.6378, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 2.7806911906686755, + "learning_rate": 4.999144738387396e-06, + "loss": 0.6653, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 2.4045490257014563, + "learning_rate": 4.999057079583199e-06, + "loss": 0.6377, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 2.3803717769210446, + "learning_rate": 4.998965145840681e-06, + "loss": 0.6855, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 2.3976652879633473, + "learning_rate": 4.998868937317106e-06, + "loss": 0.6284, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 2.2958541157119727, + "learning_rate": 4.998768454177051e-06, + "loss": 0.6521, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 2.1925196833696154, + "learning_rate": 4.998663696592403e-06, + "loss": 0.6619, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 2.361006042901851, + "learning_rate": 4.998554664742362e-06, + "loss": 0.6155, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 2.1577758143653614, + "learning_rate": 4.998441358813443e-06, + "loss": 0.6398, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 2.219872074512664, + "learning_rate": 4.998323778999467e-06, + "loss": 0.6051, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 2.2907501521408546, + "learning_rate": 4.9982019255015705e-06, + "loss": 0.6337, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 2.1769862324666183, + "learning_rate": 4.9980757985281955e-06, + "loss": 0.6606, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 2.4252479779661607, + "learning_rate": 4.997945398295101e-06, + "loss": 0.6685, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 2.3929541982084657, + "learning_rate": 4.99781072502535e-06, + "loss": 0.6084, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 1.932539969840091, + "learning_rate": 4.997671778949318e-06, + "loss": 0.6123, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 2.191742541327873, + "learning_rate": 4.997528560304688e-06, + "loss": 0.6247, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 2.423376784566499, + "learning_rate": 4.997381069336455e-06, + "loss": 0.7024, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 2.0599055392481076, + "learning_rate": 4.997229306296918e-06, + "loss": 0.6612, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 2.16832922087532, + "learning_rate": 4.997073271445686e-06, + "loss": 0.5949, + "step": 47 + }, + { + "epoch": 0.04, + "grad_norm": 2.0483598654319453, + "learning_rate": 4.9969129650496775e-06, + "loss": 0.6406, + "step": 48 + }, + { + "epoch": 0.04, + "grad_norm": 1.963056609139284, + "learning_rate": 4.996748387383113e-06, + "loss": 0.6361, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 2.2094923844269307, + "learning_rate": 4.996579538727527e-06, + "loss": 0.5901, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 2.1088153449411857, + "learning_rate": 4.996406419371749e-06, + "loss": 0.6458, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 2.093448940617732, + "learning_rate": 4.996229029611926e-06, + "loss": 0.6509, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 2.075116207412987, + "learning_rate": 4.996047369751502e-06, + "loss": 0.6295, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 2.138141165277684, + "learning_rate": 4.995861440101229e-06, + "loss": 0.6088, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 2.186316382848445, + "learning_rate": 4.995671240979161e-06, + "loss": 0.6307, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 2.2513741083982195, + "learning_rate": 4.995476772710657e-06, + "loss": 0.6175, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 2.0827167336870596, + "learning_rate": 4.995278035628379e-06, + "loss": 0.5935, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 2.117977588574442, + "learning_rate": 4.995075030072291e-06, + "loss": 0.5998, + "step": 58 + }, + { + "epoch": 0.05, + "grad_norm": 2.0996940200235485, + "learning_rate": 4.994867756389658e-06, + "loss": 0.6159, + "step": 59 + }, + { + "epoch": 0.05, + "grad_norm": 2.141096165691323, + "learning_rate": 4.994656214935045e-06, + "loss": 0.6294, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 2.022748830058395, + "learning_rate": 4.994440406070323e-06, + "loss": 0.6315, + "step": 61 + }, + { + "epoch": 0.05, + "grad_norm": 2.209132168720991, + "learning_rate": 4.994220330164654e-06, + "loss": 0.5645, + "step": 62 + }, + { + "epoch": 0.05, + "grad_norm": 2.0994557317862674, + "learning_rate": 4.993995987594509e-06, + "loss": 0.6272, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 2.204220831053169, + "learning_rate": 4.99376737874365e-06, + "loss": 0.6379, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 2.127733932186697, + "learning_rate": 4.993534504003141e-06, + "loss": 0.622, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 2.1338506582034316, + "learning_rate": 4.993297363771342e-06, + "loss": 0.6259, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 2.104802764460729, + "learning_rate": 4.993055958453912e-06, + "loss": 0.6414, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 2.0889535347771675, + "learning_rate": 4.9928102884638004e-06, + "loss": 0.6466, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 2.252225316694296, + "learning_rate": 4.992560354221258e-06, + "loss": 0.6167, + "step": 69 + }, + { + "epoch": 0.06, + "grad_norm": 2.015392533516649, + "learning_rate": 4.992306156153827e-06, + "loss": 0.5958, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 2.151741408948778, + "learning_rate": 4.992047694696343e-06, + "loss": 0.5875, + "step": 71 + }, + { + "epoch": 0.06, + "grad_norm": 2.0351299117412696, + "learning_rate": 4.991784970290935e-06, + "loss": 0.5935, + "step": 72 + }, + { + "epoch": 0.06, + "grad_norm": 2.0000962363827983, + "learning_rate": 4.991517983387026e-06, + "loss": 0.6091, + "step": 73 + }, + { + "epoch": 0.06, + "grad_norm": 2.202881736102415, + "learning_rate": 4.99124673444133e-06, + "loss": 0.6122, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 2.015074773396151, + "learning_rate": 4.990971223917848e-06, + "loss": 0.6134, + "step": 75 + }, + { + "epoch": 0.06, + "grad_norm": 2.009305960567766, + "learning_rate": 4.990691452287877e-06, + "loss": 0.6308, + "step": 76 + }, + { + "epoch": 0.06, + "grad_norm": 1.9967884756310221, + "learning_rate": 4.990407420029999e-06, + "loss": 0.6098, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 2.0858738033925905, + "learning_rate": 4.990119127630085e-06, + "loss": 0.6344, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 1.9427707561903895, + "learning_rate": 4.989826575581295e-06, + "loss": 0.6049, + "step": 79 + }, + { + "epoch": 0.07, + "grad_norm": 2.157150584766853, + "learning_rate": 4.989529764384073e-06, + "loss": 0.5965, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 2.0303527419352583, + "learning_rate": 4.989228694546151e-06, + "loss": 0.6524, + "step": 81 + }, + { + "epoch": 0.07, + "grad_norm": 2.128799919475717, + "learning_rate": 4.988923366582546e-06, + "loss": 0.5524, + "step": 82 + }, + { + "epoch": 0.07, + "grad_norm": 2.0122786280510696, + "learning_rate": 4.988613781015557e-06, + "loss": 0.6268, + "step": 83 + }, + { + "epoch": 0.07, + "grad_norm": 2.104580177719229, + "learning_rate": 4.988299938374769e-06, + "loss": 0.6229, + "step": 84 + }, + { + "epoch": 0.07, + "grad_norm": 2.3894843860356834, + "learning_rate": 4.9879818391970455e-06, + "loss": 0.6194, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 1.9615211372441477, + "learning_rate": 4.9876594840265355e-06, + "loss": 0.6355, + "step": 86 + }, + { + "epoch": 0.07, + "grad_norm": 2.4509852093141937, + "learning_rate": 4.987332873414666e-06, + "loss": 0.6405, + "step": 87 + }, + { + "epoch": 0.07, + "grad_norm": 2.178942375285086, + "learning_rate": 4.987002007920142e-06, + "loss": 0.5593, + "step": 88 + }, + { + "epoch": 0.07, + "grad_norm": 2.2625634345900445, + "learning_rate": 4.9866668881089515e-06, + "loss": 0.6133, + "step": 89 + }, + { + "epoch": 0.07, + "grad_norm": 2.363092638811143, + "learning_rate": 4.986327514554356e-06, + "loss": 0.6298, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 2.0401982492138546, + "learning_rate": 4.985983887836894e-06, + "loss": 0.6276, + "step": 91 + }, + { + "epoch": 0.08, + "grad_norm": 2.276956647922478, + "learning_rate": 4.985636008544381e-06, + "loss": 0.5691, + "step": 92 + }, + { + "epoch": 0.08, + "grad_norm": 2.1072762844110233, + "learning_rate": 4.985283877271908e-06, + "loss": 0.6175, + "step": 93 + }, + { + "epoch": 0.08, + "grad_norm": 2.2931866879442637, + "learning_rate": 4.984927494621836e-06, + "loss": 0.6419, + "step": 94 + }, + { + "epoch": 0.08, + "grad_norm": 2.112474101166308, + "learning_rate": 4.984566861203801e-06, + "loss": 0.607, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 2.1816059679212634, + "learning_rate": 4.984201977634711e-06, + "loss": 0.6136, + "step": 96 + }, + { + "epoch": 0.08, + "grad_norm": 2.0620776369966554, + "learning_rate": 4.9838328445387415e-06, + "loss": 0.6372, + "step": 97 + }, + { + "epoch": 0.08, + "grad_norm": 2.147592836641578, + "learning_rate": 4.983459462547341e-06, + "loss": 0.606, + "step": 98 + }, + { + "epoch": 0.08, + "grad_norm": 2.1808001877062453, + "learning_rate": 4.983081832299224e-06, + "loss": 0.6019, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 2.3751999527114087, + "learning_rate": 4.98269995444037e-06, + "loss": 0.6021, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 1.8769470206406913, + "learning_rate": 4.98231382962403e-06, + "loss": 0.6082, + "step": 101 + }, + { + "epoch": 0.08, + "grad_norm": 2.3060925784921347, + "learning_rate": 4.981923458510717e-06, + "loss": 0.6174, + "step": 102 + }, + { + "epoch": 0.09, + "grad_norm": 2.1543176832473683, + "learning_rate": 4.981528841768206e-06, + "loss": 0.6092, + "step": 103 + }, + { + "epoch": 0.09, + "grad_norm": 2.1558689520522547, + "learning_rate": 4.981129980071538e-06, + "loss": 0.587, + "step": 104 + }, + { + "epoch": 0.09, + "grad_norm": 2.3830532005188383, + "learning_rate": 4.980726874103014e-06, + "loss": 0.6518, + "step": 105 + }, + { + "epoch": 0.09, + "grad_norm": 2.3333119576634767, + "learning_rate": 4.980319524552195e-06, + "loss": 0.6096, + "step": 106 + }, + { + "epoch": 0.09, + "grad_norm": 2.1135146855324214, + "learning_rate": 4.9799079321159e-06, + "loss": 0.5728, + "step": 107 + }, + { + "epoch": 0.09, + "grad_norm": 2.2300463384326394, + "learning_rate": 4.9794920974982095e-06, + "loss": 0.6563, + "step": 108 + }, + { + "epoch": 0.09, + "grad_norm": 2.1745234017525443, + "learning_rate": 4.979072021410458e-06, + "loss": 0.5968, + "step": 109 + }, + { + "epoch": 0.09, + "grad_norm": 2.1536586182562334, + "learning_rate": 4.978647704571237e-06, + "loss": 0.6189, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 2.193809374687326, + "learning_rate": 4.97821914770639e-06, + "loss": 0.5864, + "step": 111 + }, + { + "epoch": 0.09, + "grad_norm": 2.0525896373682047, + "learning_rate": 4.977786351549017e-06, + "loss": 0.6101, + "step": 112 + }, + { + "epoch": 0.09, + "grad_norm": 2.216099286618384, + "learning_rate": 4.977349316839467e-06, + "loss": 0.5984, + "step": 113 + }, + { + "epoch": 0.09, + "grad_norm": 2.155122255962579, + "learning_rate": 4.97690804432534e-06, + "loss": 0.6311, + "step": 114 + }, + { + "epoch": 0.1, + "grad_norm": 2.2972101190291374, + "learning_rate": 4.976462534761487e-06, + "loss": 0.5813, + "step": 115 + }, + { + "epoch": 0.1, + "grad_norm": 1.9925413745245948, + "learning_rate": 4.9760127889100044e-06, + "loss": 0.6157, + "step": 116 + }, + { + "epoch": 0.1, + "grad_norm": 2.2802548684036568, + "learning_rate": 4.975558807540238e-06, + "loss": 0.6079, + "step": 117 + }, + { + "epoch": 0.1, + "grad_norm": 2.048888007394621, + "learning_rate": 4.9751005914287775e-06, + "loss": 0.6467, + "step": 118 + }, + { + "epoch": 0.1, + "grad_norm": 2.28661640438254, + "learning_rate": 4.974638141359456e-06, + "loss": 0.6029, + "step": 119 + }, + { + "epoch": 0.1, + "grad_norm": 2.004056683755783, + "learning_rate": 4.974171458123351e-06, + "loss": 0.6289, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 2.1628470048067667, + "learning_rate": 4.97370054251878e-06, + "loss": 0.6139, + "step": 121 + }, + { + "epoch": 0.1, + "grad_norm": 2.056119895466544, + "learning_rate": 4.9732253953513e-06, + "loss": 0.5798, + "step": 122 + }, + { + "epoch": 0.1, + "grad_norm": 2.1716513163164275, + "learning_rate": 4.972746017433709e-06, + "loss": 0.6085, + "step": 123 + }, + { + "epoch": 0.1, + "grad_norm": 2.255856676525811, + "learning_rate": 4.97226240958604e-06, + "loss": 0.6342, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 2.1049280498075373, + "learning_rate": 4.971774572635563e-06, + "loss": 0.6197, + "step": 125 + }, + { + "epoch": 0.1, + "grad_norm": 2.133349390995361, + "learning_rate": 4.97128250741678e-06, + "loss": 0.5751, + "step": 126 + }, + { + "epoch": 0.11, + "grad_norm": 2.2044887467317578, + "learning_rate": 4.97078621477143e-06, + "loss": 0.6611, + "step": 127 + }, + { + "epoch": 0.11, + "grad_norm": 2.1413863795698145, + "learning_rate": 4.970285695548481e-06, + "loss": 0.625, + "step": 128 + }, + { + "epoch": 0.11, + "grad_norm": 2.0229587336296615, + "learning_rate": 4.969780950604132e-06, + "loss": 0.5989, + "step": 129 + }, + { + "epoch": 0.11, + "grad_norm": 2.0983599595244247, + "learning_rate": 4.969271980801808e-06, + "loss": 0.5747, + "step": 130 + }, + { + "epoch": 0.11, + "grad_norm": 2.1059041140010786, + "learning_rate": 4.9687587870121645e-06, + "loss": 0.5869, + "step": 131 + }, + { + "epoch": 0.11, + "grad_norm": 1.8967441614595046, + "learning_rate": 4.9682413701130815e-06, + "loss": 0.6272, + "step": 132 + }, + { + "epoch": 0.11, + "grad_norm": 1.9976164993621088, + "learning_rate": 4.967719730989663e-06, + "loss": 0.6282, + "step": 133 + }, + { + "epoch": 0.11, + "grad_norm": 1.8719131324952145, + "learning_rate": 4.967193870534235e-06, + "loss": 0.6052, + "step": 134 + }, + { + "epoch": 0.11, + "grad_norm": 2.071702997476533, + "learning_rate": 4.9666637896463455e-06, + "loss": 0.5785, + "step": 135 + }, + { + "epoch": 0.11, + "grad_norm": 1.9549455320048341, + "learning_rate": 4.966129489232762e-06, + "loss": 0.5739, + "step": 136 + }, + { + "epoch": 0.11, + "grad_norm": 2.0656898626759315, + "learning_rate": 4.9655909702074684e-06, + "loss": 0.6651, + "step": 137 + }, + { + "epoch": 0.11, + "grad_norm": 2.1185948604203038, + "learning_rate": 4.965048233491669e-06, + "loss": 0.5759, + "step": 138 + }, + { + "epoch": 0.12, + "grad_norm": 2.08566019272993, + "learning_rate": 4.964501280013777e-06, + "loss": 0.6271, + "step": 139 + }, + { + "epoch": 0.12, + "grad_norm": 2.117420903965419, + "learning_rate": 4.963950110709425e-06, + "loss": 0.5968, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 1.9784944143818486, + "learning_rate": 4.963394726521453e-06, + "loss": 0.6112, + "step": 141 + }, + { + "epoch": 0.12, + "grad_norm": 2.077292948039572, + "learning_rate": 4.9628351283999144e-06, + "loss": 0.5636, + "step": 142 + }, + { + "epoch": 0.12, + "grad_norm": 2.223803520245629, + "learning_rate": 4.962271317302068e-06, + "loss": 0.6658, + "step": 143 + }, + { + "epoch": 0.12, + "grad_norm": 2.039369072186367, + "learning_rate": 4.9617032941923796e-06, + "loss": 0.5853, + "step": 144 + }, + { + "epoch": 0.12, + "grad_norm": 2.071470113085907, + "learning_rate": 4.961131060042522e-06, + "loss": 0.601, + "step": 145 + }, + { + "epoch": 0.12, + "grad_norm": 2.437470272347474, + "learning_rate": 4.960554615831372e-06, + "loss": 0.6593, + "step": 146 + }, + { + "epoch": 0.12, + "grad_norm": 2.178684122927139, + "learning_rate": 4.959973962545005e-06, + "loss": 0.607, + "step": 147 + }, + { + "epoch": 0.12, + "grad_norm": 2.097006749956471, + "learning_rate": 4.9593891011767e-06, + "loss": 0.5873, + "step": 148 + }, + { + "epoch": 0.12, + "grad_norm": 1.9801202541822784, + "learning_rate": 4.958800032726931e-06, + "loss": 0.5877, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 2.30001951085656, + "learning_rate": 4.958206758203373e-06, + "loss": 0.6368, + "step": 150 + }, + { + "epoch": 0.13, + "grad_norm": 1.990094260131078, + "learning_rate": 4.957609278620891e-06, + "loss": 0.59, + "step": 151 + }, + { + "epoch": 0.13, + "grad_norm": 2.262163752076628, + "learning_rate": 4.957007595001548e-06, + "loss": 0.5779, + "step": 152 + }, + { + "epoch": 0.13, + "grad_norm": 2.1970152093220983, + "learning_rate": 4.956401708374595e-06, + "loss": 0.5894, + "step": 153 + }, + { + "epoch": 0.13, + "grad_norm": 2.220825872684071, + "learning_rate": 4.9557916197764745e-06, + "loss": 0.6528, + "step": 154 + }, + { + "epoch": 0.13, + "grad_norm": 2.099472677591387, + "learning_rate": 4.955177330250817e-06, + "loss": 0.5798, + "step": 155 + }, + { + "epoch": 0.13, + "grad_norm": 2.159203936881569, + "learning_rate": 4.954558840848437e-06, + "loss": 0.6206, + "step": 156 + }, + { + "epoch": 0.13, + "grad_norm": 2.185152414039555, + "learning_rate": 4.953936152627338e-06, + "loss": 0.5624, + "step": 157 + }, + { + "epoch": 0.13, + "grad_norm": 2.0679748168992624, + "learning_rate": 4.953309266652701e-06, + "loss": 0.5859, + "step": 158 + }, + { + "epoch": 0.13, + "grad_norm": 2.327237187255128, + "learning_rate": 4.952678183996891e-06, + "loss": 0.5632, + "step": 159 + }, + { + "epoch": 0.13, + "grad_norm": 2.2865519679977417, + "learning_rate": 4.952042905739451e-06, + "loss": 0.6965, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 2.523435408018699, + "learning_rate": 4.9514034329671e-06, + "loss": 0.6217, + "step": 161 + }, + { + "epoch": 0.13, + "grad_norm": 2.4992653226709636, + "learning_rate": 4.950759766773734e-06, + "loss": 0.6175, + "step": 162 + }, + { + "epoch": 0.14, + "grad_norm": 2.432752824777114, + "learning_rate": 4.950111908260423e-06, + "loss": 0.5862, + "step": 163 + }, + { + "epoch": 0.14, + "grad_norm": 2.137500912204061, + "learning_rate": 4.949459858535404e-06, + "loss": 0.6124, + "step": 164 + }, + { + "epoch": 0.14, + "grad_norm": 2.2226376224120474, + "learning_rate": 4.94880361871409e-06, + "loss": 0.5891, + "step": 165 + }, + { + "epoch": 0.14, + "grad_norm": 2.3821839805775165, + "learning_rate": 4.9481431899190544e-06, + "loss": 0.6008, + "step": 166 + }, + { + "epoch": 0.14, + "grad_norm": 2.306242834684614, + "learning_rate": 4.947478573280044e-06, + "loss": 0.6159, + "step": 167 + }, + { + "epoch": 0.14, + "grad_norm": 2.3298092236851518, + "learning_rate": 4.946809769933963e-06, + "loss": 0.5809, + "step": 168 + }, + { + "epoch": 0.14, + "grad_norm": 2.364296499621558, + "learning_rate": 4.946136781024883e-06, + "loss": 0.5895, + "step": 169 + }, + { + "epoch": 0.14, + "grad_norm": 2.237241095609228, + "learning_rate": 4.945459607704029e-06, + "loss": 0.6144, + "step": 170 + }, + { + "epoch": 0.14, + "grad_norm": 2.4027419761972264, + "learning_rate": 4.9447782511297905e-06, + "loss": 0.5985, + "step": 171 + }, + { + "epoch": 0.14, + "grad_norm": 2.1547059182244284, + "learning_rate": 4.944092712467709e-06, + "loss": 0.5763, + "step": 172 + }, + { + "epoch": 0.14, + "grad_norm": 2.1530221667047984, + "learning_rate": 4.9434029928904805e-06, + "loss": 0.5692, + "step": 173 + }, + { + "epoch": 0.14, + "grad_norm": 2.228588593294869, + "learning_rate": 4.942709093577954e-06, + "loss": 0.5896, + "step": 174 + }, + { + "epoch": 0.15, + "grad_norm": 2.1597295307130198, + "learning_rate": 4.942011015717129e-06, + "loss": 0.5864, + "step": 175 + }, + { + "epoch": 0.15, + "grad_norm": 2.321140955498194, + "learning_rate": 4.941308760502149e-06, + "loss": 0.6089, + "step": 176 + }, + { + "epoch": 0.15, + "grad_norm": 2.220124736460707, + "learning_rate": 4.940602329134309e-06, + "loss": 0.5786, + "step": 177 + }, + { + "epoch": 0.15, + "grad_norm": 2.1698038563080417, + "learning_rate": 4.939891722822043e-06, + "loss": 0.5749, + "step": 178 + }, + { + "epoch": 0.15, + "grad_norm": 2.244425969121411, + "learning_rate": 4.93917694278093e-06, + "loss": 0.5877, + "step": 179 + }, + { + "epoch": 0.15, + "grad_norm": 2.143920008069458, + "learning_rate": 4.938457990233687e-06, + "loss": 0.6024, + "step": 180 + }, + { + "epoch": 0.15, + "grad_norm": 2.1786040820345813, + "learning_rate": 4.937734866410169e-06, + "loss": 0.5845, + "step": 181 + }, + { + "epoch": 0.15, + "grad_norm": 2.301832824481007, + "learning_rate": 4.9370075725473665e-06, + "loss": 0.6182, + "step": 182 + }, + { + "epoch": 0.15, + "grad_norm": 2.3748033727083997, + "learning_rate": 4.936276109889403e-06, + "loss": 0.6073, + "step": 183 + }, + { + "epoch": 0.15, + "grad_norm": 2.476334487382023, + "learning_rate": 4.935540479687534e-06, + "loss": 0.5793, + "step": 184 + }, + { + "epoch": 0.15, + "grad_norm": 2.2509466352322494, + "learning_rate": 4.934800683200143e-06, + "loss": 0.6133, + "step": 185 + }, + { + "epoch": 0.15, + "grad_norm": 2.8391697547684873, + "learning_rate": 4.934056721692742e-06, + "loss": 0.5967, + "step": 186 + }, + { + "epoch": 0.16, + "grad_norm": 2.4492364225391765, + "learning_rate": 4.933308596437965e-06, + "loss": 0.5676, + "step": 187 + }, + { + "epoch": 0.16, + "grad_norm": 2.685548141821295, + "learning_rate": 4.932556308715573e-06, + "loss": 0.6069, + "step": 188 + }, + { + "epoch": 0.16, + "grad_norm": 2.261217637824808, + "learning_rate": 4.931799859812443e-06, + "loss": 0.6411, + "step": 189 + }, + { + "epoch": 0.16, + "grad_norm": 2.3838284395200966, + "learning_rate": 4.931039251022573e-06, + "loss": 0.5745, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 2.2550921344466164, + "learning_rate": 4.930274483647074e-06, + "loss": 0.5989, + "step": 191 + }, + { + "epoch": 0.16, + "grad_norm": 2.078406234527636, + "learning_rate": 4.929505558994175e-06, + "loss": 0.5998, + "step": 192 + }, + { + "epoch": 0.16, + "grad_norm": 2.592864566091496, + "learning_rate": 4.928732478379214e-06, + "loss": 0.5842, + "step": 193 + }, + { + "epoch": 0.16, + "grad_norm": 2.092752299259724, + "learning_rate": 4.927955243124638e-06, + "loss": 0.5789, + "step": 194 + }, + { + "epoch": 0.16, + "grad_norm": 2.3799311595696966, + "learning_rate": 4.927173854560002e-06, + "loss": 0.6265, + "step": 195 + }, + { + "epoch": 0.16, + "grad_norm": 2.246876688010602, + "learning_rate": 4.926388314021964e-06, + "loss": 0.6126, + "step": 196 + }, + { + "epoch": 0.16, + "grad_norm": 2.1409898276704578, + "learning_rate": 4.925598622854287e-06, + "loss": 0.6073, + "step": 197 + }, + { + "epoch": 0.16, + "grad_norm": 2.5946158421875385, + "learning_rate": 4.924804782407834e-06, + "loss": 0.6154, + "step": 198 + }, + { + "epoch": 0.16, + "grad_norm": 2.1225494320427982, + "learning_rate": 4.924006794040562e-06, + "loss": 0.583, + "step": 199 + }, + { + "epoch": 0.17, + "grad_norm": 2.1971323526291338, + "learning_rate": 4.923204659117528e-06, + "loss": 0.6078, + "step": 200 + }, + { + "epoch": 0.17, + "grad_norm": 2.289185506404785, + "learning_rate": 4.92239837901088e-06, + "loss": 0.6127, + "step": 201 + }, + { + "epoch": 0.17, + "grad_norm": 2.0071007751625354, + "learning_rate": 4.921587955099858e-06, + "loss": 0.5804, + "step": 202 + }, + { + "epoch": 0.17, + "grad_norm": 2.2981840149068247, + "learning_rate": 4.920773388770789e-06, + "loss": 0.6027, + "step": 203 + }, + { + "epoch": 0.17, + "grad_norm": 2.236179116886702, + "learning_rate": 4.919954681417087e-06, + "loss": 0.6179, + "step": 204 + }, + { + "epoch": 0.17, + "grad_norm": 2.007422589251611, + "learning_rate": 4.91913183443925e-06, + "loss": 0.5647, + "step": 205 + }, + { + "epoch": 0.17, + "grad_norm": 2.1402813555735483, + "learning_rate": 4.918304849244857e-06, + "loss": 0.5841, + "step": 206 + }, + { + "epoch": 0.17, + "grad_norm": 2.0456415785177104, + "learning_rate": 4.917473727248565e-06, + "loss": 0.5524, + "step": 207 + }, + { + "epoch": 0.17, + "grad_norm": 1.9673558126020942, + "learning_rate": 4.916638469872109e-06, + "loss": 0.5698, + "step": 208 + }, + { + "epoch": 0.17, + "grad_norm": 2.015111672496819, + "learning_rate": 4.9157990785442964e-06, + "loss": 0.5957, + "step": 209 + }, + { + "epoch": 0.17, + "grad_norm": 1.9502065547578398, + "learning_rate": 4.9149555547010086e-06, + "loss": 0.5592, + "step": 210 + }, + { + "epoch": 0.17, + "grad_norm": 2.167936522558899, + "learning_rate": 4.9141078997851945e-06, + "loss": 0.5705, + "step": 211 + }, + { + "epoch": 0.18, + "grad_norm": 2.2066587458997935, + "learning_rate": 4.91325611524687e-06, + "loss": 0.5526, + "step": 212 + }, + { + "epoch": 0.18, + "grad_norm": 1.9132995625903553, + "learning_rate": 4.9124002025431136e-06, + "loss": 0.5767, + "step": 213 + }, + { + "epoch": 0.18, + "grad_norm": 2.0097281107801277, + "learning_rate": 4.91154016313807e-06, + "loss": 0.6185, + "step": 214 + }, + { + "epoch": 0.18, + "grad_norm": 2.023532008241332, + "learning_rate": 4.910675998502938e-06, + "loss": 0.6005, + "step": 215 + }, + { + "epoch": 0.18, + "grad_norm": 1.9253831001776973, + "learning_rate": 4.909807710115977e-06, + "loss": 0.5769, + "step": 216 + }, + { + "epoch": 0.18, + "grad_norm": 2.066862408842564, + "learning_rate": 4.908935299462497e-06, + "loss": 0.5671, + "step": 217 + }, + { + "epoch": 0.18, + "grad_norm": 1.9412704290792853, + "learning_rate": 4.908058768034862e-06, + "loss": 0.5568, + "step": 218 + }, + { + "epoch": 0.18, + "grad_norm": 2.185994457097553, + "learning_rate": 4.907178117332487e-06, + "loss": 0.5621, + "step": 219 + }, + { + "epoch": 0.18, + "grad_norm": 2.021517127546353, + "learning_rate": 4.906293348861829e-06, + "loss": 0.5672, + "step": 220 + }, + { + "epoch": 0.18, + "grad_norm": 2.099703967072734, + "learning_rate": 4.905404464136391e-06, + "loss": 0.5366, + "step": 221 + }, + { + "epoch": 0.18, + "grad_norm": 2.030197056583618, + "learning_rate": 4.904511464676718e-06, + "loss": 0.6064, + "step": 222 + }, + { + "epoch": 0.18, + "grad_norm": 2.4170102988954896, + "learning_rate": 4.903614352010393e-06, + "loss": 0.5919, + "step": 223 + }, + { + "epoch": 0.19, + "grad_norm": 2.0819468873015476, + "learning_rate": 4.9027131276720355e-06, + "loss": 0.5366, + "step": 224 + }, + { + "epoch": 0.19, + "grad_norm": 2.148008018153629, + "learning_rate": 4.901807793203299e-06, + "loss": 0.597, + "step": 225 + }, + { + "epoch": 0.19, + "grad_norm": 2.0303725862017186, + "learning_rate": 4.900898350152866e-06, + "loss": 0.6394, + "step": 226 + }, + { + "epoch": 0.19, + "grad_norm": 2.1598989214704334, + "learning_rate": 4.899984800076449e-06, + "loss": 0.5932, + "step": 227 + }, + { + "epoch": 0.19, + "grad_norm": 2.0816312637185255, + "learning_rate": 4.899067144536786e-06, + "loss": 0.5909, + "step": 228 + }, + { + "epoch": 0.19, + "grad_norm": 1.9024067197329315, + "learning_rate": 4.8981453851036365e-06, + "loss": 0.5463, + "step": 229 + }, + { + "epoch": 0.19, + "grad_norm": 2.1830926868871043, + "learning_rate": 4.897219523353781e-06, + "loss": 0.5821, + "step": 230 + }, + { + "epoch": 0.19, + "grad_norm": 2.1156269612794016, + "learning_rate": 4.8962895608710195e-06, + "loss": 0.5993, + "step": 231 + }, + { + "epoch": 0.19, + "grad_norm": 1.9653407654210864, + "learning_rate": 4.895355499246162e-06, + "loss": 0.5525, + "step": 232 + }, + { + "epoch": 0.19, + "grad_norm": 2.367769051061897, + "learning_rate": 4.894417340077036e-06, + "loss": 0.5683, + "step": 233 + }, + { + "epoch": 0.19, + "grad_norm": 2.078327064466567, + "learning_rate": 4.893475084968474e-06, + "loss": 0.6184, + "step": 234 + }, + { + "epoch": 0.19, + "grad_norm": 2.1661882731589475, + "learning_rate": 4.8925287355323195e-06, + "loss": 0.6321, + "step": 235 + }, + { + "epoch": 0.2, + "grad_norm": 2.182760952002799, + "learning_rate": 4.891578293387413e-06, + "loss": 0.6254, + "step": 236 + }, + { + "epoch": 0.2, + "grad_norm": 1.998723579962691, + "learning_rate": 4.890623760159605e-06, + "loss": 0.5371, + "step": 237 + }, + { + "epoch": 0.2, + "grad_norm": 2.319922346931926, + "learning_rate": 4.8896651374817365e-06, + "loss": 0.5941, + "step": 238 + }, + { + "epoch": 0.2, + "grad_norm": 2.090735197217999, + "learning_rate": 4.888702426993648e-06, + "loss": 0.577, + "step": 239 + }, + { + "epoch": 0.2, + "grad_norm": 2.1247199987228558, + "learning_rate": 4.887735630342173e-06, + "loss": 0.5928, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 2.33151114429804, + "learning_rate": 4.8867647491811315e-06, + "loss": 0.5838, + "step": 241 + }, + { + "epoch": 0.2, + "grad_norm": 2.1570026356289147, + "learning_rate": 4.885789785171334e-06, + "loss": 0.5642, + "step": 242 + }, + { + "epoch": 0.2, + "grad_norm": 2.049571197047368, + "learning_rate": 4.884810739980575e-06, + "loss": 0.6684, + "step": 243 + }, + { + "epoch": 0.2, + "grad_norm": 1.9810062424466381, + "learning_rate": 4.883827615283626e-06, + "loss": 0.5942, + "step": 244 + }, + { + "epoch": 0.2, + "grad_norm": 2.145869663660159, + "learning_rate": 4.882840412762244e-06, + "loss": 0.6356, + "step": 245 + }, + { + "epoch": 0.2, + "grad_norm": 2.19290302186514, + "learning_rate": 4.881849134105156e-06, + "loss": 0.6189, + "step": 246 + }, + { + "epoch": 0.2, + "grad_norm": 2.0561043419872984, + "learning_rate": 4.880853781008062e-06, + "loss": 0.5563, + "step": 247 + }, + { + "epoch": 0.21, + "grad_norm": 1.8831183793224635, + "learning_rate": 4.879854355173638e-06, + "loss": 0.5522, + "step": 248 + }, + { + "epoch": 0.21, + "grad_norm": 2.020981606684741, + "learning_rate": 4.878850858311518e-06, + "loss": 0.5548, + "step": 249 + }, + { + "epoch": 0.21, + "grad_norm": 2.060242570493272, + "learning_rate": 4.877843292138307e-06, + "loss": 0.5715, + "step": 250 + }, + { + "epoch": 0.21, + "grad_norm": 2.082455778933014, + "learning_rate": 4.8768316583775665e-06, + "loss": 0.5959, + "step": 251 + }, + { + "epoch": 0.21, + "grad_norm": 1.9830929719438626, + "learning_rate": 4.875815958759819e-06, + "loss": 0.5813, + "step": 252 + }, + { + "epoch": 0.21, + "grad_norm": 1.9772267506828567, + "learning_rate": 4.8747961950225406e-06, + "loss": 0.539, + "step": 253 + }, + { + "epoch": 0.21, + "grad_norm": 2.1492561995002104, + "learning_rate": 4.873772368910161e-06, + "loss": 0.6059, + "step": 254 + }, + { + "epoch": 0.21, + "grad_norm": 2.253757247139787, + "learning_rate": 4.872744482174058e-06, + "loss": 0.5897, + "step": 255 + }, + { + "epoch": 0.21, + "grad_norm": 2.3282624851882496, + "learning_rate": 4.8717125365725545e-06, + "loss": 0.5675, + "step": 256 + }, + { + "epoch": 0.21, + "grad_norm": 2.15573581133063, + "learning_rate": 4.8706765338709185e-06, + "loss": 0.5958, + "step": 257 + }, + { + "epoch": 0.21, + "grad_norm": 2.073289220218241, + "learning_rate": 4.869636475841358e-06, + "loss": 0.6052, + "step": 258 + }, + { + "epoch": 0.21, + "grad_norm": 2.293714090249444, + "learning_rate": 4.8685923642630165e-06, + "loss": 0.5786, + "step": 259 + }, + { + "epoch": 0.22, + "grad_norm": 1.9496544276539172, + "learning_rate": 4.867544200921974e-06, + "loss": 0.6163, + "step": 260 + }, + { + "epoch": 0.22, + "grad_norm": 2.5267016753690132, + "learning_rate": 4.866491987611239e-06, + "loss": 0.6223, + "step": 261 + }, + { + "epoch": 0.22, + "grad_norm": 1.8731249445320794, + "learning_rate": 4.865435726130751e-06, + "loss": 0.5632, + "step": 262 + }, + { + "epoch": 0.22, + "grad_norm": 2.3586331105798863, + "learning_rate": 4.86437541828737e-06, + "loss": 0.5769, + "step": 263 + }, + { + "epoch": 0.22, + "grad_norm": 2.0258106914510585, + "learning_rate": 4.863311065894883e-06, + "loss": 0.6103, + "step": 264 + }, + { + "epoch": 0.22, + "grad_norm": 2.2543614390885955, + "learning_rate": 4.862242670773991e-06, + "loss": 0.5844, + "step": 265 + }, + { + "epoch": 0.22, + "grad_norm": 1.9440299381244668, + "learning_rate": 4.861170234752314e-06, + "loss": 0.5559, + "step": 266 + }, + { + "epoch": 0.22, + "grad_norm": 2.254538268495492, + "learning_rate": 4.8600937596643815e-06, + "loss": 0.5709, + "step": 267 + }, + { + "epoch": 0.22, + "grad_norm": 2.007651746385687, + "learning_rate": 4.8590132473516346e-06, + "loss": 0.573, + "step": 268 + }, + { + "epoch": 0.22, + "grad_norm": 2.0735253118288837, + "learning_rate": 4.857928699662421e-06, + "loss": 0.5954, + "step": 269 + }, + { + "epoch": 0.22, + "grad_norm": 2.024775417101569, + "learning_rate": 4.856840118451989e-06, + "loss": 0.5992, + "step": 270 + }, + { + "epoch": 0.22, + "grad_norm": 2.1043310699945814, + "learning_rate": 4.855747505582488e-06, + "loss": 0.6507, + "step": 271 + }, + { + "epoch": 0.23, + "grad_norm": 2.0386353328313214, + "learning_rate": 4.854650862922965e-06, + "loss": 0.5666, + "step": 272 + }, + { + "epoch": 0.23, + "grad_norm": 1.978698841367705, + "learning_rate": 4.853550192349358e-06, + "loss": 0.5593, + "step": 273 + }, + { + "epoch": 0.23, + "grad_norm": 1.9386534247633986, + "learning_rate": 4.852445495744497e-06, + "loss": 0.5735, + "step": 274 + }, + { + "epoch": 0.23, + "grad_norm": 2.049346245018599, + "learning_rate": 4.8513367749981e-06, + "loss": 0.5415, + "step": 275 + }, + { + "epoch": 0.23, + "grad_norm": 2.1051969521216605, + "learning_rate": 4.850224032006765e-06, + "loss": 0.5532, + "step": 276 + }, + { + "epoch": 0.23, + "grad_norm": 2.2006792558872315, + "learning_rate": 4.849107268673975e-06, + "loss": 0.5696, + "step": 277 + }, + { + "epoch": 0.23, + "grad_norm": 2.0460787736353647, + "learning_rate": 4.847986486910088e-06, + "loss": 0.5658, + "step": 278 + }, + { + "epoch": 0.23, + "grad_norm": 2.1161843259225406, + "learning_rate": 4.846861688632336e-06, + "loss": 0.583, + "step": 279 + }, + { + "epoch": 0.23, + "grad_norm": 1.8882198480393542, + "learning_rate": 4.8457328757648224e-06, + "loss": 0.5693, + "step": 280 + }, + { + "epoch": 0.23, + "grad_norm": 2.1578413701109596, + "learning_rate": 4.844600050238517e-06, + "loss": 0.5409, + "step": 281 + }, + { + "epoch": 0.23, + "grad_norm": 2.03912467778954, + "learning_rate": 4.843463213991255e-06, + "loss": 0.5908, + "step": 282 + }, + { + "epoch": 0.23, + "grad_norm": 2.2333462480826247, + "learning_rate": 4.842322368967731e-06, + "loss": 0.6088, + "step": 283 + }, + { + "epoch": 0.24, + "grad_norm": 2.06698702157327, + "learning_rate": 4.8411775171194986e-06, + "loss": 0.5953, + "step": 284 + }, + { + "epoch": 0.24, + "grad_norm": 2.1433923121572045, + "learning_rate": 4.840028660404964e-06, + "loss": 0.5851, + "step": 285 + }, + { + "epoch": 0.24, + "grad_norm": 2.214858780835041, + "learning_rate": 4.838875800789386e-06, + "loss": 0.5913, + "step": 286 + }, + { + "epoch": 0.24, + "grad_norm": 2.038128612492624, + "learning_rate": 4.837718940244871e-06, + "loss": 0.5827, + "step": 287 + }, + { + "epoch": 0.24, + "grad_norm": 1.9894065096959768, + "learning_rate": 4.836558080750365e-06, + "loss": 0.5769, + "step": 288 + }, + { + "epoch": 0.24, + "grad_norm": 2.1711590153285822, + "learning_rate": 4.835393224291662e-06, + "loss": 0.654, + "step": 289 + }, + { + "epoch": 0.24, + "grad_norm": 2.105004451988696, + "learning_rate": 4.834224372861386e-06, + "loss": 0.6158, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 1.9554568023729102, + "learning_rate": 4.833051528459001e-06, + "loss": 0.5807, + "step": 291 + }, + { + "epoch": 0.24, + "grad_norm": 2.2693917834500312, + "learning_rate": 4.831874693090797e-06, + "loss": 0.5557, + "step": 292 + }, + { + "epoch": 0.24, + "grad_norm": 1.9081391627126192, + "learning_rate": 4.830693868769892e-06, + "loss": 0.6057, + "step": 293 + }, + { + "epoch": 0.24, + "grad_norm": 2.2133664110768585, + "learning_rate": 4.82950905751623e-06, + "loss": 0.6103, + "step": 294 + }, + { + "epoch": 0.24, + "grad_norm": 2.015392814211589, + "learning_rate": 4.8283202613565735e-06, + "loss": 0.5578, + "step": 295 + }, + { + "epoch": 0.25, + "grad_norm": 2.142124020349717, + "learning_rate": 4.8271274823245e-06, + "loss": 0.5675, + "step": 296 + }, + { + "epoch": 0.25, + "grad_norm": 1.981611826462286, + "learning_rate": 4.825930722460405e-06, + "loss": 0.5696, + "step": 297 + }, + { + "epoch": 0.25, + "grad_norm": 1.966759748348117, + "learning_rate": 4.824729983811486e-06, + "loss": 0.58, + "step": 298 + }, + { + "epoch": 0.25, + "grad_norm": 2.0117040369769397, + "learning_rate": 4.823525268431754e-06, + "loss": 0.6005, + "step": 299 + }, + { + "epoch": 0.25, + "grad_norm": 1.9579664917991193, + "learning_rate": 4.822316578382019e-06, + "loss": 0.5472, + "step": 300 + }, + { + "epoch": 0.25, + "grad_norm": 1.9075723479635032, + "learning_rate": 4.821103915729892e-06, + "loss": 0.5834, + "step": 301 + }, + { + "epoch": 0.25, + "grad_norm": 2.289340229011896, + "learning_rate": 4.819887282549777e-06, + "loss": 0.6088, + "step": 302 + }, + { + "epoch": 0.25, + "grad_norm": 2.0410700553735235, + "learning_rate": 4.818666680922874e-06, + "loss": 0.5449, + "step": 303 + }, + { + "epoch": 0.25, + "grad_norm": 2.074434792511819, + "learning_rate": 4.8174421129371675e-06, + "loss": 0.5826, + "step": 304 + }, + { + "epoch": 0.25, + "grad_norm": 2.1377170527698865, + "learning_rate": 4.816213580687428e-06, + "loss": 0.6262, + "step": 305 + }, + { + "epoch": 0.25, + "grad_norm": 2.060340839248083, + "learning_rate": 4.814981086275209e-06, + "loss": 0.5479, + "step": 306 + }, + { + "epoch": 0.25, + "grad_norm": 2.007036467413588, + "learning_rate": 4.813744631808841e-06, + "loss": 0.5642, + "step": 307 + }, + { + "epoch": 0.26, + "grad_norm": 2.016779606220332, + "learning_rate": 4.8125042194034285e-06, + "loss": 0.5503, + "step": 308 + }, + { + "epoch": 0.26, + "grad_norm": 1.930004252757651, + "learning_rate": 4.811259851180845e-06, + "loss": 0.582, + "step": 309 + }, + { + "epoch": 0.26, + "grad_norm": 1.9179477992752856, + "learning_rate": 4.810011529269734e-06, + "loss": 0.5678, + "step": 310 + }, + { + "epoch": 0.26, + "grad_norm": 2.023430757276848, + "learning_rate": 4.808759255805498e-06, + "loss": 0.614, + "step": 311 + }, + { + "epoch": 0.26, + "grad_norm": 1.8334738409404936, + "learning_rate": 4.807503032930306e-06, + "loss": 0.5742, + "step": 312 + }, + { + "epoch": 0.26, + "grad_norm": 1.937332706274502, + "learning_rate": 4.806242862793075e-06, + "loss": 0.6257, + "step": 313 + }, + { + "epoch": 0.26, + "grad_norm": 2.0265383045700363, + "learning_rate": 4.8049787475494786e-06, + "loss": 0.5733, + "step": 314 + }, + { + "epoch": 0.26, + "grad_norm": 2.056444039073761, + "learning_rate": 4.803710689361939e-06, + "loss": 0.578, + "step": 315 + }, + { + "epoch": 0.26, + "grad_norm": 2.411132719183335, + "learning_rate": 4.802438690399622e-06, + "loss": 0.5778, + "step": 316 + }, + { + "epoch": 0.26, + "grad_norm": 2.0233969242222853, + "learning_rate": 4.801162752838436e-06, + "loss": 0.5649, + "step": 317 + }, + { + "epoch": 0.26, + "grad_norm": 2.2809121915132815, + "learning_rate": 4.799882878861025e-06, + "loss": 0.5589, + "step": 318 + }, + { + "epoch": 0.26, + "grad_norm": 1.9806834041020271, + "learning_rate": 4.798599070656768e-06, + "loss": 0.5753, + "step": 319 + }, + { + "epoch": 0.27, + "grad_norm": 2.095099671577702, + "learning_rate": 4.797311330421773e-06, + "loss": 0.5644, + "step": 320 + }, + { + "epoch": 0.27, + "grad_norm": 2.1697606190375764, + "learning_rate": 4.796019660358877e-06, + "loss": 0.6009, + "step": 321 + }, + { + "epoch": 0.27, + "grad_norm": 1.9549416103216173, + "learning_rate": 4.794724062677635e-06, + "loss": 0.5429, + "step": 322 + }, + { + "epoch": 0.27, + "grad_norm": 1.9986949357292838, + "learning_rate": 4.793424539594323e-06, + "loss": 0.5456, + "step": 323 + }, + { + "epoch": 0.27, + "grad_norm": 1.9414831957796765, + "learning_rate": 4.792121093331935e-06, + "loss": 0.5468, + "step": 324 + }, + { + "epoch": 0.27, + "grad_norm": 2.100702188933012, + "learning_rate": 4.7908137261201685e-06, + "loss": 0.5763, + "step": 325 + }, + { + "epoch": 0.27, + "grad_norm": 2.2747471285831025, + "learning_rate": 4.789502440195436e-06, + "loss": 0.5637, + "step": 326 + }, + { + "epoch": 0.27, + "grad_norm": 1.8996382919319124, + "learning_rate": 4.788187237800849e-06, + "loss": 0.5285, + "step": 327 + }, + { + "epoch": 0.27, + "grad_norm": 2.3451495174978847, + "learning_rate": 4.786868121186218e-06, + "loss": 0.5638, + "step": 328 + }, + { + "epoch": 0.27, + "grad_norm": 2.0437536068229565, + "learning_rate": 4.7855450926080535e-06, + "loss": 0.5282, + "step": 329 + }, + { + "epoch": 0.27, + "grad_norm": 2.1185488514745554, + "learning_rate": 4.784218154329555e-06, + "loss": 0.5689, + "step": 330 + }, + { + "epoch": 0.27, + "grad_norm": 2.08745956731504, + "learning_rate": 4.78288730862061e-06, + "loss": 0.5772, + "step": 331 + }, + { + "epoch": 0.28, + "grad_norm": 1.9479507156354359, + "learning_rate": 4.781552557757789e-06, + "loss": 0.5419, + "step": 332 + }, + { + "epoch": 0.28, + "grad_norm": 2.0211480847937255, + "learning_rate": 4.780213904024346e-06, + "loss": 0.5757, + "step": 333 + }, + { + "epoch": 0.28, + "grad_norm": 1.9075335749936069, + "learning_rate": 4.7788713497102094e-06, + "loss": 0.5693, + "step": 334 + }, + { + "epoch": 0.28, + "grad_norm": 1.9590727137410602, + "learning_rate": 4.777524897111979e-06, + "loss": 0.5501, + "step": 335 + }, + { + "epoch": 0.28, + "grad_norm": 2.0328480247612752, + "learning_rate": 4.776174548532926e-06, + "loss": 0.587, + "step": 336 + }, + { + "epoch": 0.28, + "grad_norm": 2.062540517496736, + "learning_rate": 4.774820306282982e-06, + "loss": 0.5819, + "step": 337 + }, + { + "epoch": 0.28, + "grad_norm": 2.0054452800156195, + "learning_rate": 4.773462172678744e-06, + "loss": 0.5529, + "step": 338 + }, + { + "epoch": 0.28, + "grad_norm": 1.9641125644599562, + "learning_rate": 4.772100150043462e-06, + "loss": 0.5895, + "step": 339 + }, + { + "epoch": 0.28, + "grad_norm": 1.9196744569285298, + "learning_rate": 4.77073424070704e-06, + "loss": 0.5504, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 2.0002752186146484, + "learning_rate": 4.76936444700603e-06, + "loss": 0.5307, + "step": 341 + }, + { + "epoch": 0.28, + "grad_norm": 2.1068919823054344, + "learning_rate": 4.76799077128363e-06, + "loss": 0.5908, + "step": 342 + }, + { + "epoch": 0.28, + "grad_norm": 1.919597745459612, + "learning_rate": 4.766613215889678e-06, + "loss": 0.5423, + "step": 343 + }, + { + "epoch": 0.29, + "grad_norm": 2.0670928578728716, + "learning_rate": 4.765231783180648e-06, + "loss": 0.5901, + "step": 344 + }, + { + "epoch": 0.29, + "grad_norm": 1.906116148793229, + "learning_rate": 4.763846475519648e-06, + "loss": 0.5919, + "step": 345 + }, + { + "epoch": 0.29, + "grad_norm": 1.9133575268702454, + "learning_rate": 4.762457295276413e-06, + "loss": 0.585, + "step": 346 + }, + { + "epoch": 0.29, + "grad_norm": 2.133902651855379, + "learning_rate": 4.7610642448273025e-06, + "loss": 0.5444, + "step": 347 + }, + { + "epoch": 0.29, + "grad_norm": 1.95222194640397, + "learning_rate": 4.7596673265552985e-06, + "loss": 0.5941, + "step": 348 + }, + { + "epoch": 0.29, + "grad_norm": 2.095010268380277, + "learning_rate": 4.758266542849997e-06, + "loss": 0.6045, + "step": 349 + }, + { + "epoch": 0.29, + "grad_norm": 2.0493864712059655, + "learning_rate": 4.756861896107609e-06, + "loss": 0.6011, + "step": 350 + }, + { + "epoch": 0.29, + "grad_norm": 1.9222198823064967, + "learning_rate": 4.755453388730949e-06, + "loss": 0.5521, + "step": 351 + }, + { + "epoch": 0.29, + "grad_norm": 2.368147154955994, + "learning_rate": 4.754041023129442e-06, + "loss": 0.6117, + "step": 352 + }, + { + "epoch": 0.29, + "grad_norm": 1.9734596786106697, + "learning_rate": 4.752624801719108e-06, + "loss": 0.5727, + "step": 353 + }, + { + "epoch": 0.29, + "grad_norm": 2.151510566977991, + "learning_rate": 4.751204726922564e-06, + "loss": 0.6085, + "step": 354 + }, + { + "epoch": 0.29, + "grad_norm": 1.9291219072892685, + "learning_rate": 4.74978080116902e-06, + "loss": 0.5655, + "step": 355 + }, + { + "epoch": 0.3, + "grad_norm": 1.838592559018919, + "learning_rate": 4.748353026894273e-06, + "loss": 0.5508, + "step": 356 + }, + { + "epoch": 0.3, + "grad_norm": 2.069156589116884, + "learning_rate": 4.7469214065407e-06, + "loss": 0.5942, + "step": 357 + }, + { + "epoch": 0.3, + "grad_norm": 1.8960817746615841, + "learning_rate": 4.745485942557264e-06, + "loss": 0.5902, + "step": 358 + }, + { + "epoch": 0.3, + "grad_norm": 2.0606557307859634, + "learning_rate": 4.744046637399497e-06, + "loss": 0.556, + "step": 359 + }, + { + "epoch": 0.3, + "grad_norm": 1.9660065879130573, + "learning_rate": 4.742603493529505e-06, + "loss": 0.5364, + "step": 360 + }, + { + "epoch": 0.3, + "grad_norm": 1.9647921383638112, + "learning_rate": 4.741156513415958e-06, + "loss": 0.5601, + "step": 361 + }, + { + "epoch": 0.3, + "grad_norm": 2.049074688423064, + "learning_rate": 4.739705699534092e-06, + "loss": 0.556, + "step": 362 + }, + { + "epoch": 0.3, + "grad_norm": 1.962593945802751, + "learning_rate": 4.738251054365697e-06, + "loss": 0.5609, + "step": 363 + }, + { + "epoch": 0.3, + "grad_norm": 2.059675349950347, + "learning_rate": 4.736792580399119e-06, + "loss": 0.5499, + "step": 364 + }, + { + "epoch": 0.3, + "grad_norm": 1.8479566025134508, + "learning_rate": 4.7353302801292555e-06, + "loss": 0.5621, + "step": 365 + }, + { + "epoch": 0.3, + "grad_norm": 1.9405450724813613, + "learning_rate": 4.733864156057545e-06, + "loss": 0.5437, + "step": 366 + }, + { + "epoch": 0.3, + "grad_norm": 2.122487864033456, + "learning_rate": 4.7323942106919715e-06, + "loss": 0.5984, + "step": 367 + }, + { + "epoch": 0.31, + "grad_norm": 2.6822841144123046, + "learning_rate": 4.730920446547052e-06, + "loss": 0.5951, + "step": 368 + }, + { + "epoch": 0.31, + "grad_norm": 2.001405394086718, + "learning_rate": 4.729442866143838e-06, + "loss": 0.5552, + "step": 369 + }, + { + "epoch": 0.31, + "grad_norm": 2.081154186949651, + "learning_rate": 4.72796147200991e-06, + "loss": 0.587, + "step": 370 + }, + { + "epoch": 0.31, + "grad_norm": 2.1196544292473236, + "learning_rate": 4.72647626667937e-06, + "loss": 0.5882, + "step": 371 + }, + { + "epoch": 0.31, + "grad_norm": 2.107445583509131, + "learning_rate": 4.724987252692841e-06, + "loss": 0.5389, + "step": 372 + }, + { + "epoch": 0.31, + "grad_norm": 1.9529785007256542, + "learning_rate": 4.723494432597462e-06, + "loss": 0.6439, + "step": 373 + }, + { + "epoch": 0.31, + "grad_norm": 2.11513441515607, + "learning_rate": 4.72199780894688e-06, + "loss": 0.6089, + "step": 374 + }, + { + "epoch": 0.31, + "grad_norm": 1.9769899713721226, + "learning_rate": 4.7204973843012504e-06, + "loss": 0.5393, + "step": 375 + }, + { + "epoch": 0.31, + "grad_norm": 2.063749623036316, + "learning_rate": 4.718993161227231e-06, + "loss": 0.5987, + "step": 376 + }, + { + "epoch": 0.31, + "grad_norm": 2.0515862288253883, + "learning_rate": 4.717485142297977e-06, + "loss": 0.5772, + "step": 377 + }, + { + "epoch": 0.31, + "grad_norm": 1.8962297741946081, + "learning_rate": 4.715973330093135e-06, + "loss": 0.5424, + "step": 378 + }, + { + "epoch": 0.31, + "grad_norm": 2.2210958340400087, + "learning_rate": 4.7144577271988435e-06, + "loss": 0.6072, + "step": 379 + }, + { + "epoch": 0.32, + "grad_norm": 2.067113337475314, + "learning_rate": 4.712938336207724e-06, + "loss": 0.5482, + "step": 380 + }, + { + "epoch": 0.32, + "grad_norm": 1.8985489253954526, + "learning_rate": 4.711415159718876e-06, + "loss": 0.5593, + "step": 381 + }, + { + "epoch": 0.32, + "grad_norm": 2.085236381118245, + "learning_rate": 4.709888200337879e-06, + "loss": 0.5704, + "step": 382 + }, + { + "epoch": 0.32, + "grad_norm": 2.0967664183909784, + "learning_rate": 4.708357460676779e-06, + "loss": 0.5997, + "step": 383 + }, + { + "epoch": 0.32, + "grad_norm": 2.0454278026009645, + "learning_rate": 4.706822943354092e-06, + "loss": 0.5669, + "step": 384 + }, + { + "epoch": 0.32, + "grad_norm": 1.9171673309342674, + "learning_rate": 4.705284650994793e-06, + "loss": 0.517, + "step": 385 + }, + { + "epoch": 0.32, + "grad_norm": 2.2003223432761287, + "learning_rate": 4.70374258623032e-06, + "loss": 0.5957, + "step": 386 + }, + { + "epoch": 0.32, + "grad_norm": 1.936392519491186, + "learning_rate": 4.702196751698557e-06, + "loss": 0.5767, + "step": 387 + }, + { + "epoch": 0.32, + "grad_norm": 2.354272003403086, + "learning_rate": 4.700647150043841e-06, + "loss": 0.6515, + "step": 388 + }, + { + "epoch": 0.32, + "grad_norm": 1.9115059027323418, + "learning_rate": 4.699093783916955e-06, + "loss": 0.5579, + "step": 389 + }, + { + "epoch": 0.32, + "grad_norm": 1.9878827587010002, + "learning_rate": 4.697536655975115e-06, + "loss": 0.572, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 1.9729552535473858, + "learning_rate": 4.69597576888198e-06, + "loss": 0.5665, + "step": 391 + }, + { + "epoch": 0.32, + "grad_norm": 2.177634366499155, + "learning_rate": 4.694411125307632e-06, + "loss": 0.6363, + "step": 392 + }, + { + "epoch": 0.33, + "grad_norm": 1.8955146664976508, + "learning_rate": 4.692842727928584e-06, + "loss": 0.5682, + "step": 393 + }, + { + "epoch": 0.33, + "grad_norm": 2.175305874476245, + "learning_rate": 4.691270579427769e-06, + "loss": 0.5943, + "step": 394 + }, + { + "epoch": 0.33, + "grad_norm": 2.068140527232831, + "learning_rate": 4.689694682494537e-06, + "loss": 0.5659, + "step": 395 + }, + { + "epoch": 0.33, + "grad_norm": 1.9112960694448755, + "learning_rate": 4.688115039824648e-06, + "loss": 0.6048, + "step": 396 + }, + { + "epoch": 0.33, + "grad_norm": 1.9778305624626604, + "learning_rate": 4.686531654120272e-06, + "loss": 0.5695, + "step": 397 + }, + { + "epoch": 0.33, + "grad_norm": 2.096904163204813, + "learning_rate": 4.684944528089981e-06, + "loss": 0.6113, + "step": 398 + }, + { + "epoch": 0.33, + "grad_norm": 2.0011934144948516, + "learning_rate": 4.683353664448745e-06, + "loss": 0.5568, + "step": 399 + }, + { + "epoch": 0.33, + "grad_norm": 1.8562851971757464, + "learning_rate": 4.681759065917929e-06, + "loss": 0.5474, + "step": 400 + }, + { + "epoch": 0.33, + "grad_norm": 1.8190547574166316, + "learning_rate": 4.680160735225285e-06, + "loss": 0.5315, + "step": 401 + }, + { + "epoch": 0.33, + "grad_norm": 1.9247862956929132, + "learning_rate": 4.6785586751049505e-06, + "loss": 0.5568, + "step": 402 + }, + { + "epoch": 0.33, + "grad_norm": 1.8469793674077621, + "learning_rate": 4.676952888297442e-06, + "loss": 0.5811, + "step": 403 + }, + { + "epoch": 0.33, + "grad_norm": 1.946943145198674, + "learning_rate": 4.675343377549653e-06, + "loss": 0.5475, + "step": 404 + }, + { + "epoch": 0.34, + "grad_norm": 1.991304422730463, + "learning_rate": 4.6737301456148445e-06, + "loss": 0.5856, + "step": 405 + }, + { + "epoch": 0.34, + "grad_norm": 1.9168241989446437, + "learning_rate": 4.672113195252644e-06, + "loss": 0.6069, + "step": 406 + }, + { + "epoch": 0.34, + "grad_norm": 1.9305433665377905, + "learning_rate": 4.670492529229039e-06, + "loss": 0.5536, + "step": 407 + }, + { + "epoch": 0.34, + "grad_norm": 1.8441008898830742, + "learning_rate": 4.668868150316377e-06, + "loss": 0.5859, + "step": 408 + }, + { + "epoch": 0.34, + "grad_norm": 1.8879301596961315, + "learning_rate": 4.667240061293351e-06, + "loss": 0.5483, + "step": 409 + }, + { + "epoch": 0.34, + "grad_norm": 2.024767417636281, + "learning_rate": 4.665608264945004e-06, + "loss": 0.5414, + "step": 410 + }, + { + "epoch": 0.34, + "grad_norm": 2.1331610141797395, + "learning_rate": 4.663972764062722e-06, + "loss": 0.5811, + "step": 411 + }, + { + "epoch": 0.34, + "grad_norm": 1.8132480265817386, + "learning_rate": 4.662333561444226e-06, + "loss": 0.5573, + "step": 412 + }, + { + "epoch": 0.34, + "grad_norm": 1.9795813972027145, + "learning_rate": 4.6606906598935675e-06, + "loss": 0.5814, + "step": 413 + }, + { + "epoch": 0.34, + "grad_norm": 1.8782931074297053, + "learning_rate": 4.6590440622211295e-06, + "loss": 0.569, + "step": 414 + }, + { + "epoch": 0.34, + "grad_norm": 1.8219945335518706, + "learning_rate": 4.657393771243614e-06, + "loss": 0.5669, + "step": 415 + }, + { + "epoch": 0.34, + "grad_norm": 2.4047268604371306, + "learning_rate": 4.6557397897840454e-06, + "loss": 0.5602, + "step": 416 + }, + { + "epoch": 0.35, + "grad_norm": 2.064501780523946, + "learning_rate": 4.654082120671757e-06, + "loss": 0.5699, + "step": 417 + }, + { + "epoch": 0.35, + "grad_norm": 1.9183128854940252, + "learning_rate": 4.65242076674239e-06, + "loss": 0.6112, + "step": 418 + }, + { + "epoch": 0.35, + "grad_norm": 1.9315698971629633, + "learning_rate": 4.650755730837894e-06, + "loss": 0.5537, + "step": 419 + }, + { + "epoch": 0.35, + "grad_norm": 1.9527809333659218, + "learning_rate": 4.649087015806509e-06, + "loss": 0.5423, + "step": 420 + }, + { + "epoch": 0.35, + "grad_norm": 1.8940523915995442, + "learning_rate": 4.647414624502777e-06, + "loss": 0.5708, + "step": 421 + }, + { + "epoch": 0.35, + "grad_norm": 1.9976964785548623, + "learning_rate": 4.645738559787524e-06, + "loss": 0.6006, + "step": 422 + }, + { + "epoch": 0.35, + "grad_norm": 1.9098681403283917, + "learning_rate": 4.64405882452786e-06, + "loss": 0.5591, + "step": 423 + }, + { + "epoch": 0.35, + "grad_norm": 1.8695612182804557, + "learning_rate": 4.642375421597175e-06, + "loss": 0.5219, + "step": 424 + }, + { + "epoch": 0.35, + "grad_norm": 1.8912077704810082, + "learning_rate": 4.6406883538751315e-06, + "loss": 0.5224, + "step": 425 + }, + { + "epoch": 0.35, + "grad_norm": 1.9390714726978922, + "learning_rate": 4.638997624247664e-06, + "loss": 0.5359, + "step": 426 + }, + { + "epoch": 0.35, + "grad_norm": 2.051545992296337, + "learning_rate": 4.637303235606968e-06, + "loss": 0.544, + "step": 427 + }, + { + "epoch": 0.35, + "grad_norm": 2.0657109136265914, + "learning_rate": 4.6356051908515e-06, + "loss": 0.5429, + "step": 428 + }, + { + "epoch": 0.36, + "grad_norm": 2.0301022307984793, + "learning_rate": 4.63390349288597e-06, + "loss": 0.5787, + "step": 429 + }, + { + "epoch": 0.36, + "grad_norm": 2.052515756169346, + "learning_rate": 4.632198144621338e-06, + "loss": 0.5778, + "step": 430 + }, + { + "epoch": 0.36, + "grad_norm": 1.9741370495474897, + "learning_rate": 4.630489148974807e-06, + "loss": 0.5142, + "step": 431 + }, + { + "epoch": 0.36, + "grad_norm": 1.9713229498863698, + "learning_rate": 4.62877650886982e-06, + "loss": 0.6127, + "step": 432 + }, + { + "epoch": 0.36, + "grad_norm": 2.1609440121306007, + "learning_rate": 4.627060227236055e-06, + "loss": 0.5886, + "step": 433 + }, + { + "epoch": 0.36, + "grad_norm": 1.944966445355139, + "learning_rate": 4.625340307009418e-06, + "loss": 0.5657, + "step": 434 + }, + { + "epoch": 0.36, + "grad_norm": 2.031003925680835, + "learning_rate": 4.623616751132041e-06, + "loss": 0.5628, + "step": 435 + }, + { + "epoch": 0.36, + "grad_norm": 1.8774113373137704, + "learning_rate": 4.621889562552272e-06, + "loss": 0.6068, + "step": 436 + }, + { + "epoch": 0.36, + "grad_norm": 2.0385201543401785, + "learning_rate": 4.620158744224677e-06, + "loss": 0.5511, + "step": 437 + }, + { + "epoch": 0.36, + "grad_norm": 1.8440750841938207, + "learning_rate": 4.618424299110028e-06, + "loss": 0.5261, + "step": 438 + }, + { + "epoch": 0.36, + "grad_norm": 1.8978691755923442, + "learning_rate": 4.616686230175303e-06, + "loss": 0.5862, + "step": 439 + }, + { + "epoch": 0.36, + "grad_norm": 1.8120850246861446, + "learning_rate": 4.614944540393679e-06, + "loss": 0.5652, + "step": 440 + }, + { + "epoch": 0.37, + "grad_norm": 2.1821084695714914, + "learning_rate": 4.613199232744525e-06, + "loss": 0.5598, + "step": 441 + }, + { + "epoch": 0.37, + "grad_norm": 1.9626422737625222, + "learning_rate": 4.611450310213401e-06, + "loss": 0.5267, + "step": 442 + }, + { + "epoch": 0.37, + "grad_norm": 1.9714913234889215, + "learning_rate": 4.6096977757920505e-06, + "loss": 0.5658, + "step": 443 + }, + { + "epoch": 0.37, + "grad_norm": 2.0179324078198233, + "learning_rate": 4.607941632478393e-06, + "loss": 0.582, + "step": 444 + }, + { + "epoch": 0.37, + "grad_norm": 1.8565193856331161, + "learning_rate": 4.6061818832765246e-06, + "loss": 0.5715, + "step": 445 + }, + { + "epoch": 0.37, + "grad_norm": 1.9798501479599246, + "learning_rate": 4.604418531196708e-06, + "loss": 0.6007, + "step": 446 + }, + { + "epoch": 0.37, + "grad_norm": 2.0095846956468257, + "learning_rate": 4.602651579255369e-06, + "loss": 0.5947, + "step": 447 + }, + { + "epoch": 0.37, + "grad_norm": 1.9316541079988245, + "learning_rate": 4.600881030475093e-06, + "loss": 0.5501, + "step": 448 + }, + { + "epoch": 0.37, + "grad_norm": 2.080069353365406, + "learning_rate": 4.599106887884616e-06, + "loss": 0.5631, + "step": 449 + }, + { + "epoch": 0.37, + "grad_norm": 1.965973137652201, + "learning_rate": 4.5973291545188235e-06, + "loss": 0.5267, + "step": 450 + }, + { + "epoch": 0.37, + "grad_norm": 2.1082225966704087, + "learning_rate": 4.595547833418741e-06, + "loss": 0.6418, + "step": 451 + }, + { + "epoch": 0.37, + "grad_norm": 2.0359312594194083, + "learning_rate": 4.593762927631536e-06, + "loss": 0.5644, + "step": 452 + }, + { + "epoch": 0.38, + "grad_norm": 2.1254892914109433, + "learning_rate": 4.591974440210502e-06, + "loss": 0.5693, + "step": 453 + }, + { + "epoch": 0.38, + "grad_norm": 1.9121188587334927, + "learning_rate": 4.590182374215064e-06, + "loss": 0.5572, + "step": 454 + }, + { + "epoch": 0.38, + "grad_norm": 1.9348642624953207, + "learning_rate": 4.588386732710765e-06, + "loss": 0.5446, + "step": 455 + }, + { + "epoch": 0.38, + "grad_norm": 1.8667846547370581, + "learning_rate": 4.5865875187692695e-06, + "loss": 0.5681, + "step": 456 + }, + { + "epoch": 0.38, + "grad_norm": 1.9219061327454674, + "learning_rate": 4.5847847354683465e-06, + "loss": 0.5508, + "step": 457 + }, + { + "epoch": 0.38, + "grad_norm": 1.8106132369123122, + "learning_rate": 4.5829783858918756e-06, + "loss": 0.5626, + "step": 458 + }, + { + "epoch": 0.38, + "grad_norm": 1.7827483964442634, + "learning_rate": 4.5811684731298355e-06, + "loss": 0.5575, + "step": 459 + }, + { + "epoch": 0.38, + "grad_norm": 1.9284196979863513, + "learning_rate": 4.5793550002783e-06, + "loss": 0.5363, + "step": 460 + }, + { + "epoch": 0.38, + "grad_norm": 2.029647468705457, + "learning_rate": 4.577537970439433e-06, + "loss": 0.5415, + "step": 461 + }, + { + "epoch": 0.38, + "grad_norm": 2.0997127029950087, + "learning_rate": 4.575717386721482e-06, + "loss": 0.5814, + "step": 462 + }, + { + "epoch": 0.38, + "grad_norm": 1.9589290300656341, + "learning_rate": 4.573893252238777e-06, + "loss": 0.5156, + "step": 463 + }, + { + "epoch": 0.38, + "grad_norm": 1.905237143908251, + "learning_rate": 4.572065570111717e-06, + "loss": 0.5536, + "step": 464 + }, + { + "epoch": 0.39, + "grad_norm": 1.929519794935609, + "learning_rate": 4.570234343466775e-06, + "loss": 0.5879, + "step": 465 + }, + { + "epoch": 0.39, + "grad_norm": 2.096095808886982, + "learning_rate": 4.568399575436484e-06, + "loss": 0.6241, + "step": 466 + }, + { + "epoch": 0.39, + "grad_norm": 1.9486118894048778, + "learning_rate": 4.566561269159437e-06, + "loss": 0.6307, + "step": 467 + }, + { + "epoch": 0.39, + "grad_norm": 2.0839490306744586, + "learning_rate": 4.564719427780276e-06, + "loss": 0.5655, + "step": 468 + }, + { + "epoch": 0.39, + "grad_norm": 1.9439525665822102, + "learning_rate": 4.562874054449694e-06, + "loss": 0.5437, + "step": 469 + }, + { + "epoch": 0.39, + "grad_norm": 1.9409142791465297, + "learning_rate": 4.5610251523244244e-06, + "loss": 0.6429, + "step": 470 + }, + { + "epoch": 0.39, + "grad_norm": 1.8664574493795525, + "learning_rate": 4.559172724567238e-06, + "loss": 0.5826, + "step": 471 + }, + { + "epoch": 0.39, + "grad_norm": 1.80819349503324, + "learning_rate": 4.557316774346934e-06, + "loss": 0.5372, + "step": 472 + }, + { + "epoch": 0.39, + "grad_norm": 1.8680097526865296, + "learning_rate": 4.555457304838341e-06, + "loss": 0.5503, + "step": 473 + }, + { + "epoch": 0.39, + "grad_norm": 1.7466938790815696, + "learning_rate": 4.553594319222303e-06, + "loss": 0.5425, + "step": 474 + }, + { + "epoch": 0.39, + "grad_norm": 1.9610557658505607, + "learning_rate": 4.551727820685684e-06, + "loss": 0.5755, + "step": 475 + }, + { + "epoch": 0.39, + "grad_norm": 1.9414839604282412, + "learning_rate": 4.549857812421353e-06, + "loss": 0.5915, + "step": 476 + }, + { + "epoch": 0.4, + "grad_norm": 1.8484957644576423, + "learning_rate": 4.547984297628186e-06, + "loss": 0.5676, + "step": 477 + }, + { + "epoch": 0.4, + "grad_norm": 2.074524028551078, + "learning_rate": 4.546107279511055e-06, + "loss": 0.6084, + "step": 478 + }, + { + "epoch": 0.4, + "grad_norm": 2.069692704122282, + "learning_rate": 4.544226761280826e-06, + "loss": 0.5676, + "step": 479 + }, + { + "epoch": 0.4, + "grad_norm": 1.8975472248317244, + "learning_rate": 4.54234274615435e-06, + "loss": 0.5904, + "step": 480 + }, + { + "epoch": 0.4, + "grad_norm": 2.0118868982719897, + "learning_rate": 4.540455237354466e-06, + "loss": 0.5722, + "step": 481 + }, + { + "epoch": 0.4, + "grad_norm": 1.9733105429381828, + "learning_rate": 4.5385642381099814e-06, + "loss": 0.6112, + "step": 482 + }, + { + "epoch": 0.4, + "grad_norm": 1.862156914026863, + "learning_rate": 4.53666975165568e-06, + "loss": 0.5951, + "step": 483 + }, + { + "epoch": 0.4, + "grad_norm": 1.9512940035297868, + "learning_rate": 4.53477178123231e-06, + "loss": 0.5223, + "step": 484 + }, + { + "epoch": 0.4, + "grad_norm": 1.9202464191558823, + "learning_rate": 4.532870330086577e-06, + "loss": 0.5638, + "step": 485 + }, + { + "epoch": 0.4, + "grad_norm": 1.9015767656854419, + "learning_rate": 4.530965401471143e-06, + "loss": 0.5911, + "step": 486 + }, + { + "epoch": 0.4, + "grad_norm": 1.95190921973106, + "learning_rate": 4.529056998644619e-06, + "loss": 0.6053, + "step": 487 + }, + { + "epoch": 0.4, + "grad_norm": 2.0058459596081644, + "learning_rate": 4.527145124871556e-06, + "loss": 0.5466, + "step": 488 + }, + { + "epoch": 0.41, + "grad_norm": 1.8902620959998047, + "learning_rate": 4.5252297834224454e-06, + "loss": 0.5526, + "step": 489 + }, + { + "epoch": 0.41, + "grad_norm": 1.985466416169018, + "learning_rate": 4.523310977573711e-06, + "loss": 0.5958, + "step": 490 + }, + { + "epoch": 0.41, + "grad_norm": 2.1140148957176415, + "learning_rate": 4.521388710607699e-06, + "loss": 0.613, + "step": 491 + }, + { + "epoch": 0.41, + "grad_norm": 1.9470601192089525, + "learning_rate": 4.51946298581268e-06, + "loss": 0.5847, + "step": 492 + }, + { + "epoch": 0.41, + "grad_norm": 2.0227057176069603, + "learning_rate": 4.51753380648284e-06, + "loss": 0.5784, + "step": 493 + }, + { + "epoch": 0.41, + "grad_norm": 2.05501863673554, + "learning_rate": 4.515601175918269e-06, + "loss": 0.5501, + "step": 494 + }, + { + "epoch": 0.41, + "grad_norm": 2.0129325402811715, + "learning_rate": 4.513665097424967e-06, + "loss": 0.5641, + "step": 495 + }, + { + "epoch": 0.41, + "grad_norm": 2.0322333044110468, + "learning_rate": 4.51172557431483e-06, + "loss": 0.5422, + "step": 496 + }, + { + "epoch": 0.41, + "grad_norm": 1.9573055659958774, + "learning_rate": 4.509782609905644e-06, + "loss": 0.516, + "step": 497 + }, + { + "epoch": 0.41, + "grad_norm": 1.8223127451485421, + "learning_rate": 4.507836207521085e-06, + "loss": 0.5714, + "step": 498 + }, + { + "epoch": 0.41, + "grad_norm": 1.9343089861079434, + "learning_rate": 4.50588637049071e-06, + "loss": 0.5424, + "step": 499 + }, + { + "epoch": 0.41, + "grad_norm": 1.8940990649350729, + "learning_rate": 4.503933102149948e-06, + "loss": 0.5832, + "step": 500 + }, + { + "epoch": 0.42, + "grad_norm": 1.908617301933682, + "learning_rate": 4.501976405840101e-06, + "loss": 0.5399, + "step": 501 + }, + { + "epoch": 0.42, + "grad_norm": 1.8290259512093785, + "learning_rate": 4.500016284908334e-06, + "loss": 0.5561, + "step": 502 + }, + { + "epoch": 0.42, + "grad_norm": 1.9840280991844164, + "learning_rate": 4.49805274270767e-06, + "loss": 0.5645, + "step": 503 + }, + { + "epoch": 0.42, + "grad_norm": 1.9864953051636856, + "learning_rate": 4.496085782596984e-06, + "loss": 0.5369, + "step": 504 + }, + { + "epoch": 0.42, + "grad_norm": 1.979387839103732, + "learning_rate": 4.494115407940999e-06, + "loss": 0.6196, + "step": 505 + }, + { + "epoch": 0.42, + "grad_norm": 1.9266869362165981, + "learning_rate": 4.492141622110279e-06, + "loss": 0.5687, + "step": 506 + }, + { + "epoch": 0.42, + "grad_norm": 1.9887461782376619, + "learning_rate": 4.4901644284812205e-06, + "loss": 0.5264, + "step": 507 + }, + { + "epoch": 0.42, + "grad_norm": 1.8717867803152208, + "learning_rate": 4.488183830436052e-06, + "loss": 0.5612, + "step": 508 + }, + { + "epoch": 0.42, + "grad_norm": 2.0044226171493, + "learning_rate": 4.486199831362828e-06, + "loss": 0.5571, + "step": 509 + }, + { + "epoch": 0.42, + "grad_norm": 2.1075571016617958, + "learning_rate": 4.484212434655414e-06, + "loss": 0.5642, + "step": 510 + }, + { + "epoch": 0.42, + "grad_norm": 1.8031612547539957, + "learning_rate": 4.482221643713494e-06, + "loss": 0.5805, + "step": 511 + }, + { + "epoch": 0.42, + "grad_norm": 1.8782516337672304, + "learning_rate": 4.480227461942556e-06, + "loss": 0.5596, + "step": 512 + }, + { + "epoch": 0.43, + "grad_norm": 2.075073901596185, + "learning_rate": 4.478229892753886e-06, + "loss": 0.6124, + "step": 513 + }, + { + "epoch": 0.43, + "grad_norm": 2.0588983460568304, + "learning_rate": 4.47622893956457e-06, + "loss": 0.5589, + "step": 514 + }, + { + "epoch": 0.43, + "grad_norm": 1.850248236464706, + "learning_rate": 4.474224605797476e-06, + "loss": 0.5603, + "step": 515 + }, + { + "epoch": 0.43, + "grad_norm": 1.932844310652863, + "learning_rate": 4.472216894881261e-06, + "loss": 0.5571, + "step": 516 + }, + { + "epoch": 0.43, + "grad_norm": 2.09975454805468, + "learning_rate": 4.470205810250357e-06, + "loss": 0.5975, + "step": 517 + }, + { + "epoch": 0.43, + "grad_norm": 1.9694087093010304, + "learning_rate": 4.468191355344965e-06, + "loss": 0.5698, + "step": 518 + }, + { + "epoch": 0.43, + "grad_norm": 1.8794788153917539, + "learning_rate": 4.466173533611053e-06, + "loss": 0.5559, + "step": 519 + }, + { + "epoch": 0.43, + "grad_norm": 2.0650455557855434, + "learning_rate": 4.46415234850035e-06, + "loss": 0.5644, + "step": 520 + }, + { + "epoch": 0.43, + "grad_norm": 2.0062649027982022, + "learning_rate": 4.462127803470334e-06, + "loss": 0.608, + "step": 521 + }, + { + "epoch": 0.43, + "grad_norm": 2.043267877462657, + "learning_rate": 4.460099901984235e-06, + "loss": 0.573, + "step": 522 + }, + { + "epoch": 0.43, + "grad_norm": 2.056372436619027, + "learning_rate": 4.4580686475110235e-06, + "loss": 0.5748, + "step": 523 + }, + { + "epoch": 0.43, + "grad_norm": 1.8871033520138176, + "learning_rate": 4.456034043525404e-06, + "loss": 0.5339, + "step": 524 + }, + { + "epoch": 0.44, + "grad_norm": 1.889474616209236, + "learning_rate": 4.45399609350781e-06, + "loss": 0.5185, + "step": 525 + }, + { + "epoch": 0.44, + "grad_norm": 1.9767406217632912, + "learning_rate": 4.451954800944405e-06, + "loss": 0.5758, + "step": 526 + }, + { + "epoch": 0.44, + "grad_norm": 1.9588695861513832, + "learning_rate": 4.449910169327062e-06, + "loss": 0.5472, + "step": 527 + }, + { + "epoch": 0.44, + "grad_norm": 1.8852210889000718, + "learning_rate": 4.447862202153372e-06, + "loss": 0.5917, + "step": 528 + }, + { + "epoch": 0.44, + "grad_norm": 2.0103638871993077, + "learning_rate": 4.445810902926629e-06, + "loss": 0.5761, + "step": 529 + }, + { + "epoch": 0.44, + "grad_norm": 2.201836945389513, + "learning_rate": 4.443756275155827e-06, + "loss": 0.5614, + "step": 530 + }, + { + "epoch": 0.44, + "grad_norm": 1.900702305836831, + "learning_rate": 4.441698322355656e-06, + "loss": 0.5254, + "step": 531 + }, + { + "epoch": 0.44, + "grad_norm": 2.134694583439314, + "learning_rate": 4.4396370480464915e-06, + "loss": 0.5607, + "step": 532 + }, + { + "epoch": 0.44, + "grad_norm": 1.8073751630381198, + "learning_rate": 4.437572455754391e-06, + "loss": 0.536, + "step": 533 + }, + { + "epoch": 0.44, + "grad_norm": 1.9607338020142653, + "learning_rate": 4.435504549011088e-06, + "loss": 0.59, + "step": 534 + }, + { + "epoch": 0.44, + "grad_norm": 2.0756430867435274, + "learning_rate": 4.433433331353988e-06, + "loss": 0.5538, + "step": 535 + }, + { + "epoch": 0.44, + "grad_norm": 1.8280570853718465, + "learning_rate": 4.431358806326158e-06, + "loss": 0.5789, + "step": 536 + }, + { + "epoch": 0.45, + "grad_norm": 2.2005143967434977, + "learning_rate": 4.429280977476321e-06, + "loss": 0.545, + "step": 537 + }, + { + "epoch": 0.45, + "grad_norm": 1.896479397543979, + "learning_rate": 4.4271998483588565e-06, + "loss": 0.5791, + "step": 538 + }, + { + "epoch": 0.45, + "grad_norm": 2.117773381781195, + "learning_rate": 4.425115422533785e-06, + "loss": 0.5234, + "step": 539 + }, + { + "epoch": 0.45, + "grad_norm": 2.4438942429566617, + "learning_rate": 4.423027703566769e-06, + "loss": 0.5692, + "step": 540 + }, + { + "epoch": 0.45, + "grad_norm": 1.873481152225171, + "learning_rate": 4.4209366950291025e-06, + "loss": 0.5739, + "step": 541 + }, + { + "epoch": 0.45, + "grad_norm": 1.8655199147974673, + "learning_rate": 4.4188424004977085e-06, + "loss": 0.5795, + "step": 542 + }, + { + "epoch": 0.45, + "grad_norm": 1.948840412241188, + "learning_rate": 4.416744823555129e-06, + "loss": 0.5304, + "step": 543 + }, + { + "epoch": 0.45, + "grad_norm": 1.8389034133315045, + "learning_rate": 4.414643967789523e-06, + "loss": 0.5076, + "step": 544 + }, + { + "epoch": 0.45, + "grad_norm": 1.8269235720085213, + "learning_rate": 4.412539836794657e-06, + "loss": 0.5837, + "step": 545 + }, + { + "epoch": 0.45, + "grad_norm": 2.1298715969759505, + "learning_rate": 4.410432434169902e-06, + "loss": 0.5694, + "step": 546 + }, + { + "epoch": 0.45, + "grad_norm": 2.0057741366005746, + "learning_rate": 4.408321763520223e-06, + "loss": 0.557, + "step": 547 + }, + { + "epoch": 0.45, + "grad_norm": 1.7901331374893255, + "learning_rate": 4.406207828456177e-06, + "loss": 0.5746, + "step": 548 + }, + { + "epoch": 0.46, + "grad_norm": 2.1994839889416187, + "learning_rate": 4.404090632593904e-06, + "loss": 0.5407, + "step": 549 + }, + { + "epoch": 0.46, + "grad_norm": 1.9664921082690268, + "learning_rate": 4.401970179555123e-06, + "loss": 0.5322, + "step": 550 + }, + { + "epoch": 0.46, + "grad_norm": 1.9933486180243851, + "learning_rate": 4.399846472967124e-06, + "loss": 0.5798, + "step": 551 + }, + { + "epoch": 0.46, + "grad_norm": 1.986612256562151, + "learning_rate": 4.397719516462765e-06, + "loss": 0.5213, + "step": 552 + }, + { + "epoch": 0.46, + "grad_norm": 2.046550123292336, + "learning_rate": 4.395589313680459e-06, + "loss": 0.5857, + "step": 553 + }, + { + "epoch": 0.46, + "grad_norm": 1.7902327250340486, + "learning_rate": 4.393455868264176e-06, + "loss": 0.555, + "step": 554 + }, + { + "epoch": 0.46, + "grad_norm": 2.0203627138517146, + "learning_rate": 4.391319183863432e-06, + "loss": 0.6329, + "step": 555 + }, + { + "epoch": 0.46, + "grad_norm": 1.9373549045181289, + "learning_rate": 4.389179264133281e-06, + "loss": 0.566, + "step": 556 + }, + { + "epoch": 0.46, + "grad_norm": 1.8936753353678124, + "learning_rate": 4.387036112734316e-06, + "loss": 0.5555, + "step": 557 + }, + { + "epoch": 0.46, + "grad_norm": 1.8493817575820743, + "learning_rate": 4.3848897333326545e-06, + "loss": 0.5427, + "step": 558 + }, + { + "epoch": 0.46, + "grad_norm": 1.9119588677783816, + "learning_rate": 4.382740129599937e-06, + "loss": 0.5157, + "step": 559 + }, + { + "epoch": 0.46, + "grad_norm": 1.8190137094200924, + "learning_rate": 4.380587305213321e-06, + "loss": 0.503, + "step": 560 + }, + { + "epoch": 0.47, + "grad_norm": 1.9891332712764953, + "learning_rate": 4.37843126385547e-06, + "loss": 0.5761, + "step": 561 + }, + { + "epoch": 0.47, + "grad_norm": 1.8620896547461154, + "learning_rate": 4.376272009214555e-06, + "loss": 0.5259, + "step": 562 + }, + { + "epoch": 0.47, + "grad_norm": 1.8896721756477406, + "learning_rate": 4.37410954498424e-06, + "loss": 0.5632, + "step": 563 + }, + { + "epoch": 0.47, + "grad_norm": 1.8302281976781984, + "learning_rate": 4.37194387486368e-06, + "loss": 0.5612, + "step": 564 + }, + { + "epoch": 0.47, + "grad_norm": 2.0721820586440165, + "learning_rate": 4.369775002557516e-06, + "loss": 0.533, + "step": 565 + }, + { + "epoch": 0.47, + "grad_norm": 1.8259926551813157, + "learning_rate": 4.367602931775865e-06, + "loss": 0.526, + "step": 566 + }, + { + "epoch": 0.47, + "grad_norm": 1.8096334574000785, + "learning_rate": 4.3654276662343155e-06, + "loss": 0.5306, + "step": 567 + }, + { + "epoch": 0.47, + "grad_norm": 1.9675637591445598, + "learning_rate": 4.363249209653922e-06, + "loss": 0.5577, + "step": 568 + }, + { + "epoch": 0.47, + "grad_norm": 1.8800389115841605, + "learning_rate": 4.361067565761197e-06, + "loss": 0.5553, + "step": 569 + }, + { + "epoch": 0.47, + "grad_norm": 1.827485496395265, + "learning_rate": 4.358882738288105e-06, + "loss": 0.5587, + "step": 570 + }, + { + "epoch": 0.47, + "grad_norm": 1.820954908943235, + "learning_rate": 4.356694730972056e-06, + "loss": 0.6186, + "step": 571 + }, + { + "epoch": 0.47, + "grad_norm": 1.952072431699686, + "learning_rate": 4.3545035475559025e-06, + "loss": 0.5488, + "step": 572 + }, + { + "epoch": 0.48, + "grad_norm": 1.8292648968688423, + "learning_rate": 4.352309191787924e-06, + "loss": 0.5534, + "step": 573 + }, + { + "epoch": 0.48, + "grad_norm": 1.826293122529813, + "learning_rate": 4.350111667421835e-06, + "loss": 0.5872, + "step": 574 + }, + { + "epoch": 0.48, + "grad_norm": 1.9251425791166785, + "learning_rate": 4.347910978216763e-06, + "loss": 0.5298, + "step": 575 + }, + { + "epoch": 0.48, + "grad_norm": 1.8330818196811385, + "learning_rate": 4.345707127937253e-06, + "loss": 0.5871, + "step": 576 + }, + { + "epoch": 0.48, + "grad_norm": 1.7842986545873851, + "learning_rate": 4.3435001203532555e-06, + "loss": 0.4898, + "step": 577 + }, + { + "epoch": 0.48, + "grad_norm": 1.8778666245156521, + "learning_rate": 4.341289959240124e-06, + "loss": 0.5385, + "step": 578 + }, + { + "epoch": 0.48, + "grad_norm": 1.9300679499181266, + "learning_rate": 4.339076648378605e-06, + "loss": 0.5698, + "step": 579 + }, + { + "epoch": 0.48, + "grad_norm": 1.9440861965960357, + "learning_rate": 4.336860191554833e-06, + "loss": 0.5984, + "step": 580 + }, + { + "epoch": 0.48, + "grad_norm": 1.929951096053947, + "learning_rate": 4.3346405925603265e-06, + "loss": 0.6222, + "step": 581 + }, + { + "epoch": 0.48, + "grad_norm": 1.9138258400335695, + "learning_rate": 4.332417855191974e-06, + "loss": 0.5498, + "step": 582 + }, + { + "epoch": 0.48, + "grad_norm": 2.058548455869675, + "learning_rate": 4.330191983252039e-06, + "loss": 0.5218, + "step": 583 + }, + { + "epoch": 0.48, + "grad_norm": 2.243429045583125, + "learning_rate": 4.327962980548142e-06, + "loss": 0.5768, + "step": 584 + }, + { + "epoch": 0.48, + "grad_norm": 1.9213537104634244, + "learning_rate": 4.32573085089326e-06, + "loss": 0.5784, + "step": 585 + }, + { + "epoch": 0.49, + "grad_norm": 1.9165291289119128, + "learning_rate": 4.32349559810572e-06, + "loss": 0.5697, + "step": 586 + }, + { + "epoch": 0.49, + "grad_norm": 1.9674279518735756, + "learning_rate": 4.321257226009193e-06, + "loss": 0.5104, + "step": 587 + }, + { + "epoch": 0.49, + "grad_norm": 1.9051339015323923, + "learning_rate": 4.319015738432683e-06, + "loss": 0.5711, + "step": 588 + }, + { + "epoch": 0.49, + "grad_norm": 1.957357618850765, + "learning_rate": 4.3167711392105245e-06, + "loss": 0.5854, + "step": 589 + }, + { + "epoch": 0.49, + "grad_norm": 1.9859311708308915, + "learning_rate": 4.314523432182376e-06, + "loss": 0.547, + "step": 590 + }, + { + "epoch": 0.49, + "grad_norm": 1.773704456523191, + "learning_rate": 4.312272621193209e-06, + "loss": 0.5259, + "step": 591 + }, + { + "epoch": 0.49, + "grad_norm": 1.82988033655793, + "learning_rate": 4.31001871009331e-06, + "loss": 0.5209, + "step": 592 + }, + { + "epoch": 0.49, + "grad_norm": 1.8925134832060522, + "learning_rate": 4.307761702738264e-06, + "loss": 0.59, + "step": 593 + }, + { + "epoch": 0.49, + "grad_norm": 1.8477075780641046, + "learning_rate": 4.305501602988953e-06, + "loss": 0.5714, + "step": 594 + }, + { + "epoch": 0.49, + "grad_norm": 1.8568432886623798, + "learning_rate": 4.303238414711552e-06, + "loss": 0.5877, + "step": 595 + }, + { + "epoch": 0.49, + "grad_norm": 1.8179798660158206, + "learning_rate": 4.3009721417775166e-06, + "loss": 0.6029, + "step": 596 + }, + { + "epoch": 0.49, + "grad_norm": 1.8494963193854803, + "learning_rate": 4.29870278806358e-06, + "loss": 0.5236, + "step": 597 + }, + { + "epoch": 0.5, + "grad_norm": 1.9586017397154731, + "learning_rate": 4.296430357451744e-06, + "loss": 0.5998, + "step": 598 + }, + { + "epoch": 0.5, + "grad_norm": 1.926616057974202, + "learning_rate": 4.2941548538292765e-06, + "loss": 0.5914, + "step": 599 + }, + { + "epoch": 0.5, + "grad_norm": 1.9321738359144827, + "learning_rate": 4.291876281088701e-06, + "loss": 0.5358, + "step": 600 + }, + { + "epoch": 0.5, + "grad_norm": 1.8229177571361932, + "learning_rate": 4.289594643127788e-06, + "loss": 0.5284, + "step": 601 + }, + { + "epoch": 0.5, + "grad_norm": 1.849252449531427, + "learning_rate": 4.287309943849558e-06, + "loss": 0.5689, + "step": 602 + }, + { + "epoch": 0.5, + "grad_norm": 1.985343175388319, + "learning_rate": 4.285022187162261e-06, + "loss": 0.6101, + "step": 603 + }, + { + "epoch": 0.5, + "grad_norm": 1.9437791826489255, + "learning_rate": 4.2827313769793835e-06, + "loss": 0.5419, + "step": 604 + }, + { + "epoch": 0.5, + "grad_norm": 1.8027421078538746, + "learning_rate": 4.28043751721963e-06, + "loss": 0.5504, + "step": 605 + }, + { + "epoch": 0.5, + "grad_norm": 1.8221230935939319, + "learning_rate": 4.278140611806926e-06, + "loss": 0.5284, + "step": 606 + }, + { + "epoch": 0.5, + "grad_norm": 1.8597205853821357, + "learning_rate": 4.275840664670403e-06, + "loss": 0.623, + "step": 607 + }, + { + "epoch": 0.5, + "grad_norm": 1.7801370844338822, + "learning_rate": 4.2735376797444e-06, + "loss": 0.5265, + "step": 608 + }, + { + "epoch": 0.5, + "grad_norm": 1.9028094416250234, + "learning_rate": 4.271231660968449e-06, + "loss": 0.5764, + "step": 609 + }, + { + "epoch": 0.51, + "grad_norm": 1.9385737581380094, + "learning_rate": 4.268922612287273e-06, + "loss": 0.6047, + "step": 610 + }, + { + "epoch": 0.51, + "grad_norm": 1.760006169733744, + "learning_rate": 4.266610537650778e-06, + "loss": 0.4944, + "step": 611 + }, + { + "epoch": 0.51, + "grad_norm": 1.857083980479501, + "learning_rate": 4.264295441014047e-06, + "loss": 0.5174, + "step": 612 + }, + { + "epoch": 0.51, + "grad_norm": 1.8299942480819913, + "learning_rate": 4.261977326337332e-06, + "loss": 0.5814, + "step": 613 + }, + { + "epoch": 0.51, + "grad_norm": 1.8943903433033418, + "learning_rate": 4.259656197586046e-06, + "loss": 0.5514, + "step": 614 + }, + { + "epoch": 0.51, + "grad_norm": 1.7839062839610529, + "learning_rate": 4.257332058730761e-06, + "loss": 0.5857, + "step": 615 + }, + { + "epoch": 0.51, + "grad_norm": 2.7188975139736256, + "learning_rate": 4.255004913747196e-06, + "loss": 0.5509, + "step": 616 + }, + { + "epoch": 0.51, + "grad_norm": 1.8767461602206779, + "learning_rate": 4.252674766616212e-06, + "loss": 0.5038, + "step": 617 + }, + { + "epoch": 0.51, + "grad_norm": 1.8391588901867753, + "learning_rate": 4.250341621323809e-06, + "loss": 0.5196, + "step": 618 + }, + { + "epoch": 0.51, + "grad_norm": 1.8106924420187829, + "learning_rate": 4.248005481861111e-06, + "loss": 0.5458, + "step": 619 + }, + { + "epoch": 0.51, + "grad_norm": 1.9698953511074666, + "learning_rate": 4.245666352224367e-06, + "loss": 0.5963, + "step": 620 + }, + { + "epoch": 0.51, + "grad_norm": 1.8890424031569348, + "learning_rate": 4.243324236414939e-06, + "loss": 0.5277, + "step": 621 + }, + { + "epoch": 0.52, + "grad_norm": 1.8537879418167673, + "learning_rate": 4.240979138439301e-06, + "loss": 0.5407, + "step": 622 + }, + { + "epoch": 0.52, + "grad_norm": 1.9264981771759184, + "learning_rate": 4.238631062309023e-06, + "loss": 0.5788, + "step": 623 + }, + { + "epoch": 0.52, + "grad_norm": 1.949693389062837, + "learning_rate": 4.236280012040773e-06, + "loss": 0.5007, + "step": 624 + }, + { + "epoch": 0.52, + "grad_norm": 1.8845778025905608, + "learning_rate": 4.233925991656307e-06, + "loss": 0.5905, + "step": 625 + }, + { + "epoch": 0.52, + "grad_norm": 1.8977167810192608, + "learning_rate": 4.231569005182459e-06, + "loss": 0.5342, + "step": 626 + }, + { + "epoch": 0.52, + "grad_norm": 1.9579196623045914, + "learning_rate": 4.229209056651139e-06, + "loss": 0.554, + "step": 627 + }, + { + "epoch": 0.52, + "grad_norm": 1.8427820272426025, + "learning_rate": 4.226846150099324e-06, + "loss": 0.5629, + "step": 628 + }, + { + "epoch": 0.52, + "grad_norm": 1.865218131227253, + "learning_rate": 4.22448028956905e-06, + "loss": 0.558, + "step": 629 + }, + { + "epoch": 0.52, + "grad_norm": 1.7348773966225364, + "learning_rate": 4.222111479107406e-06, + "loss": 0.5332, + "step": 630 + }, + { + "epoch": 0.52, + "grad_norm": 1.779367140127678, + "learning_rate": 4.219739722766528e-06, + "loss": 0.569, + "step": 631 + }, + { + "epoch": 0.52, + "grad_norm": 1.92860570712595, + "learning_rate": 4.217365024603592e-06, + "loss": 0.5342, + "step": 632 + }, + { + "epoch": 0.52, + "grad_norm": 1.946965997476449, + "learning_rate": 4.214987388680804e-06, + "loss": 0.5482, + "step": 633 + }, + { + "epoch": 0.53, + "grad_norm": 1.7930454990298659, + "learning_rate": 4.212606819065399e-06, + "loss": 0.5376, + "step": 634 + }, + { + "epoch": 0.53, + "grad_norm": 1.8379498458279013, + "learning_rate": 4.210223319829626e-06, + "loss": 0.5741, + "step": 635 + }, + { + "epoch": 0.53, + "grad_norm": 1.742977498596499, + "learning_rate": 4.207836895050748e-06, + "loss": 0.5569, + "step": 636 + }, + { + "epoch": 0.53, + "grad_norm": 1.852541709372898, + "learning_rate": 4.205447548811032e-06, + "loss": 0.578, + "step": 637 + }, + { + "epoch": 0.53, + "grad_norm": 1.8180259569107267, + "learning_rate": 4.203055285197745e-06, + "loss": 0.5189, + "step": 638 + }, + { + "epoch": 0.53, + "grad_norm": 1.8177842562763082, + "learning_rate": 4.20066010830314e-06, + "loss": 0.5424, + "step": 639 + }, + { + "epoch": 0.53, + "grad_norm": 1.8068654723170434, + "learning_rate": 4.198262022224457e-06, + "loss": 0.5336, + "step": 640 + }, + { + "epoch": 0.53, + "grad_norm": 1.9664843499052276, + "learning_rate": 4.195861031063909e-06, + "loss": 0.5399, + "step": 641 + }, + { + "epoch": 0.53, + "grad_norm": 1.7812265481792608, + "learning_rate": 4.193457138928683e-06, + "loss": 0.534, + "step": 642 + }, + { + "epoch": 0.53, + "grad_norm": 1.908377487778027, + "learning_rate": 4.191050349930925e-06, + "loss": 0.5831, + "step": 643 + }, + { + "epoch": 0.53, + "grad_norm": 1.8124678634933105, + "learning_rate": 4.18864066818774e-06, + "loss": 0.5309, + "step": 644 + }, + { + "epoch": 0.53, + "grad_norm": 1.902443199964304, + "learning_rate": 4.186228097821176e-06, + "loss": 0.5452, + "step": 645 + }, + { + "epoch": 0.54, + "grad_norm": 1.9694387068719457, + "learning_rate": 4.183812642958227e-06, + "loss": 0.5462, + "step": 646 + }, + { + "epoch": 0.54, + "grad_norm": 1.945352264767711, + "learning_rate": 4.181394307730819e-06, + "loss": 0.4853, + "step": 647 + }, + { + "epoch": 0.54, + "grad_norm": 1.7967416728436914, + "learning_rate": 4.178973096275806e-06, + "loss": 0.5952, + "step": 648 + }, + { + "epoch": 0.54, + "grad_norm": 2.0602433101771616, + "learning_rate": 4.176549012734963e-06, + "loss": 0.6346, + "step": 649 + }, + { + "epoch": 0.54, + "grad_norm": 1.9158731498204968, + "learning_rate": 4.1741220612549746e-06, + "loss": 0.5101, + "step": 650 + }, + { + "epoch": 0.54, + "grad_norm": 1.951875972207364, + "learning_rate": 4.171692245987436e-06, + "loss": 0.5718, + "step": 651 + }, + { + "epoch": 0.54, + "grad_norm": 1.871788727804539, + "learning_rate": 4.169259571088839e-06, + "loss": 0.5516, + "step": 652 + }, + { + "epoch": 0.54, + "grad_norm": 1.945571804366465, + "learning_rate": 4.166824040720566e-06, + "loss": 0.5544, + "step": 653 + }, + { + "epoch": 0.54, + "grad_norm": 1.8975723622706568, + "learning_rate": 4.1643856590488866e-06, + "loss": 0.5643, + "step": 654 + }, + { + "epoch": 0.54, + "grad_norm": 1.9772846459626554, + "learning_rate": 4.161944430244945e-06, + "loss": 0.5487, + "step": 655 + }, + { + "epoch": 0.54, + "grad_norm": 2.036472038769578, + "learning_rate": 4.159500358484759e-06, + "loss": 0.5232, + "step": 656 + }, + { + "epoch": 0.54, + "grad_norm": 1.7742095436926848, + "learning_rate": 4.157053447949206e-06, + "loss": 0.4963, + "step": 657 + }, + { + "epoch": 0.55, + "grad_norm": 2.1819742476725814, + "learning_rate": 4.154603702824023e-06, + "loss": 0.5416, + "step": 658 + }, + { + "epoch": 0.55, + "grad_norm": 1.9151345309457093, + "learning_rate": 4.152151127299794e-06, + "loss": 0.5822, + "step": 659 + }, + { + "epoch": 0.55, + "grad_norm": 2.033640859083771, + "learning_rate": 4.149695725571944e-06, + "loss": 0.5876, + "step": 660 + }, + { + "epoch": 0.55, + "grad_norm": 1.8935471013235925, + "learning_rate": 4.147237501840734e-06, + "loss": 0.548, + "step": 661 + }, + { + "epoch": 0.55, + "grad_norm": 1.7836299476774775, + "learning_rate": 4.144776460311253e-06, + "loss": 0.5274, + "step": 662 + }, + { + "epoch": 0.55, + "grad_norm": 2.194666072449123, + "learning_rate": 4.142312605193407e-06, + "loss": 0.5934, + "step": 663 + }, + { + "epoch": 0.55, + "grad_norm": 1.988265407508224, + "learning_rate": 4.13984594070192e-06, + "loss": 0.5539, + "step": 664 + }, + { + "epoch": 0.55, + "grad_norm": 1.7594955740187146, + "learning_rate": 4.137376471056317e-06, + "loss": 0.5324, + "step": 665 + }, + { + "epoch": 0.55, + "grad_norm": 1.9342530277100989, + "learning_rate": 4.1349042004809224e-06, + "loss": 0.5902, + "step": 666 + }, + { + "epoch": 0.55, + "grad_norm": 1.9757082453588417, + "learning_rate": 4.132429133204856e-06, + "loss": 0.5874, + "step": 667 + }, + { + "epoch": 0.55, + "grad_norm": 1.7792467343474774, + "learning_rate": 4.129951273462016e-06, + "loss": 0.5516, + "step": 668 + }, + { + "epoch": 0.55, + "grad_norm": 1.9010392264817964, + "learning_rate": 4.127470625491082e-06, + "loss": 0.5793, + "step": 669 + }, + { + "epoch": 0.56, + "grad_norm": 2.054505290884914, + "learning_rate": 4.1249871935355e-06, + "loss": 0.5718, + "step": 670 + }, + { + "epoch": 0.56, + "grad_norm": 1.8010036617727825, + "learning_rate": 4.1225009818434805e-06, + "loss": 0.5698, + "step": 671 + }, + { + "epoch": 0.56, + "grad_norm": 1.975020822034628, + "learning_rate": 4.120011994667988e-06, + "loss": 0.5739, + "step": 672 + }, + { + "epoch": 0.56, + "grad_norm": 1.9801075045379748, + "learning_rate": 4.117520236266734e-06, + "loss": 0.5589, + "step": 673 + }, + { + "epoch": 0.56, + "grad_norm": 1.7773808874926829, + "learning_rate": 4.115025710902173e-06, + "loss": 0.5276, + "step": 674 + }, + { + "epoch": 0.56, + "grad_norm": 1.890298398205481, + "learning_rate": 4.112528422841491e-06, + "loss": 0.4914, + "step": 675 + }, + { + "epoch": 0.56, + "grad_norm": 1.9087570296379215, + "learning_rate": 4.110028376356599e-06, + "loss": 0.5412, + "step": 676 + }, + { + "epoch": 0.56, + "grad_norm": 1.8908271691889404, + "learning_rate": 4.1075255757241295e-06, + "loss": 0.5618, + "step": 677 + }, + { + "epoch": 0.56, + "grad_norm": 2.024312170169272, + "learning_rate": 4.105020025225423e-06, + "loss": 0.5618, + "step": 678 + }, + { + "epoch": 0.56, + "grad_norm": 1.8072403207581518, + "learning_rate": 4.102511729146528e-06, + "loss": 0.5744, + "step": 679 + }, + { + "epoch": 0.56, + "grad_norm": 1.7750572145097157, + "learning_rate": 4.100000691778185e-06, + "loss": 0.5716, + "step": 680 + }, + { + "epoch": 0.56, + "grad_norm": 1.8778337896632162, + "learning_rate": 4.097486917415827e-06, + "loss": 0.5683, + "step": 681 + }, + { + "epoch": 0.57, + "grad_norm": 1.9710167098273688, + "learning_rate": 4.094970410359568e-06, + "loss": 0.5273, + "step": 682 + }, + { + "epoch": 0.57, + "grad_norm": 1.9136975523972874, + "learning_rate": 4.092451174914196e-06, + "loss": 0.5239, + "step": 683 + }, + { + "epoch": 0.57, + "grad_norm": 1.929344793900944, + "learning_rate": 4.089929215389167e-06, + "loss": 0.5388, + "step": 684 + }, + { + "epoch": 0.57, + "grad_norm": 1.7211535229712278, + "learning_rate": 4.087404536098597e-06, + "loss": 0.5068, + "step": 685 + }, + { + "epoch": 0.57, + "grad_norm": 1.8739637749458882, + "learning_rate": 4.084877141361254e-06, + "loss": 0.5537, + "step": 686 + }, + { + "epoch": 0.57, + "grad_norm": 1.9268469960932768, + "learning_rate": 4.082347035500553e-06, + "loss": 0.5875, + "step": 687 + }, + { + "epoch": 0.57, + "grad_norm": 1.896542320004603, + "learning_rate": 4.079814222844541e-06, + "loss": 0.5314, + "step": 688 + }, + { + "epoch": 0.57, + "grad_norm": 1.723925126440519, + "learning_rate": 4.077278707725904e-06, + "loss": 0.5009, + "step": 689 + }, + { + "epoch": 0.57, + "grad_norm": 1.8345210205201996, + "learning_rate": 4.074740494481942e-06, + "loss": 0.5544, + "step": 690 + }, + { + "epoch": 0.57, + "grad_norm": 1.766819080519227, + "learning_rate": 4.072199587454578e-06, + "loss": 0.5393, + "step": 691 + }, + { + "epoch": 0.57, + "grad_norm": 1.9577975399484282, + "learning_rate": 4.069655990990337e-06, + "loss": 0.5357, + "step": 692 + }, + { + "epoch": 0.57, + "grad_norm": 1.8254761359015224, + "learning_rate": 4.06710970944035e-06, + "loss": 0.5797, + "step": 693 + }, + { + "epoch": 0.58, + "grad_norm": 2.1203973374999214, + "learning_rate": 4.064560747160337e-06, + "loss": 0.5811, + "step": 694 + }, + { + "epoch": 0.58, + "grad_norm": 1.9066221824053846, + "learning_rate": 4.062009108510605e-06, + "loss": 0.5014, + "step": 695 + }, + { + "epoch": 0.58, + "grad_norm": 1.951489716071849, + "learning_rate": 4.059454797856039e-06, + "loss": 0.529, + "step": 696 + }, + { + "epoch": 0.58, + "grad_norm": 1.8402907113209426, + "learning_rate": 4.056897819566096e-06, + "loss": 0.4942, + "step": 697 + }, + { + "epoch": 0.58, + "grad_norm": 2.0368715640768498, + "learning_rate": 4.0543381780147965e-06, + "loss": 0.5245, + "step": 698 + }, + { + "epoch": 0.58, + "grad_norm": 1.8154462049772704, + "learning_rate": 4.0517758775807135e-06, + "loss": 0.4979, + "step": 699 + }, + { + "epoch": 0.58, + "grad_norm": 1.890388895335948, + "learning_rate": 4.049210922646973e-06, + "loss": 0.5212, + "step": 700 + }, + { + "epoch": 0.58, + "grad_norm": 2.0215900504030166, + "learning_rate": 4.046643317601237e-06, + "loss": 0.5384, + "step": 701 + }, + { + "epoch": 0.58, + "grad_norm": 1.816997259900234, + "learning_rate": 4.0440730668357076e-06, + "loss": 0.492, + "step": 702 + }, + { + "epoch": 0.58, + "grad_norm": 1.968633766153865, + "learning_rate": 4.0415001747471036e-06, + "loss": 0.5917, + "step": 703 + }, + { + "epoch": 0.58, + "grad_norm": 1.8313487810801756, + "learning_rate": 4.0389246457366696e-06, + "loss": 0.5561, + "step": 704 + }, + { + "epoch": 0.58, + "grad_norm": 1.7954421155528784, + "learning_rate": 4.036346484210159e-06, + "loss": 0.5383, + "step": 705 + }, + { + "epoch": 0.59, + "grad_norm": 1.8517101217315919, + "learning_rate": 4.033765694577826e-06, + "loss": 0.5368, + "step": 706 + }, + { + "epoch": 0.59, + "grad_norm": 1.8888441616203875, + "learning_rate": 4.031182281254423e-06, + "loss": 0.5895, + "step": 707 + }, + { + "epoch": 0.59, + "grad_norm": 1.8131436351862782, + "learning_rate": 4.028596248659191e-06, + "loss": 0.5346, + "step": 708 + }, + { + "epoch": 0.59, + "grad_norm": 1.8803113487311214, + "learning_rate": 4.0260076012158486e-06, + "loss": 0.4987, + "step": 709 + }, + { + "epoch": 0.59, + "grad_norm": 1.8989122650791335, + "learning_rate": 4.023416343352589e-06, + "loss": 0.5007, + "step": 710 + }, + { + "epoch": 0.59, + "grad_norm": 1.9466291969735336, + "learning_rate": 4.020822479502074e-06, + "loss": 0.5868, + "step": 711 + }, + { + "epoch": 0.59, + "grad_norm": 1.869533367998661, + "learning_rate": 4.018226014101418e-06, + "loss": 0.5995, + "step": 712 + }, + { + "epoch": 0.59, + "grad_norm": 1.93738608926368, + "learning_rate": 4.015626951592187e-06, + "loss": 0.5625, + "step": 713 + }, + { + "epoch": 0.59, + "grad_norm": 1.8485080870897803, + "learning_rate": 4.013025296420394e-06, + "loss": 0.5585, + "step": 714 + }, + { + "epoch": 0.59, + "grad_norm": 1.8099669115387913, + "learning_rate": 4.010421053036481e-06, + "loss": 0.5384, + "step": 715 + }, + { + "epoch": 0.59, + "grad_norm": 1.8810123612010912, + "learning_rate": 4.007814225895321e-06, + "loss": 0.5589, + "step": 716 + }, + { + "epoch": 0.59, + "grad_norm": 1.8692823610937885, + "learning_rate": 4.005204819456205e-06, + "loss": 0.5474, + "step": 717 + }, + { + "epoch": 0.6, + "grad_norm": 1.8120887102918588, + "learning_rate": 4.00259283818284e-06, + "loss": 0.5138, + "step": 718 + }, + { + "epoch": 0.6, + "grad_norm": 1.7933926935301234, + "learning_rate": 3.999978286543331e-06, + "loss": 0.5235, + "step": 719 + }, + { + "epoch": 0.6, + "grad_norm": 1.8382360731306235, + "learning_rate": 3.997361169010187e-06, + "loss": 0.5846, + "step": 720 + }, + { + "epoch": 0.6, + "grad_norm": 1.993925306673069, + "learning_rate": 3.994741490060301e-06, + "loss": 0.5561, + "step": 721 + }, + { + "epoch": 0.6, + "grad_norm": 1.900088669959918, + "learning_rate": 3.9921192541749505e-06, + "loss": 0.5215, + "step": 722 + }, + { + "epoch": 0.6, + "grad_norm": 1.9250072769385074, + "learning_rate": 3.989494465839785e-06, + "loss": 0.54, + "step": 723 + }, + { + "epoch": 0.6, + "grad_norm": 1.7928905908766457, + "learning_rate": 3.986867129544822e-06, + "loss": 0.6066, + "step": 724 + }, + { + "epoch": 0.6, + "grad_norm": 1.9474900039545116, + "learning_rate": 3.984237249784437e-06, + "loss": 0.5173, + "step": 725 + }, + { + "epoch": 0.6, + "grad_norm": 1.9004077336349998, + "learning_rate": 3.981604831057357e-06, + "loss": 0.5409, + "step": 726 + }, + { + "epoch": 0.6, + "grad_norm": 1.7573843693188624, + "learning_rate": 3.97896987786665e-06, + "loss": 0.5239, + "step": 727 + }, + { + "epoch": 0.6, + "grad_norm": 1.899283660379949, + "learning_rate": 3.976332394719721e-06, + "loss": 0.4977, + "step": 728 + }, + { + "epoch": 0.6, + "grad_norm": 1.8353476568345033, + "learning_rate": 3.973692386128304e-06, + "loss": 0.5834, + "step": 729 + }, + { + "epoch": 0.61, + "grad_norm": 2.032325534167748, + "learning_rate": 3.971049856608451e-06, + "loss": 0.5343, + "step": 730 + }, + { + "epoch": 0.61, + "grad_norm": 1.8161347764383835, + "learning_rate": 3.9684048106805286e-06, + "loss": 0.585, + "step": 731 + }, + { + "epoch": 0.61, + "grad_norm": 1.836376388525165, + "learning_rate": 3.965757252869204e-06, + "loss": 0.5978, + "step": 732 + }, + { + "epoch": 0.61, + "grad_norm": 1.889118862096067, + "learning_rate": 3.963107187703446e-06, + "loss": 0.5393, + "step": 733 + }, + { + "epoch": 0.61, + "grad_norm": 1.7772829607776217, + "learning_rate": 3.96045461971651e-06, + "loss": 0.5164, + "step": 734 + }, + { + "epoch": 0.61, + "grad_norm": 1.7980410807492582, + "learning_rate": 3.957799553445932e-06, + "loss": 0.5455, + "step": 735 + }, + { + "epoch": 0.61, + "grad_norm": 1.907936099702467, + "learning_rate": 3.955141993433526e-06, + "loss": 0.532, + "step": 736 + }, + { + "epoch": 0.61, + "grad_norm": 1.8668064740862462, + "learning_rate": 3.9524819442253645e-06, + "loss": 0.5578, + "step": 737 + }, + { + "epoch": 0.61, + "grad_norm": 1.838952740378055, + "learning_rate": 3.949819410371785e-06, + "loss": 0.5784, + "step": 738 + }, + { + "epoch": 0.61, + "grad_norm": 1.9595767898211005, + "learning_rate": 3.947154396427373e-06, + "loss": 0.5213, + "step": 739 + }, + { + "epoch": 0.61, + "grad_norm": 1.9422968944070973, + "learning_rate": 3.944486906950954e-06, + "loss": 0.5709, + "step": 740 + }, + { + "epoch": 0.61, + "grad_norm": 1.760556693040696, + "learning_rate": 3.941816946505592e-06, + "loss": 0.5564, + "step": 741 + }, + { + "epoch": 0.62, + "grad_norm": 1.8054841879427592, + "learning_rate": 3.939144519658575e-06, + "loss": 0.5435, + "step": 742 + }, + { + "epoch": 0.62, + "grad_norm": 2.1072923992538, + "learning_rate": 3.936469630981412e-06, + "loss": 0.5622, + "step": 743 + }, + { + "epoch": 0.62, + "grad_norm": 1.711687978027928, + "learning_rate": 3.933792285049821e-06, + "loss": 0.5554, + "step": 744 + }, + { + "epoch": 0.62, + "grad_norm": 1.8166543944942228, + "learning_rate": 3.931112486443727e-06, + "loss": 0.5079, + "step": 745 + }, + { + "epoch": 0.62, + "grad_norm": 1.7923405334139695, + "learning_rate": 3.928430239747246e-06, + "loss": 0.5692, + "step": 746 + }, + { + "epoch": 0.62, + "grad_norm": 1.9611773239667012, + "learning_rate": 3.925745549548687e-06, + "loss": 0.5092, + "step": 747 + }, + { + "epoch": 0.62, + "grad_norm": 1.8440088039871827, + "learning_rate": 3.923058420440534e-06, + "loss": 0.5369, + "step": 748 + }, + { + "epoch": 0.62, + "grad_norm": 1.9272316571307881, + "learning_rate": 3.920368857019447e-06, + "loss": 0.5798, + "step": 749 + }, + { + "epoch": 0.62, + "grad_norm": 1.8248503445199376, + "learning_rate": 3.917676863886246e-06, + "loss": 0.5479, + "step": 750 + }, + { + "epoch": 0.62, + "grad_norm": 1.9200626612083824, + "learning_rate": 3.914982445645912e-06, + "loss": 0.549, + "step": 751 + }, + { + "epoch": 0.62, + "grad_norm": 1.8585556832275227, + "learning_rate": 3.91228560690757e-06, + "loss": 0.5283, + "step": 752 + }, + { + "epoch": 0.62, + "grad_norm": 1.819239895382093, + "learning_rate": 3.90958635228449e-06, + "loss": 0.535, + "step": 753 + }, + { + "epoch": 0.63, + "grad_norm": 1.7810389942543545, + "learning_rate": 3.90688468639407e-06, + "loss": 0.5125, + "step": 754 + }, + { + "epoch": 0.63, + "grad_norm": 1.9614453700373935, + "learning_rate": 3.904180613857837e-06, + "loss": 0.5406, + "step": 755 + }, + { + "epoch": 0.63, + "grad_norm": 1.805104940263808, + "learning_rate": 3.901474139301433e-06, + "loss": 0.5794, + "step": 756 + }, + { + "epoch": 0.63, + "grad_norm": 1.78756289235025, + "learning_rate": 3.898765267354607e-06, + "loss": 0.569, + "step": 757 + }, + { + "epoch": 0.63, + "grad_norm": 1.912300438003516, + "learning_rate": 3.896054002651213e-06, + "loss": 0.5565, + "step": 758 + }, + { + "epoch": 0.63, + "grad_norm": 1.8148356694353722, + "learning_rate": 3.893340349829195e-06, + "loss": 0.5471, + "step": 759 + }, + { + "epoch": 0.63, + "grad_norm": 1.6836223387492706, + "learning_rate": 3.890624313530583e-06, + "loss": 0.5145, + "step": 760 + }, + { + "epoch": 0.63, + "grad_norm": 1.8389298216964765, + "learning_rate": 3.887905898401485e-06, + "loss": 0.5441, + "step": 761 + }, + { + "epoch": 0.63, + "grad_norm": 1.7845754057436856, + "learning_rate": 3.885185109092078e-06, + "loss": 0.5478, + "step": 762 + }, + { + "epoch": 0.63, + "grad_norm": 1.77076035925993, + "learning_rate": 3.882461950256598e-06, + "loss": 0.5497, + "step": 763 + }, + { + "epoch": 0.63, + "grad_norm": 1.8011284465286703, + "learning_rate": 3.87973642655334e-06, + "loss": 0.5039, + "step": 764 + }, + { + "epoch": 0.63, + "grad_norm": 1.7400129481667248, + "learning_rate": 3.877008542644637e-06, + "loss": 0.5243, + "step": 765 + }, + { + "epoch": 0.64, + "grad_norm": 1.9899565111682327, + "learning_rate": 3.874278303196866e-06, + "loss": 0.5767, + "step": 766 + }, + { + "epoch": 0.64, + "grad_norm": 1.8345576263874734, + "learning_rate": 3.871545712880429e-06, + "loss": 0.5262, + "step": 767 + }, + { + "epoch": 0.64, + "grad_norm": 1.8375211207672395, + "learning_rate": 3.8688107763697505e-06, + "loss": 0.5467, + "step": 768 + }, + { + "epoch": 0.64, + "grad_norm": 1.8068462280574835, + "learning_rate": 3.8660734983432715e-06, + "loss": 0.5256, + "step": 769 + }, + { + "epoch": 0.64, + "grad_norm": 1.7823522202158735, + "learning_rate": 3.863333883483433e-06, + "loss": 0.5419, + "step": 770 + }, + { + "epoch": 0.64, + "grad_norm": 1.8881514180214427, + "learning_rate": 3.86059193647668e-06, + "loss": 0.541, + "step": 771 + }, + { + "epoch": 0.64, + "grad_norm": 1.8311064595650786, + "learning_rate": 3.85784766201344e-06, + "loss": 0.5455, + "step": 772 + }, + { + "epoch": 0.64, + "grad_norm": 1.9833459774866717, + "learning_rate": 3.855101064788126e-06, + "loss": 0.5723, + "step": 773 + }, + { + "epoch": 0.64, + "grad_norm": 1.7968096633022903, + "learning_rate": 3.852352149499125e-06, + "loss": 0.5153, + "step": 774 + }, + { + "epoch": 0.64, + "grad_norm": 1.775423895652992, + "learning_rate": 3.849600920848787e-06, + "loss": 0.5134, + "step": 775 + }, + { + "epoch": 0.64, + "grad_norm": 1.7262892998825556, + "learning_rate": 3.84684738354342e-06, + "loss": 0.5287, + "step": 776 + }, + { + "epoch": 0.64, + "grad_norm": 1.7866135638778051, + "learning_rate": 3.84409154229328e-06, + "loss": 0.57, + "step": 777 + }, + { + "epoch": 0.64, + "grad_norm": 1.787377916112687, + "learning_rate": 3.841333401812569e-06, + "loss": 0.5312, + "step": 778 + }, + { + "epoch": 0.65, + "grad_norm": 1.684801862246949, + "learning_rate": 3.838572966819416e-06, + "loss": 0.5822, + "step": 779 + }, + { + "epoch": 0.65, + "grad_norm": 1.79074773131748, + "learning_rate": 3.835810242035879e-06, + "loss": 0.5651, + "step": 780 + }, + { + "epoch": 0.65, + "grad_norm": 1.9234904827178134, + "learning_rate": 3.8330452321879305e-06, + "loss": 0.5527, + "step": 781 + }, + { + "epoch": 0.65, + "grad_norm": 2.1733402579018186, + "learning_rate": 3.830277942005455e-06, + "loss": 0.5545, + "step": 782 + }, + { + "epoch": 0.65, + "grad_norm": 2.112229504682016, + "learning_rate": 3.827508376222233e-06, + "loss": 0.5766, + "step": 783 + }, + { + "epoch": 0.65, + "grad_norm": 2.087174122744587, + "learning_rate": 3.824736539575944e-06, + "loss": 0.549, + "step": 784 + }, + { + "epoch": 0.65, + "grad_norm": 1.9570382810890106, + "learning_rate": 3.821962436808145e-06, + "loss": 0.4984, + "step": 785 + }, + { + "epoch": 0.65, + "grad_norm": 1.94720853153738, + "learning_rate": 3.819186072664277e-06, + "loss": 0.5303, + "step": 786 + }, + { + "epoch": 0.65, + "grad_norm": 2.21095404069362, + "learning_rate": 3.816407451893643e-06, + "loss": 0.5674, + "step": 787 + }, + { + "epoch": 0.65, + "grad_norm": 1.7284336698899117, + "learning_rate": 3.8136265792494094e-06, + "loss": 0.5952, + "step": 788 + }, + { + "epoch": 0.65, + "grad_norm": 1.940869697529687, + "learning_rate": 3.8108434594885934e-06, + "loss": 0.5198, + "step": 789 + }, + { + "epoch": 0.65, + "grad_norm": 1.9282749931884566, + "learning_rate": 3.808058097372057e-06, + "loss": 0.5499, + "step": 790 + }, + { + "epoch": 0.66, + "grad_norm": 2.0180195532646983, + "learning_rate": 3.8052704976644984e-06, + "loss": 0.5117, + "step": 791 + }, + { + "epoch": 0.66, + "grad_norm": 1.8303561179366206, + "learning_rate": 3.8024806651344424e-06, + "loss": 0.5034, + "step": 792 + }, + { + "epoch": 0.66, + "grad_norm": 2.0584295539484754, + "learning_rate": 3.7996886045542335e-06, + "loss": 0.5391, + "step": 793 + }, + { + "epoch": 0.66, + "grad_norm": 1.7736893833047733, + "learning_rate": 3.7968943207000284e-06, + "loss": 0.5378, + "step": 794 + }, + { + "epoch": 0.66, + "grad_norm": 1.7840353008162277, + "learning_rate": 3.794097818351786e-06, + "loss": 0.5091, + "step": 795 + }, + { + "epoch": 0.66, + "grad_norm": 2.0949100717616225, + "learning_rate": 3.791299102293261e-06, + "loss": 0.5731, + "step": 796 + }, + { + "epoch": 0.66, + "grad_norm": 2.048353193294094, + "learning_rate": 3.7884981773119943e-06, + "loss": 0.5576, + "step": 797 + }, + { + "epoch": 0.66, + "grad_norm": 1.9990070284918733, + "learning_rate": 3.7856950481993054e-06, + "loss": 0.5297, + "step": 798 + }, + { + "epoch": 0.66, + "grad_norm": 1.859560152641746, + "learning_rate": 3.7828897197502856e-06, + "loss": 0.5131, + "step": 799 + }, + { + "epoch": 0.66, + "grad_norm": 2.0054802770873916, + "learning_rate": 3.780082196763785e-06, + "loss": 0.5428, + "step": 800 + }, + { + "epoch": 0.66, + "grad_norm": 1.8985367093585213, + "learning_rate": 3.7772724840424126e-06, + "loss": 0.5206, + "step": 801 + }, + { + "epoch": 0.66, + "grad_norm": 1.9964704653764362, + "learning_rate": 3.774460586392519e-06, + "loss": 0.5929, + "step": 802 + }, + { + "epoch": 0.67, + "grad_norm": 1.7572936836574113, + "learning_rate": 3.771646508624194e-06, + "loss": 0.5428, + "step": 803 + }, + { + "epoch": 0.67, + "grad_norm": 1.9623695483620975, + "learning_rate": 3.768830255551258e-06, + "loss": 0.5685, + "step": 804 + }, + { + "epoch": 0.67, + "grad_norm": 1.9663290616402378, + "learning_rate": 3.76601183199125e-06, + "loss": 0.5351, + "step": 805 + }, + { + "epoch": 0.67, + "grad_norm": 1.7876590847889615, + "learning_rate": 3.763191242765424e-06, + "loss": 0.567, + "step": 806 + }, + { + "epoch": 0.67, + "grad_norm": 1.8500820456277005, + "learning_rate": 3.7603684926987383e-06, + "loss": 0.523, + "step": 807 + }, + { + "epoch": 0.67, + "grad_norm": 2.041973125533567, + "learning_rate": 3.757543586619845e-06, + "loss": 0.5531, + "step": 808 + }, + { + "epoch": 0.67, + "grad_norm": 1.7440376746222928, + "learning_rate": 3.754716529361089e-06, + "loss": 0.4913, + "step": 809 + }, + { + "epoch": 0.67, + "grad_norm": 1.7910937306897654, + "learning_rate": 3.7518873257584897e-06, + "loss": 0.5128, + "step": 810 + }, + { + "epoch": 0.67, + "grad_norm": 1.9334392608388238, + "learning_rate": 3.7490559806517434e-06, + "loss": 0.5861, + "step": 811 + }, + { + "epoch": 0.67, + "grad_norm": 2.0003597857127673, + "learning_rate": 3.746222498884206e-06, + "loss": 0.5535, + "step": 812 + }, + { + "epoch": 0.67, + "grad_norm": 1.7964615198133413, + "learning_rate": 3.74338688530289e-06, + "loss": 0.5409, + "step": 813 + }, + { + "epoch": 0.67, + "grad_norm": 1.7726488990007383, + "learning_rate": 3.740549144758453e-06, + "loss": 0.5714, + "step": 814 + }, + { + "epoch": 0.68, + "grad_norm": 1.9080323144095523, + "learning_rate": 3.737709282105193e-06, + "loss": 0.5534, + "step": 815 + }, + { + "epoch": 0.68, + "grad_norm": 1.9612361354867969, + "learning_rate": 3.734867302201038e-06, + "loss": 0.5282, + "step": 816 + }, + { + "epoch": 0.68, + "grad_norm": 1.873254058551618, + "learning_rate": 3.7320232099075363e-06, + "loss": 0.5422, + "step": 817 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383882069199007, + "learning_rate": 3.7291770100898508e-06, + "loss": 0.5588, + "step": 818 + }, + { + "epoch": 0.68, + "grad_norm": 2.0137053963220835, + "learning_rate": 3.726328707616749e-06, + "loss": 0.5895, + "step": 819 + }, + { + "epoch": 0.68, + "grad_norm": 1.8207549211692964, + "learning_rate": 3.7234783073605957e-06, + "loss": 0.5428, + "step": 820 + }, + { + "epoch": 0.68, + "grad_norm": 1.7929761418069659, + "learning_rate": 3.7206258141973445e-06, + "loss": 0.555, + "step": 821 + }, + { + "epoch": 0.68, + "grad_norm": 1.8863691259545465, + "learning_rate": 3.7177712330065285e-06, + "loss": 0.5802, + "step": 822 + }, + { + "epoch": 0.68, + "grad_norm": 1.8383911000943605, + "learning_rate": 3.714914568671252e-06, + "loss": 0.4986, + "step": 823 + }, + { + "epoch": 0.68, + "grad_norm": 2.0032777947804044, + "learning_rate": 3.7120558260781846e-06, + "loss": 0.6456, + "step": 824 + }, + { + "epoch": 0.68, + "grad_norm": 1.733320874844507, + "learning_rate": 3.709195010117551e-06, + "loss": 0.5146, + "step": 825 + }, + { + "epoch": 0.68, + "grad_norm": 1.7411187007421471, + "learning_rate": 3.7063321256831193e-06, + "loss": 0.5297, + "step": 826 + }, + { + "epoch": 0.69, + "grad_norm": 1.8334107493901353, + "learning_rate": 3.7034671776722003e-06, + "loss": 0.545, + "step": 827 + }, + { + "epoch": 0.69, + "grad_norm": 1.931467221651553, + "learning_rate": 3.7006001709856314e-06, + "loss": 0.579, + "step": 828 + }, + { + "epoch": 0.69, + "grad_norm": 1.799522216655623, + "learning_rate": 3.697731110527774e-06, + "loss": 0.5453, + "step": 829 + }, + { + "epoch": 0.69, + "grad_norm": 1.8098119388805842, + "learning_rate": 3.6948600012065016e-06, + "loss": 0.5186, + "step": 830 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419013342395714, + "learning_rate": 3.6919868479331934e-06, + "loss": 0.4833, + "step": 831 + }, + { + "epoch": 0.69, + "grad_norm": 1.8419148322752323, + "learning_rate": 3.6891116556227234e-06, + "loss": 0.5479, + "step": 832 + }, + { + "epoch": 0.69, + "grad_norm": 1.7858200344474908, + "learning_rate": 3.6862344291934545e-06, + "loss": 0.5264, + "step": 833 + }, + { + "epoch": 0.69, + "grad_norm": 1.8057437623830686, + "learning_rate": 3.6833551735672293e-06, + "loss": 0.5208, + "step": 834 + }, + { + "epoch": 0.69, + "grad_norm": 1.8570584000334132, + "learning_rate": 3.6804738936693617e-06, + "loss": 0.5652, + "step": 835 + }, + { + "epoch": 0.69, + "grad_norm": 1.7961732805960369, + "learning_rate": 3.677590594428629e-06, + "loss": 0.5693, + "step": 836 + }, + { + "epoch": 0.69, + "grad_norm": 1.954108513879844, + "learning_rate": 3.6747052807772614e-06, + "loss": 0.5673, + "step": 837 + }, + { + "epoch": 0.69, + "grad_norm": 1.834152772161213, + "learning_rate": 3.671817957650936e-06, + "loss": 0.5118, + "step": 838 + }, + { + "epoch": 0.7, + "grad_norm": 1.8035026424969205, + "learning_rate": 3.6689286299887663e-06, + "loss": 0.5778, + "step": 839 + }, + { + "epoch": 0.7, + "grad_norm": 1.7862771700309947, + "learning_rate": 3.666037302733295e-06, + "loss": 0.5575, + "step": 840 + }, + { + "epoch": 0.7, + "grad_norm": 1.7398650592861555, + "learning_rate": 3.6631439808304874e-06, + "loss": 0.5323, + "step": 841 + }, + { + "epoch": 0.7, + "grad_norm": 1.7082885736006344, + "learning_rate": 3.6602486692297183e-06, + "loss": 0.543, + "step": 842 + }, + { + "epoch": 0.7, + "grad_norm": 1.8242434568233548, + "learning_rate": 3.6573513728837685e-06, + "loss": 0.5579, + "step": 843 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305967806472925, + "learning_rate": 3.6544520967488108e-06, + "loss": 0.5425, + "step": 844 + }, + { + "epoch": 0.7, + "grad_norm": 1.7126995402462595, + "learning_rate": 3.651550845784407e-06, + "loss": 0.5399, + "step": 845 + }, + { + "epoch": 0.7, + "grad_norm": 1.992190051239983, + "learning_rate": 3.648647624953496e-06, + "loss": 0.5951, + "step": 846 + }, + { + "epoch": 0.7, + "grad_norm": 1.9362402903409848, + "learning_rate": 3.6457424392223885e-06, + "loss": 0.5427, + "step": 847 + }, + { + "epoch": 0.7, + "grad_norm": 1.7390586845081806, + "learning_rate": 3.642835293560754e-06, + "loss": 0.5269, + "step": 848 + }, + { + "epoch": 0.7, + "grad_norm": 1.8601747321693383, + "learning_rate": 3.639926192941615e-06, + "loss": 0.5246, + "step": 849 + }, + { + "epoch": 0.7, + "grad_norm": 1.8305054240762129, + "learning_rate": 3.6370151423413396e-06, + "loss": 0.562, + "step": 850 + }, + { + "epoch": 0.71, + "grad_norm": 1.8361711553327809, + "learning_rate": 3.6341021467396296e-06, + "loss": 0.5066, + "step": 851 + }, + { + "epoch": 0.71, + "grad_norm": 1.9202617492772214, + "learning_rate": 3.6311872111195163e-06, + "loss": 0.5755, + "step": 852 + }, + { + "epoch": 0.71, + "grad_norm": 1.9056266366653432, + "learning_rate": 3.628270340467348e-06, + "loss": 0.5193, + "step": 853 + }, + { + "epoch": 0.71, + "grad_norm": 1.9700971504271882, + "learning_rate": 3.625351539772783e-06, + "loss": 0.5499, + "step": 854 + }, + { + "epoch": 0.71, + "grad_norm": 1.7142305580780086, + "learning_rate": 3.6224308140287818e-06, + "loss": 0.5597, + "step": 855 + }, + { + "epoch": 0.71, + "grad_norm": 1.7897876492593174, + "learning_rate": 3.6195081682315972e-06, + "loss": 0.5347, + "step": 856 + }, + { + "epoch": 0.71, + "grad_norm": 2.191923699092432, + "learning_rate": 3.616583607380769e-06, + "loss": 0.5251, + "step": 857 + }, + { + "epoch": 0.71, + "grad_norm": 1.8582876176666503, + "learning_rate": 3.61365713647911e-06, + "loss": 0.5067, + "step": 858 + }, + { + "epoch": 0.71, + "grad_norm": 1.991617360171558, + "learning_rate": 3.610728760532701e-06, + "loss": 0.6464, + "step": 859 + }, + { + "epoch": 0.71, + "grad_norm": 1.892621069660817, + "learning_rate": 3.607798484550881e-06, + "loss": 0.5145, + "step": 860 + }, + { + "epoch": 0.71, + "grad_norm": 1.7592963181570629, + "learning_rate": 3.6048663135462423e-06, + "loss": 0.5297, + "step": 861 + }, + { + "epoch": 0.71, + "grad_norm": 2.020192040751123, + "learning_rate": 3.6019322525346157e-06, + "loss": 0.5709, + "step": 862 + }, + { + "epoch": 0.72, + "grad_norm": 1.8575959680616767, + "learning_rate": 3.598996306535067e-06, + "loss": 0.5946, + "step": 863 + }, + { + "epoch": 0.72, + "grad_norm": 1.9638758131071599, + "learning_rate": 3.5960584805698845e-06, + "loss": 0.4833, + "step": 864 + }, + { + "epoch": 0.72, + "grad_norm": 1.7517341191956926, + "learning_rate": 3.593118779664574e-06, + "loss": 0.5439, + "step": 865 + }, + { + "epoch": 0.72, + "grad_norm": 1.7637144330636925, + "learning_rate": 3.590177208847848e-06, + "loss": 0.4898, + "step": 866 + }, + { + "epoch": 0.72, + "grad_norm": 2.107899096934758, + "learning_rate": 3.5872337731516186e-06, + "loss": 0.5332, + "step": 867 + }, + { + "epoch": 0.72, + "grad_norm": 2.016493645108941, + "learning_rate": 3.5842884776109875e-06, + "loss": 0.5313, + "step": 868 + }, + { + "epoch": 0.72, + "grad_norm": 1.8758602544873038, + "learning_rate": 3.581341327264236e-06, + "loss": 0.554, + "step": 869 + }, + { + "epoch": 0.72, + "grad_norm": 1.8566881639083022, + "learning_rate": 3.5783923271528222e-06, + "loss": 0.5322, + "step": 870 + }, + { + "epoch": 0.72, + "grad_norm": 1.9151838907738468, + "learning_rate": 3.5754414823213647e-06, + "loss": 0.5306, + "step": 871 + }, + { + "epoch": 0.72, + "grad_norm": 1.7893407766785276, + "learning_rate": 3.572488797817639e-06, + "loss": 0.5226, + "step": 872 + }, + { + "epoch": 0.72, + "grad_norm": 1.908122661974681, + "learning_rate": 3.569534278692569e-06, + "loss": 0.5132, + "step": 873 + }, + { + "epoch": 0.72, + "grad_norm": 1.9052513037253582, + "learning_rate": 3.5665779300002144e-06, + "loss": 0.513, + "step": 874 + }, + { + "epoch": 0.73, + "grad_norm": 1.7876914527016339, + "learning_rate": 3.563619756797767e-06, + "loss": 0.5627, + "step": 875 + }, + { + "epoch": 0.73, + "grad_norm": 1.9607045801516068, + "learning_rate": 3.5606597641455387e-06, + "loss": 0.4986, + "step": 876 + }, + { + "epoch": 0.73, + "grad_norm": 1.701462749441997, + "learning_rate": 3.5576979571069527e-06, + "loss": 0.5306, + "step": 877 + }, + { + "epoch": 0.73, + "grad_norm": 1.8413701238351416, + "learning_rate": 3.554734340748538e-06, + "loss": 0.5602, + "step": 878 + }, + { + "epoch": 0.73, + "grad_norm": 1.8762306249541667, + "learning_rate": 3.5517689201399162e-06, + "loss": 0.5663, + "step": 879 + }, + { + "epoch": 0.73, + "grad_norm": 1.833164968453507, + "learning_rate": 3.5488017003537977e-06, + "loss": 0.5264, + "step": 880 + }, + { + "epoch": 0.73, + "grad_norm": 1.766302763247428, + "learning_rate": 3.5458326864659687e-06, + "loss": 0.5498, + "step": 881 + }, + { + "epoch": 0.73, + "grad_norm": 1.821883208129187, + "learning_rate": 3.5428618835552867e-06, + "loss": 0.5468, + "step": 882 + }, + { + "epoch": 0.73, + "grad_norm": 1.7773758034614335, + "learning_rate": 3.5398892967036674e-06, + "loss": 0.505, + "step": 883 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248820711070537, + "learning_rate": 3.5369149309960783e-06, + "loss": 0.5679, + "step": 884 + }, + { + "epoch": 0.73, + "grad_norm": 1.8248114104788378, + "learning_rate": 3.5339387915205305e-06, + "loss": 0.5351, + "step": 885 + }, + { + "epoch": 0.73, + "grad_norm": 2.00472132505421, + "learning_rate": 3.53096088336807e-06, + "loss": 0.5637, + "step": 886 + }, + { + "epoch": 0.74, + "grad_norm": 2.0594957277906656, + "learning_rate": 3.5279812116327667e-06, + "loss": 0.567, + "step": 887 + }, + { + "epoch": 0.74, + "grad_norm": 1.916227169502353, + "learning_rate": 3.5249997814117098e-06, + "loss": 0.5733, + "step": 888 + }, + { + "epoch": 0.74, + "grad_norm": 1.7595020268824906, + "learning_rate": 3.5220165978049937e-06, + "loss": 0.5512, + "step": 889 + }, + { + "epoch": 0.74, + "grad_norm": 1.8259487385184114, + "learning_rate": 3.5190316659157126e-06, + "loss": 0.5332, + "step": 890 + }, + { + "epoch": 0.74, + "grad_norm": 1.8216813752485344, + "learning_rate": 3.5160449908499538e-06, + "loss": 0.5718, + "step": 891 + }, + { + "epoch": 0.74, + "grad_norm": 1.8497964997952454, + "learning_rate": 3.5130565777167845e-06, + "loss": 0.5179, + "step": 892 + }, + { + "epoch": 0.74, + "grad_norm": 1.8242356367817554, + "learning_rate": 3.5100664316282464e-06, + "loss": 0.5587, + "step": 893 + }, + { + "epoch": 0.74, + "grad_norm": 1.7793507179190546, + "learning_rate": 3.5070745576993428e-06, + "loss": 0.5924, + "step": 894 + }, + { + "epoch": 0.74, + "grad_norm": 1.920176905610262, + "learning_rate": 3.5040809610480364e-06, + "loss": 0.5579, + "step": 895 + }, + { + "epoch": 0.74, + "grad_norm": 1.954421523744336, + "learning_rate": 3.5010856467952335e-06, + "loss": 0.5496, + "step": 896 + }, + { + "epoch": 0.74, + "grad_norm": 1.7785169911731862, + "learning_rate": 3.4980886200647817e-06, + "loss": 0.5383, + "step": 897 + }, + { + "epoch": 0.74, + "grad_norm": 1.853827977546151, + "learning_rate": 3.4950898859834555e-06, + "loss": 0.5501, + "step": 898 + }, + { + "epoch": 0.75, + "grad_norm": 1.9882198198152168, + "learning_rate": 3.4920894496809515e-06, + "loss": 0.5557, + "step": 899 + }, + { + "epoch": 0.75, + "grad_norm": 1.98090605107646, + "learning_rate": 3.489087316289877e-06, + "loss": 0.5661, + "step": 900 + }, + { + "epoch": 0.75, + "grad_norm": 2.0027723691714785, + "learning_rate": 3.486083490945743e-06, + "loss": 0.4791, + "step": 901 + }, + { + "epoch": 0.75, + "grad_norm": 2.0183911897675015, + "learning_rate": 3.4830779787869555e-06, + "loss": 0.5386, + "step": 902 + }, + { + "epoch": 0.75, + "grad_norm": 1.9385976919386894, + "learning_rate": 3.480070784954805e-06, + "loss": 0.5351, + "step": 903 + }, + { + "epoch": 0.75, + "grad_norm": 1.7612550957325825, + "learning_rate": 3.4770619145934586e-06, + "loss": 0.511, + "step": 904 + }, + { + "epoch": 0.75, + "grad_norm": 1.8677538420589843, + "learning_rate": 3.4740513728499515e-06, + "loss": 0.5942, + "step": 905 + }, + { + "epoch": 0.75, + "grad_norm": 1.9208446249900946, + "learning_rate": 3.4710391648741787e-06, + "loss": 0.5146, + "step": 906 + }, + { + "epoch": 0.75, + "grad_norm": 1.8008673055527855, + "learning_rate": 3.468025295818885e-06, + "loss": 0.5909, + "step": 907 + }, + { + "epoch": 0.75, + "grad_norm": 1.891052390507894, + "learning_rate": 3.465009770839657e-06, + "loss": 0.5527, + "step": 908 + }, + { + "epoch": 0.75, + "grad_norm": 2.0521048489395435, + "learning_rate": 3.4619925950949126e-06, + "loss": 0.5756, + "step": 909 + }, + { + "epoch": 0.75, + "grad_norm": 2.003295441830653, + "learning_rate": 3.4589737737458946e-06, + "loss": 0.5299, + "step": 910 + }, + { + "epoch": 0.76, + "grad_norm": 1.7635851435542724, + "learning_rate": 3.4559533119566612e-06, + "loss": 0.5338, + "step": 911 + }, + { + "epoch": 0.76, + "grad_norm": 1.834326490517508, + "learning_rate": 3.4529312148940763e-06, + "loss": 0.56, + "step": 912 + }, + { + "epoch": 0.76, + "grad_norm": 1.8618427761057224, + "learning_rate": 3.4499074877278016e-06, + "loss": 0.5189, + "step": 913 + }, + { + "epoch": 0.76, + "grad_norm": 2.04459004844406, + "learning_rate": 3.446882135630286e-06, + "loss": 0.5765, + "step": 914 + }, + { + "epoch": 0.76, + "grad_norm": 1.7467595732765806, + "learning_rate": 3.4438551637767604e-06, + "loss": 0.5512, + "step": 915 + }, + { + "epoch": 0.76, + "grad_norm": 1.7952035114217406, + "learning_rate": 3.4408265773452226e-06, + "loss": 0.5348, + "step": 916 + }, + { + "epoch": 0.76, + "grad_norm": 1.8448198186244822, + "learning_rate": 3.4377963815164362e-06, + "loss": 0.5187, + "step": 917 + }, + { + "epoch": 0.76, + "grad_norm": 1.7738820116169103, + "learning_rate": 3.4347645814739156e-06, + "loss": 0.507, + "step": 918 + }, + { + "epoch": 0.76, + "grad_norm": 1.9699054774415494, + "learning_rate": 3.4317311824039216e-06, + "loss": 0.5175, + "step": 919 + }, + { + "epoch": 0.76, + "grad_norm": 1.7482905457169124, + "learning_rate": 3.4286961894954473e-06, + "loss": 0.5188, + "step": 920 + }, + { + "epoch": 0.76, + "grad_norm": 1.8012194296110113, + "learning_rate": 3.425659607940215e-06, + "loss": 0.5465, + "step": 921 + }, + { + "epoch": 0.76, + "grad_norm": 1.7978097428012587, + "learning_rate": 3.422621442932662e-06, + "loss": 0.5257, + "step": 922 + }, + { + "epoch": 0.77, + "grad_norm": 1.8534167116514217, + "learning_rate": 3.419581699669937e-06, + "loss": 0.536, + "step": 923 + }, + { + "epoch": 0.77, + "grad_norm": 1.7733377878036733, + "learning_rate": 3.416540383351888e-06, + "loss": 0.5632, + "step": 924 + }, + { + "epoch": 0.77, + "grad_norm": 1.8124786776539388, + "learning_rate": 3.4134974991810503e-06, + "loss": 0.5471, + "step": 925 + }, + { + "epoch": 0.77, + "grad_norm": 1.8553271859579439, + "learning_rate": 3.4104530523626463e-06, + "loss": 0.538, + "step": 926 + }, + { + "epoch": 0.77, + "grad_norm": 1.8888926038913822, + "learning_rate": 3.4074070481045683e-06, + "loss": 0.4868, + "step": 927 + }, + { + "epoch": 0.77, + "grad_norm": 2.0158609319355505, + "learning_rate": 3.404359491617374e-06, + "loss": 0.5757, + "step": 928 + }, + { + "epoch": 0.77, + "grad_norm": 1.8376639720078027, + "learning_rate": 3.401310388114276e-06, + "loss": 0.5377, + "step": 929 + }, + { + "epoch": 0.77, + "grad_norm": 2.3651883595335232, + "learning_rate": 3.3982597428111336e-06, + "loss": 0.5536, + "step": 930 + }, + { + "epoch": 0.77, + "grad_norm": 1.908409388949023, + "learning_rate": 3.3952075609264423e-06, + "loss": 0.5349, + "step": 931 + }, + { + "epoch": 0.77, + "grad_norm": 1.8261622890952995, + "learning_rate": 3.3921538476813278e-06, + "loss": 0.4991, + "step": 932 + }, + { + "epoch": 0.77, + "grad_norm": 1.924034720876031, + "learning_rate": 3.3890986082995353e-06, + "loss": 0.536, + "step": 933 + }, + { + "epoch": 0.77, + "grad_norm": 1.829615974230478, + "learning_rate": 3.3860418480074188e-06, + "loss": 0.5163, + "step": 934 + }, + { + "epoch": 0.78, + "grad_norm": 1.7812992854973535, + "learning_rate": 3.3829835720339353e-06, + "loss": 0.5412, + "step": 935 + }, + { + "epoch": 0.78, + "grad_norm": 1.8270515542068861, + "learning_rate": 3.3799237856106348e-06, + "loss": 0.5459, + "step": 936 + }, + { + "epoch": 0.78, + "grad_norm": 1.8336967909163833, + "learning_rate": 3.3768624939716506e-06, + "loss": 0.5074, + "step": 937 + }, + { + "epoch": 0.78, + "grad_norm": 1.773892866992307, + "learning_rate": 3.373799702353691e-06, + "loss": 0.5457, + "step": 938 + }, + { + "epoch": 0.78, + "grad_norm": 1.8605607499004266, + "learning_rate": 3.370735415996031e-06, + "loss": 0.5691, + "step": 939 + }, + { + "epoch": 0.78, + "grad_norm": 1.7961529805945686, + "learning_rate": 3.3676696401405007e-06, + "loss": 0.5406, + "step": 940 + }, + { + "epoch": 0.78, + "grad_norm": 1.7406787561376078, + "learning_rate": 3.3646023800314792e-06, + "loss": 0.5297, + "step": 941 + }, + { + "epoch": 0.78, + "grad_norm": 1.9794693468141764, + "learning_rate": 3.361533640915885e-06, + "loss": 0.4765, + "step": 942 + }, + { + "epoch": 0.78, + "grad_norm": 1.820632707720892, + "learning_rate": 3.3584634280431657e-06, + "loss": 0.5395, + "step": 943 + }, + { + "epoch": 0.78, + "grad_norm": 1.8478126164835549, + "learning_rate": 3.3553917466652915e-06, + "loss": 0.5288, + "step": 944 + }, + { + "epoch": 0.78, + "grad_norm": 1.749509825583459, + "learning_rate": 3.352318602036742e-06, + "loss": 0.5343, + "step": 945 + }, + { + "epoch": 0.78, + "grad_norm": 1.8034305951190157, + "learning_rate": 3.3492439994145033e-06, + "loss": 0.5536, + "step": 946 + }, + { + "epoch": 0.79, + "grad_norm": 1.8172591817519397, + "learning_rate": 3.346167944058052e-06, + "loss": 0.5844, + "step": 947 + }, + { + "epoch": 0.79, + "grad_norm": 1.749562414198837, + "learning_rate": 3.3430904412293526e-06, + "loss": 0.4833, + "step": 948 + }, + { + "epoch": 0.79, + "grad_norm": 1.7243742428927225, + "learning_rate": 3.3400114961928444e-06, + "loss": 0.4828, + "step": 949 + }, + { + "epoch": 0.79, + "grad_norm": 1.757242299744874, + "learning_rate": 3.3369311142154337e-06, + "loss": 0.5282, + "step": 950 + }, + { + "epoch": 0.79, + "grad_norm": 2.036302581700697, + "learning_rate": 3.3338493005664853e-06, + "loss": 0.5315, + "step": 951 + }, + { + "epoch": 0.79, + "grad_norm": 1.886299636672335, + "learning_rate": 3.330766060517812e-06, + "loss": 0.5244, + "step": 952 + }, + { + "epoch": 0.79, + "grad_norm": 1.898853787733011, + "learning_rate": 3.3276813993436695e-06, + "loss": 0.5914, + "step": 953 + }, + { + "epoch": 0.79, + "grad_norm": 1.8359472984671243, + "learning_rate": 3.324595322320741e-06, + "loss": 0.5488, + "step": 954 + }, + { + "epoch": 0.79, + "grad_norm": 1.8768955168510497, + "learning_rate": 3.321507834728134e-06, + "loss": 0.5871, + "step": 955 + }, + { + "epoch": 0.79, + "grad_norm": 1.8358033818112791, + "learning_rate": 3.3184189418473674e-06, + "loss": 0.5632, + "step": 956 + }, + { + "epoch": 0.79, + "grad_norm": 1.792562502385941, + "learning_rate": 3.315328648962364e-06, + "loss": 0.4887, + "step": 957 + }, + { + "epoch": 0.79, + "grad_norm": 1.8732702930932368, + "learning_rate": 3.312236961359444e-06, + "loss": 0.5313, + "step": 958 + }, + { + "epoch": 0.8, + "grad_norm": 1.7708047128885986, + "learning_rate": 3.3091438843273115e-06, + "loss": 0.5348, + "step": 959 + }, + { + "epoch": 0.8, + "grad_norm": 1.9094434763935804, + "learning_rate": 3.3060494231570463e-06, + "loss": 0.5027, + "step": 960 + }, + { + "epoch": 0.8, + "grad_norm": 1.87927564418864, + "learning_rate": 3.3029535831420977e-06, + "loss": 0.511, + "step": 961 + }, + { + "epoch": 0.8, + "grad_norm": 1.717365559903535, + "learning_rate": 3.299856369578273e-06, + "loss": 0.5203, + "step": 962 + }, + { + "epoch": 0.8, + "grad_norm": 1.770779257052532, + "learning_rate": 3.2967577877637296e-06, + "loss": 0.5233, + "step": 963 + }, + { + "epoch": 0.8, + "grad_norm": 1.7541392466004568, + "learning_rate": 3.2936578429989653e-06, + "loss": 0.5013, + "step": 964 + }, + { + "epoch": 0.8, + "grad_norm": 1.7840578280891832, + "learning_rate": 3.290556540586809e-06, + "loss": 0.4844, + "step": 965 + }, + { + "epoch": 0.8, + "grad_norm": 1.7184305413001233, + "learning_rate": 3.287453885832413e-06, + "loss": 0.4694, + "step": 966 + }, + { + "epoch": 0.8, + "grad_norm": 1.8671517036325307, + "learning_rate": 3.2843498840432403e-06, + "loss": 0.4652, + "step": 967 + }, + { + "epoch": 0.8, + "grad_norm": 1.9960847871768508, + "learning_rate": 3.2812445405290612e-06, + "loss": 0.5906, + "step": 968 + }, + { + "epoch": 0.8, + "grad_norm": 1.7535227575839891, + "learning_rate": 3.27813786060194e-06, + "loss": 0.5482, + "step": 969 + }, + { + "epoch": 0.8, + "grad_norm": 1.929231862440999, + "learning_rate": 3.2750298495762278e-06, + "loss": 0.5334, + "step": 970 + }, + { + "epoch": 0.8, + "grad_norm": 1.7879676366114814, + "learning_rate": 3.2719205127685505e-06, + "loss": 0.515, + "step": 971 + }, + { + "epoch": 0.81, + "grad_norm": 1.7817120865072218, + "learning_rate": 3.2688098554978053e-06, + "loss": 0.5045, + "step": 972 + }, + { + "epoch": 0.81, + "grad_norm": 1.8725673808714274, + "learning_rate": 3.265697883085145e-06, + "loss": 0.5557, + "step": 973 + }, + { + "epoch": 0.81, + "grad_norm": 1.8554796275037901, + "learning_rate": 3.262584600853973e-06, + "loss": 0.5785, + "step": 974 + }, + { + "epoch": 0.81, + "grad_norm": 1.77078783324655, + "learning_rate": 3.259470014129936e-06, + "loss": 0.524, + "step": 975 + }, + { + "epoch": 0.81, + "grad_norm": 1.820843626030818, + "learning_rate": 3.256354128240907e-06, + "loss": 0.5144, + "step": 976 + }, + { + "epoch": 0.81, + "grad_norm": 1.9330495063889956, + "learning_rate": 3.253236948516987e-06, + "loss": 0.5405, + "step": 977 + }, + { + "epoch": 0.81, + "grad_norm": 1.9113413794485425, + "learning_rate": 3.2501184802904867e-06, + "loss": 0.5212, + "step": 978 + }, + { + "epoch": 0.81, + "grad_norm": 1.799188386703558, + "learning_rate": 3.2469987288959208e-06, + "loss": 0.5148, + "step": 979 + }, + { + "epoch": 0.81, + "grad_norm": 1.8610914183588203, + "learning_rate": 3.2438776996700023e-06, + "loss": 0.5363, + "step": 980 + }, + { + "epoch": 0.81, + "grad_norm": 1.8245263524947073, + "learning_rate": 3.240755397951625e-06, + "loss": 0.5216, + "step": 981 + }, + { + "epoch": 0.81, + "grad_norm": 1.7863270641417597, + "learning_rate": 3.2376318290818643e-06, + "loss": 0.5581, + "step": 982 + }, + { + "epoch": 0.81, + "grad_norm": 1.9266115141469626, + "learning_rate": 3.23450699840396e-06, + "loss": 0.5178, + "step": 983 + }, + { + "epoch": 0.82, + "grad_norm": 1.8044458399187253, + "learning_rate": 3.2313809112633133e-06, + "loss": 0.5252, + "step": 984 + }, + { + "epoch": 0.82, + "grad_norm": 1.8809392949423562, + "learning_rate": 3.2282535730074714e-06, + "loss": 0.486, + "step": 985 + }, + { + "epoch": 0.82, + "grad_norm": 1.9487997548787144, + "learning_rate": 3.2251249889861237e-06, + "loss": 0.5272, + "step": 986 + }, + { + "epoch": 0.82, + "grad_norm": 2.088279538426057, + "learning_rate": 3.2219951645510907e-06, + "loss": 0.5426, + "step": 987 + }, + { + "epoch": 0.82, + "grad_norm": 1.8280370745964312, + "learning_rate": 3.218864105056313e-06, + "loss": 0.5545, + "step": 988 + }, + { + "epoch": 0.82, + "grad_norm": 1.7678201455723743, + "learning_rate": 3.2157318158578473e-06, + "loss": 0.5476, + "step": 989 + }, + { + "epoch": 0.82, + "grad_norm": 1.708170466024094, + "learning_rate": 3.21259830231385e-06, + "loss": 0.5442, + "step": 990 + }, + { + "epoch": 0.82, + "grad_norm": 2.0427224573251483, + "learning_rate": 3.209463569784575e-06, + "loss": 0.5501, + "step": 991 + }, + { + "epoch": 0.82, + "grad_norm": 1.8557413526282036, + "learning_rate": 3.206327623632359e-06, + "loss": 0.5573, + "step": 992 + }, + { + "epoch": 0.82, + "grad_norm": 1.7138810851622357, + "learning_rate": 3.2031904692216153e-06, + "loss": 0.5267, + "step": 993 + }, + { + "epoch": 0.82, + "grad_norm": 1.9034028799031073, + "learning_rate": 3.2000521119188267e-06, + "loss": 0.5605, + "step": 994 + }, + { + "epoch": 0.82, + "grad_norm": 1.994571492675121, + "learning_rate": 3.1969125570925303e-06, + "loss": 0.53, + "step": 995 + }, + { + "epoch": 0.83, + "grad_norm": 1.771581881704634, + "learning_rate": 3.193771810113313e-06, + "loss": 0.6177, + "step": 996 + }, + { + "epoch": 0.83, + "grad_norm": 1.7808220445921694, + "learning_rate": 3.1906298763538005e-06, + "loss": 0.5215, + "step": 997 + }, + { + "epoch": 0.83, + "grad_norm": 1.8069794706642701, + "learning_rate": 3.1874867611886513e-06, + "loss": 0.5444, + "step": 998 + }, + { + "epoch": 0.83, + "grad_norm": 1.7806867210889854, + "learning_rate": 3.1843424699945403e-06, + "loss": 0.5471, + "step": 999 + }, + { + "epoch": 0.83, + "grad_norm": 1.7481554024627886, + "learning_rate": 3.1811970081501576e-06, + "loss": 0.5159, + "step": 1000 + }, + { + "epoch": 0.83, + "grad_norm": 1.8105318680671914, + "learning_rate": 3.1780503810361946e-06, + "loss": 0.4985, + "step": 1001 + }, + { + "epoch": 0.83, + "grad_norm": 1.7033701950072382, + "learning_rate": 3.1749025940353363e-06, + "loss": 0.5594, + "step": 1002 + }, + { + "epoch": 0.83, + "grad_norm": 2.3799847532384515, + "learning_rate": 3.1717536525322512e-06, + "loss": 0.5978, + "step": 1003 + }, + { + "epoch": 0.83, + "grad_norm": 1.7427559432173463, + "learning_rate": 3.1686035619135845e-06, + "loss": 0.5299, + "step": 1004 + }, + { + "epoch": 0.83, + "grad_norm": 1.7454547855925509, + "learning_rate": 3.1654523275679453e-06, + "loss": 0.5439, + "step": 1005 + }, + { + "epoch": 0.83, + "grad_norm": 1.7130931472340127, + "learning_rate": 3.162299954885899e-06, + "loss": 0.5379, + "step": 1006 + }, + { + "epoch": 0.83, + "grad_norm": 1.6940357366272063, + "learning_rate": 3.15914644925996e-06, + "loss": 0.5694, + "step": 1007 + }, + { + "epoch": 0.84, + "grad_norm": 1.8544220651543013, + "learning_rate": 3.1559918160845787e-06, + "loss": 0.5285, + "step": 1008 + }, + { + "epoch": 0.84, + "grad_norm": 1.8481774433371347, + "learning_rate": 3.1528360607561358e-06, + "loss": 0.5384, + "step": 1009 + }, + { + "epoch": 0.84, + "grad_norm": 1.8256828659009958, + "learning_rate": 3.149679188672932e-06, + "loss": 0.4806, + "step": 1010 + }, + { + "epoch": 0.84, + "grad_norm": 1.9380282822721238, + "learning_rate": 3.1465212052351766e-06, + "loss": 0.543, + "step": 1011 + }, + { + "epoch": 0.84, + "grad_norm": 1.985943690469791, + "learning_rate": 3.1433621158449807e-06, + "loss": 0.5549, + "step": 1012 + }, + { + "epoch": 0.84, + "grad_norm": 1.7038398790061953, + "learning_rate": 3.140201925906348e-06, + "loss": 0.4682, + "step": 1013 + }, + { + "epoch": 0.84, + "grad_norm": 1.8748481620529394, + "learning_rate": 3.1370406408251632e-06, + "loss": 0.5046, + "step": 1014 + }, + { + "epoch": 0.84, + "grad_norm": 1.7587036990451181, + "learning_rate": 3.133878266009186e-06, + "loss": 0.5203, + "step": 1015 + }, + { + "epoch": 0.84, + "grad_norm": 1.7503537433041947, + "learning_rate": 3.130714806868041e-06, + "loss": 0.5546, + "step": 1016 + }, + { + "epoch": 0.84, + "grad_norm": 1.7701505667314001, + "learning_rate": 3.127550268813205e-06, + "loss": 0.531, + "step": 1017 + }, + { + "epoch": 0.84, + "grad_norm": 1.771371589393474, + "learning_rate": 3.124384657258001e-06, + "loss": 0.5424, + "step": 1018 + }, + { + "epoch": 0.84, + "grad_norm": 1.8016015279719124, + "learning_rate": 3.1212179776175905e-06, + "loss": 0.5706, + "step": 1019 + }, + { + "epoch": 0.85, + "grad_norm": 1.810944889002695, + "learning_rate": 3.1180502353089598e-06, + "loss": 0.5502, + "step": 1020 + }, + { + "epoch": 0.85, + "grad_norm": 1.8062084514449492, + "learning_rate": 3.1148814357509147e-06, + "loss": 0.5337, + "step": 1021 + }, + { + "epoch": 0.85, + "grad_norm": 1.669643406466654, + "learning_rate": 3.111711584364068e-06, + "loss": 0.4802, + "step": 1022 + }, + { + "epoch": 0.85, + "grad_norm": 1.6852245083058144, + "learning_rate": 3.1085406865708333e-06, + "loss": 0.532, + "step": 1023 + }, + { + "epoch": 0.85, + "grad_norm": 1.8463748056800222, + "learning_rate": 3.1053687477954124e-06, + "loss": 0.5112, + "step": 1024 + }, + { + "epoch": 0.85, + "grad_norm": 1.7302148909577209, + "learning_rate": 3.10219577346379e-06, + "loss": 0.5549, + "step": 1025 + }, + { + "epoch": 0.85, + "grad_norm": 1.7752983463714818, + "learning_rate": 3.0990217690037206e-06, + "loss": 0.5606, + "step": 1026 + }, + { + "epoch": 0.85, + "grad_norm": 1.695119975844164, + "learning_rate": 3.09584673984472e-06, + "loss": 0.486, + "step": 1027 + }, + { + "epoch": 0.85, + "grad_norm": 1.793543444803663, + "learning_rate": 3.0926706914180605e-06, + "loss": 0.6474, + "step": 1028 + }, + { + "epoch": 0.85, + "grad_norm": 1.6954588940750932, + "learning_rate": 3.089493629156755e-06, + "loss": 0.5208, + "step": 1029 + }, + { + "epoch": 0.85, + "grad_norm": 1.9045089074493644, + "learning_rate": 3.08631555849555e-06, + "loss": 0.5291, + "step": 1030 + }, + { + "epoch": 0.85, + "grad_norm": 1.8481217904786489, + "learning_rate": 3.083136484870921e-06, + "loss": 0.5212, + "step": 1031 + }, + { + "epoch": 0.86, + "grad_norm": 1.6729420221561044, + "learning_rate": 3.0799564137210536e-06, + "loss": 0.5024, + "step": 1032 + }, + { + "epoch": 0.86, + "grad_norm": 1.8821832248249077, + "learning_rate": 3.076775350485845e-06, + "loss": 0.5459, + "step": 1033 + }, + { + "epoch": 0.86, + "grad_norm": 1.762473350167322, + "learning_rate": 3.0735933006068863e-06, + "loss": 0.4938, + "step": 1034 + }, + { + "epoch": 0.86, + "grad_norm": 1.7950707678098703, + "learning_rate": 3.0704102695274573e-06, + "loss": 0.4922, + "step": 1035 + }, + { + "epoch": 0.86, + "grad_norm": 1.6853644769275375, + "learning_rate": 3.0672262626925174e-06, + "loss": 0.47, + "step": 1036 + }, + { + "epoch": 0.86, + "grad_norm": 1.809909106997157, + "learning_rate": 3.0640412855486922e-06, + "loss": 0.5545, + "step": 1037 + }, + { + "epoch": 0.86, + "grad_norm": 2.019472393876661, + "learning_rate": 3.06085534354427e-06, + "loss": 0.5616, + "step": 1038 + }, + { + "epoch": 0.86, + "grad_norm": 1.7972785887075076, + "learning_rate": 3.057668442129188e-06, + "loss": 0.5269, + "step": 1039 + }, + { + "epoch": 0.86, + "grad_norm": 1.865555820217107, + "learning_rate": 3.054480586755026e-06, + "loss": 0.5752, + "step": 1040 + }, + { + "epoch": 0.86, + "grad_norm": 1.792147096098412, + "learning_rate": 3.051291782874995e-06, + "loss": 0.54, + "step": 1041 + }, + { + "epoch": 0.86, + "grad_norm": 1.8108893550848508, + "learning_rate": 3.048102035943927e-06, + "loss": 0.5367, + "step": 1042 + }, + { + "epoch": 0.86, + "grad_norm": 2.0966646553454793, + "learning_rate": 3.04491135141827e-06, + "loss": 0.5455, + "step": 1043 + }, + { + "epoch": 0.87, + "grad_norm": 1.7357403687049695, + "learning_rate": 3.041719734756073e-06, + "loss": 0.502, + "step": 1044 + }, + { + "epoch": 0.87, + "grad_norm": 1.8033826162723872, + "learning_rate": 3.038527191416982e-06, + "loss": 0.5644, + "step": 1045 + }, + { + "epoch": 0.87, + "grad_norm": 1.7822928111630525, + "learning_rate": 3.0353337268622267e-06, + "loss": 0.4938, + "step": 1046 + }, + { + "epoch": 0.87, + "grad_norm": 1.7910319343463081, + "learning_rate": 3.0321393465546134e-06, + "loss": 0.5889, + "step": 1047 + }, + { + "epoch": 0.87, + "grad_norm": 1.7457160087273953, + "learning_rate": 3.028944055958514e-06, + "loss": 0.5022, + "step": 1048 + }, + { + "epoch": 0.87, + "grad_norm": 1.691379648176161, + "learning_rate": 3.0257478605398595e-06, + "loss": 0.4841, + "step": 1049 + }, + { + "epoch": 0.87, + "grad_norm": 1.7452186987943483, + "learning_rate": 3.0225507657661257e-06, + "loss": 0.5626, + "step": 1050 + }, + { + "epoch": 0.87, + "grad_norm": 1.7578678635930594, + "learning_rate": 3.0193527771063297e-06, + "loss": 0.5115, + "step": 1051 + }, + { + "epoch": 0.87, + "grad_norm": 1.7879798898209605, + "learning_rate": 3.016153900031016e-06, + "loss": 0.5296, + "step": 1052 + }, + { + "epoch": 0.87, + "grad_norm": 1.6745604796677231, + "learning_rate": 3.0129541400122492e-06, + "loss": 0.5089, + "step": 1053 + }, + { + "epoch": 0.87, + "grad_norm": 1.8484438696306678, + "learning_rate": 3.0097535025236045e-06, + "loss": 0.6124, + "step": 1054 + }, + { + "epoch": 0.87, + "grad_norm": 1.8023880068850882, + "learning_rate": 3.0065519930401595e-06, + "loss": 0.4983, + "step": 1055 + }, + { + "epoch": 0.88, + "grad_norm": 1.743901583565096, + "learning_rate": 3.0033496170384803e-06, + "loss": 0.4998, + "step": 1056 + }, + { + "epoch": 0.88, + "grad_norm": 1.9494472820876043, + "learning_rate": 3.000146379996617e-06, + "loss": 0.537, + "step": 1057 + }, + { + "epoch": 0.88, + "grad_norm": 1.6992995489648048, + "learning_rate": 2.996942287394093e-06, + "loss": 0.5822, + "step": 1058 + }, + { + "epoch": 0.88, + "grad_norm": 1.8498288139189643, + "learning_rate": 2.993737344711895e-06, + "loss": 0.5651, + "step": 1059 + }, + { + "epoch": 0.88, + "grad_norm": 1.755920633785882, + "learning_rate": 2.990531557432464e-06, + "loss": 0.496, + "step": 1060 + }, + { + "epoch": 0.88, + "grad_norm": 1.7876484928074277, + "learning_rate": 2.9873249310396853e-06, + "loss": 0.5224, + "step": 1061 + }, + { + "epoch": 0.88, + "grad_norm": 1.7573987279473129, + "learning_rate": 2.98411747101888e-06, + "loss": 0.5228, + "step": 1062 + }, + { + "epoch": 0.88, + "grad_norm": 1.6995721104857204, + "learning_rate": 2.980909182856794e-06, + "loss": 0.4758, + "step": 1063 + }, + { + "epoch": 0.88, + "grad_norm": 1.907464743607936, + "learning_rate": 2.9777000720415916e-06, + "loss": 0.5254, + "step": 1064 + }, + { + "epoch": 0.88, + "grad_norm": 1.7921365259203703, + "learning_rate": 2.974490144062844e-06, + "loss": 0.5116, + "step": 1065 + }, + { + "epoch": 0.88, + "grad_norm": 1.9010192849593792, + "learning_rate": 2.9712794044115196e-06, + "loss": 0.5136, + "step": 1066 + }, + { + "epoch": 0.88, + "grad_norm": 1.742881813035793, + "learning_rate": 2.968067858579975e-06, + "loss": 0.5436, + "step": 1067 + }, + { + "epoch": 0.89, + "grad_norm": 1.7135933558215708, + "learning_rate": 2.964855512061947e-06, + "loss": 0.5268, + "step": 1068 + }, + { + "epoch": 0.89, + "grad_norm": 1.8360025545734582, + "learning_rate": 2.9616423703525414e-06, + "loss": 0.5238, + "step": 1069 + }, + { + "epoch": 0.89, + "grad_norm": 1.7090421713960848, + "learning_rate": 2.9584284389482237e-06, + "loss": 0.5051, + "step": 1070 + }, + { + "epoch": 0.89, + "grad_norm": 1.7462732547158757, + "learning_rate": 2.9552137233468113e-06, + "loss": 0.4838, + "step": 1071 + }, + { + "epoch": 0.89, + "grad_norm": 1.9336108910937513, + "learning_rate": 2.951998229047464e-06, + "loss": 0.5576, + "step": 1072 + }, + { + "epoch": 0.89, + "grad_norm": 1.784092660568157, + "learning_rate": 2.9487819615506702e-06, + "loss": 0.5349, + "step": 1073 + }, + { + "epoch": 0.89, + "grad_norm": 1.772640354616067, + "learning_rate": 2.945564926358245e-06, + "loss": 0.5423, + "step": 1074 + }, + { + "epoch": 0.89, + "grad_norm": 1.8491968859591044, + "learning_rate": 2.9423471289733125e-06, + "loss": 0.5453, + "step": 1075 + }, + { + "epoch": 0.89, + "grad_norm": 1.8283172103770493, + "learning_rate": 2.9391285749003046e-06, + "loss": 0.5318, + "step": 1076 + }, + { + "epoch": 0.89, + "grad_norm": 1.7802483696828226, + "learning_rate": 2.935909269644946e-06, + "loss": 0.4954, + "step": 1077 + }, + { + "epoch": 0.89, + "grad_norm": 1.8687809173149, + "learning_rate": 2.9326892187142457e-06, + "loss": 0.5428, + "step": 1078 + }, + { + "epoch": 0.89, + "grad_norm": 1.9218917868616974, + "learning_rate": 2.9294684276164888e-06, + "loss": 0.5125, + "step": 1079 + }, + { + "epoch": 0.9, + "grad_norm": 1.8406300824318225, + "learning_rate": 2.9262469018612278e-06, + "loss": 0.5186, + "step": 1080 + }, + { + "epoch": 0.9, + "grad_norm": 1.8153319034513924, + "learning_rate": 2.9230246469592695e-06, + "loss": 0.4878, + "step": 1081 + }, + { + "epoch": 0.9, + "grad_norm": 1.8381190525343576, + "learning_rate": 2.91980166842267e-06, + "loss": 0.5455, + "step": 1082 + }, + { + "epoch": 0.9, + "grad_norm": 1.7941629060330144, + "learning_rate": 2.9165779717647212e-06, + "loss": 0.5425, + "step": 1083 + }, + { + "epoch": 0.9, + "grad_norm": 1.755950985861856, + "learning_rate": 2.9133535624999466e-06, + "loss": 0.4992, + "step": 1084 + }, + { + "epoch": 0.9, + "grad_norm": 1.8065716401418646, + "learning_rate": 2.9101284461440853e-06, + "loss": 0.5569, + "step": 1085 + }, + { + "epoch": 0.9, + "grad_norm": 1.8487073865649808, + "learning_rate": 2.9069026282140887e-06, + "loss": 0.5352, + "step": 1086 + }, + { + "epoch": 0.9, + "grad_norm": 1.877024524581134, + "learning_rate": 2.903676114228107e-06, + "loss": 0.5584, + "step": 1087 + }, + { + "epoch": 0.9, + "grad_norm": 1.812931375367902, + "learning_rate": 2.9004489097054807e-06, + "loss": 0.5154, + "step": 1088 + }, + { + "epoch": 0.9, + "grad_norm": 1.7729938020658174, + "learning_rate": 2.897221020166732e-06, + "loss": 0.5386, + "step": 1089 + }, + { + "epoch": 0.9, + "grad_norm": 1.6991898958250629, + "learning_rate": 2.8939924511335555e-06, + "loss": 0.5467, + "step": 1090 + }, + { + "epoch": 0.9, + "grad_norm": 1.7298323860671052, + "learning_rate": 2.890763208128807e-06, + "loss": 0.5506, + "step": 1091 + }, + { + "epoch": 0.91, + "grad_norm": 1.9718362378496106, + "learning_rate": 2.887533296676497e-06, + "loss": 0.5453, + "step": 1092 + }, + { + "epoch": 0.91, + "grad_norm": 1.7003897379752575, + "learning_rate": 2.8843027223017767e-06, + "loss": 0.5016, + "step": 1093 + }, + { + "epoch": 0.91, + "grad_norm": 1.7604846690613096, + "learning_rate": 2.8810714905309346e-06, + "loss": 0.5206, + "step": 1094 + }, + { + "epoch": 0.91, + "grad_norm": 1.868522047775135, + "learning_rate": 2.8778396068913807e-06, + "loss": 0.5152, + "step": 1095 + }, + { + "epoch": 0.91, + "grad_norm": 1.8080911269766844, + "learning_rate": 2.874607076911642e-06, + "loss": 0.4966, + "step": 1096 + }, + { + "epoch": 0.91, + "grad_norm": 1.7767037245003534, + "learning_rate": 2.871373906121351e-06, + "loss": 0.5081, + "step": 1097 + }, + { + "epoch": 0.91, + "grad_norm": 1.733045586658075, + "learning_rate": 2.8681401000512356e-06, + "loss": 0.5031, + "step": 1098 + }, + { + "epoch": 0.91, + "grad_norm": 1.6767478479637847, + "learning_rate": 2.8649056642331103e-06, + "loss": 0.4856, + "step": 1099 + }, + { + "epoch": 0.91, + "grad_norm": 1.6820690185704608, + "learning_rate": 2.8616706041998686e-06, + "loss": 0.5151, + "step": 1100 + }, + { + "epoch": 0.91, + "grad_norm": 1.840181264549285, + "learning_rate": 2.8584349254854693e-06, + "loss": 0.5393, + "step": 1101 + }, + { + "epoch": 0.91, + "grad_norm": 1.827807570004724, + "learning_rate": 2.8551986336249322e-06, + "loss": 0.5572, + "step": 1102 + }, + { + "epoch": 0.91, + "grad_norm": 1.711815265099016, + "learning_rate": 2.8519617341543233e-06, + "loss": 0.5184, + "step": 1103 + }, + { + "epoch": 0.92, + "grad_norm": 1.7460018389221874, + "learning_rate": 2.8487242326107495e-06, + "loss": 0.5374, + "step": 1104 + }, + { + "epoch": 0.92, + "grad_norm": 1.985067366728648, + "learning_rate": 2.8454861345323475e-06, + "loss": 0.538, + "step": 1105 + }, + { + "epoch": 0.92, + "grad_norm": 1.8044567576569952, + "learning_rate": 2.8422474454582754e-06, + "loss": 0.4947, + "step": 1106 + }, + { + "epoch": 0.92, + "grad_norm": 1.7648712890692506, + "learning_rate": 2.8390081709286997e-06, + "loss": 0.5584, + "step": 1107 + }, + { + "epoch": 0.92, + "grad_norm": 1.7544905722043518, + "learning_rate": 2.8357683164847903e-06, + "loss": 0.5696, + "step": 1108 + }, + { + "epoch": 0.92, + "grad_norm": 1.7923136846837993, + "learning_rate": 2.8325278876687084e-06, + "loss": 0.5502, + "step": 1109 + }, + { + "epoch": 0.92, + "grad_norm": 2.077195937792951, + "learning_rate": 2.8292868900235986e-06, + "loss": 0.543, + "step": 1110 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675854046933754, + "learning_rate": 2.826045329093578e-06, + "loss": 0.5422, + "step": 1111 + }, + { + "epoch": 0.92, + "grad_norm": 1.8457239401392898, + "learning_rate": 2.822803210423727e-06, + "loss": 0.5334, + "step": 1112 + }, + { + "epoch": 0.92, + "grad_norm": 1.7426929121470698, + "learning_rate": 2.8195605395600804e-06, + "loss": 0.4972, + "step": 1113 + }, + { + "epoch": 0.92, + "grad_norm": 1.7675216264197045, + "learning_rate": 2.8163173220496175e-06, + "loss": 0.5442, + "step": 1114 + }, + { + "epoch": 0.92, + "grad_norm": 1.7483102565661375, + "learning_rate": 2.8130735634402527e-06, + "loss": 0.5425, + "step": 1115 + }, + { + "epoch": 0.93, + "grad_norm": 1.692036399159914, + "learning_rate": 2.8098292692808253e-06, + "loss": 0.521, + "step": 1116 + }, + { + "epoch": 0.93, + "grad_norm": 1.799980213437577, + "learning_rate": 2.8065844451210933e-06, + "loss": 0.5597, + "step": 1117 + }, + { + "epoch": 0.93, + "grad_norm": 1.7666190830884467, + "learning_rate": 2.803339096511718e-06, + "loss": 0.5612, + "step": 1118 + }, + { + "epoch": 0.93, + "grad_norm": 1.792129515845057, + "learning_rate": 2.8000932290042597e-06, + "loss": 0.5334, + "step": 1119 + }, + { + "epoch": 0.93, + "grad_norm": 1.7395715578516604, + "learning_rate": 2.7968468481511663e-06, + "loss": 0.5545, + "step": 1120 + }, + { + "epoch": 0.93, + "grad_norm": 1.6843830287676704, + "learning_rate": 2.7935999595057623e-06, + "loss": 0.5659, + "step": 1121 + }, + { + "epoch": 0.93, + "grad_norm": 1.6432688824199502, + "learning_rate": 2.790352568622244e-06, + "loss": 0.4926, + "step": 1122 + }, + { + "epoch": 0.93, + "grad_norm": 1.7430642435954644, + "learning_rate": 2.787104681055663e-06, + "loss": 0.4666, + "step": 1123 + }, + { + "epoch": 0.93, + "grad_norm": 1.8067789882264202, + "learning_rate": 2.783856302361923e-06, + "loss": 0.5233, + "step": 1124 + }, + { + "epoch": 0.93, + "grad_norm": 1.7685143281757654, + "learning_rate": 2.780607438097769e-06, + "loss": 0.5506, + "step": 1125 + }, + { + "epoch": 0.93, + "grad_norm": 1.7163110868931304, + "learning_rate": 2.7773580938207717e-06, + "loss": 0.5044, + "step": 1126 + }, + { + "epoch": 0.93, + "grad_norm": 1.809036270322799, + "learning_rate": 2.7741082750893284e-06, + "loss": 0.5206, + "step": 1127 + }, + { + "epoch": 0.94, + "grad_norm": 1.8193898978325846, + "learning_rate": 2.770857987462645e-06, + "loss": 0.6064, + "step": 1128 + }, + { + "epoch": 0.94, + "grad_norm": 1.765826426309075, + "learning_rate": 2.76760723650073e-06, + "loss": 0.4914, + "step": 1129 + }, + { + "epoch": 0.94, + "grad_norm": 2.046345230237298, + "learning_rate": 2.764356027764385e-06, + "loss": 0.5938, + "step": 1130 + }, + { + "epoch": 0.94, + "grad_norm": 1.8264697696225647, + "learning_rate": 2.7611043668151948e-06, + "loss": 0.5476, + "step": 1131 + }, + { + "epoch": 0.94, + "grad_norm": 1.7776043318415495, + "learning_rate": 2.7578522592155166e-06, + "loss": 0.5318, + "step": 1132 + }, + { + "epoch": 0.94, + "grad_norm": 1.767284538432005, + "learning_rate": 2.7545997105284735e-06, + "loss": 0.5197, + "step": 1133 + }, + { + "epoch": 0.94, + "grad_norm": 1.831190014066027, + "learning_rate": 2.75134672631794e-06, + "loss": 0.4939, + "step": 1134 + }, + { + "epoch": 0.94, + "grad_norm": 1.7727769641989948, + "learning_rate": 2.7480933121485394e-06, + "loss": 0.5542, + "step": 1135 + }, + { + "epoch": 0.94, + "grad_norm": 1.7599576706599651, + "learning_rate": 2.7448394735856275e-06, + "loss": 0.5102, + "step": 1136 + }, + { + "epoch": 0.94, + "grad_norm": 1.7526987759875383, + "learning_rate": 2.7415852161952893e-06, + "loss": 0.5357, + "step": 1137 + }, + { + "epoch": 0.94, + "grad_norm": 1.7478180377944075, + "learning_rate": 2.7383305455443223e-06, + "loss": 0.552, + "step": 1138 + }, + { + "epoch": 0.94, + "grad_norm": 1.8026983878339322, + "learning_rate": 2.7350754672002334e-06, + "loss": 0.5324, + "step": 1139 + }, + { + "epoch": 0.95, + "grad_norm": 1.7539604119960455, + "learning_rate": 2.7318199867312267e-06, + "loss": 0.4951, + "step": 1140 + }, + { + "epoch": 0.95, + "grad_norm": 1.7060714376533908, + "learning_rate": 2.728564109706193e-06, + "loss": 0.5044, + "step": 1141 + }, + { + "epoch": 0.95, + "grad_norm": 1.896732668736906, + "learning_rate": 2.725307841694704e-06, + "loss": 0.5272, + "step": 1142 + }, + { + "epoch": 0.95, + "grad_norm": 1.9094037542829962, + "learning_rate": 2.722051188266998e-06, + "loss": 0.5036, + "step": 1143 + }, + { + "epoch": 0.95, + "grad_norm": 1.7529900591353695, + "learning_rate": 2.7187941549939723e-06, + "loss": 0.4962, + "step": 1144 + }, + { + "epoch": 0.95, + "grad_norm": 1.7652784724721573, + "learning_rate": 2.7155367474471763e-06, + "loss": 0.5159, + "step": 1145 + }, + { + "epoch": 0.95, + "grad_norm": 1.9070275680276054, + "learning_rate": 2.7122789711987964e-06, + "loss": 0.5269, + "step": 1146 + }, + { + "epoch": 0.95, + "grad_norm": 1.7630505518040367, + "learning_rate": 2.709020831821652e-06, + "loss": 0.5286, + "step": 1147 + }, + { + "epoch": 0.95, + "grad_norm": 1.7410138974922291, + "learning_rate": 2.7057623348891846e-06, + "loss": 0.4902, + "step": 1148 + }, + { + "epoch": 0.95, + "grad_norm": 1.745842560539345, + "learning_rate": 2.7025034859754446e-06, + "loss": 0.5178, + "step": 1149 + }, + { + "epoch": 0.95, + "grad_norm": 1.8498982578771728, + "learning_rate": 2.699244290655086e-06, + "loss": 0.55, + "step": 1150 + }, + { + "epoch": 0.95, + "grad_norm": 1.6360369924184164, + "learning_rate": 2.6959847545033558e-06, + "loss": 0.4988, + "step": 1151 + }, + { + "epoch": 0.96, + "grad_norm": 1.6784833460211517, + "learning_rate": 2.692724883096082e-06, + "loss": 0.5303, + "step": 1152 + }, + { + "epoch": 0.96, + "grad_norm": 1.7888637226825195, + "learning_rate": 2.68946468200967e-06, + "loss": 0.542, + "step": 1153 + }, + { + "epoch": 0.96, + "grad_norm": 1.7156031503954616, + "learning_rate": 2.686204156821084e-06, + "loss": 0.499, + "step": 1154 + }, + { + "epoch": 0.96, + "grad_norm": 1.802618839032982, + "learning_rate": 2.6829433131078464e-06, + "loss": 0.5095, + "step": 1155 + }, + { + "epoch": 0.96, + "grad_norm": 1.7018673816457677, + "learning_rate": 2.6796821564480237e-06, + "loss": 0.4911, + "step": 1156 + }, + { + "epoch": 0.96, + "grad_norm": 1.939833859373507, + "learning_rate": 2.6764206924202173e-06, + "loss": 0.5965, + "step": 1157 + }, + { + "epoch": 0.96, + "grad_norm": 1.757462214596805, + "learning_rate": 2.673158926603554e-06, + "loss": 0.5119, + "step": 1158 + }, + { + "epoch": 0.96, + "grad_norm": 1.824906787992325, + "learning_rate": 2.669896864577678e-06, + "loss": 0.4995, + "step": 1159 + }, + { + "epoch": 0.96, + "grad_norm": 1.6963319988581682, + "learning_rate": 2.666634511922739e-06, + "loss": 0.499, + "step": 1160 + }, + { + "epoch": 0.96, + "grad_norm": 1.7490967555131538, + "learning_rate": 2.6633718742193837e-06, + "loss": 0.5045, + "step": 1161 + }, + { + "epoch": 0.96, + "grad_norm": 1.7295387040616608, + "learning_rate": 2.660108957048749e-06, + "loss": 0.48, + "step": 1162 + }, + { + "epoch": 0.96, + "grad_norm": 1.7062936128447537, + "learning_rate": 2.656845765992447e-06, + "loss": 0.5024, + "step": 1163 + }, + { + "epoch": 0.96, + "grad_norm": 1.7291223687738257, + "learning_rate": 2.6535823066325594e-06, + "loss": 0.4965, + "step": 1164 + }, + { + "epoch": 0.97, + "grad_norm": 1.7660018876230184, + "learning_rate": 2.650318584551626e-06, + "loss": 0.6289, + "step": 1165 + }, + { + "epoch": 0.97, + "grad_norm": 1.6875948695046943, + "learning_rate": 2.6470546053326375e-06, + "loss": 0.5099, + "step": 1166 + }, + { + "epoch": 0.97, + "grad_norm": 1.7055862895950586, + "learning_rate": 2.643790374559023e-06, + "loss": 0.4748, + "step": 1167 + }, + { + "epoch": 0.97, + "grad_norm": 1.8397810404769834, + "learning_rate": 2.6405258978146443e-06, + "loss": 0.5547, + "step": 1168 + }, + { + "epoch": 0.97, + "grad_norm": 1.6780759297615608, + "learning_rate": 2.6372611806837804e-06, + "loss": 0.4696, + "step": 1169 + }, + { + "epoch": 0.97, + "grad_norm": 1.7463193906158438, + "learning_rate": 2.633996228751125e-06, + "loss": 0.5167, + "step": 1170 + }, + { + "epoch": 0.97, + "grad_norm": 1.7682737157303552, + "learning_rate": 2.6307310476017705e-06, + "loss": 0.5178, + "step": 1171 + }, + { + "epoch": 0.97, + "grad_norm": 1.7759532350573655, + "learning_rate": 2.627465642821203e-06, + "loss": 0.5411, + "step": 1172 + }, + { + "epoch": 0.97, + "grad_norm": 1.741742707150691, + "learning_rate": 2.624200019995293e-06, + "loss": 0.5357, + "step": 1173 + }, + { + "epoch": 0.97, + "grad_norm": 1.7638181255611864, + "learning_rate": 2.6209341847102787e-06, + "loss": 0.5598, + "step": 1174 + }, + { + "epoch": 0.97, + "grad_norm": 1.6585763596592404, + "learning_rate": 2.6176681425527663e-06, + "loss": 0.4891, + "step": 1175 + }, + { + "epoch": 0.97, + "grad_norm": 1.7652514703885578, + "learning_rate": 2.614401899109716e-06, + "loss": 0.5412, + "step": 1176 + }, + { + "epoch": 0.98, + "grad_norm": 1.7646286601286296, + "learning_rate": 2.6111354599684287e-06, + "loss": 0.4753, + "step": 1177 + }, + { + "epoch": 0.98, + "grad_norm": 1.7933546923906454, + "learning_rate": 2.6078688307165436e-06, + "loss": 0.5159, + "step": 1178 + }, + { + "epoch": 0.98, + "grad_norm": 1.8474498352431208, + "learning_rate": 2.6046020169420223e-06, + "loss": 0.4786, + "step": 1179 + }, + { + "epoch": 0.98, + "grad_norm": 1.816609500392057, + "learning_rate": 2.601335024233145e-06, + "loss": 0.5821, + "step": 1180 + }, + { + "epoch": 0.98, + "grad_norm": 1.7603922858788037, + "learning_rate": 2.598067858178495e-06, + "loss": 0.4749, + "step": 1181 + }, + { + "epoch": 0.98, + "grad_norm": 1.771168764538133, + "learning_rate": 2.594800524366956e-06, + "loss": 0.5221, + "step": 1182 + }, + { + "epoch": 0.98, + "grad_norm": 1.7428386931770696, + "learning_rate": 2.591533028387694e-06, + "loss": 0.5243, + "step": 1183 + }, + { + "epoch": 0.98, + "grad_norm": 1.7354647623517858, + "learning_rate": 2.588265375830155e-06, + "loss": 0.4665, + "step": 1184 + }, + { + "epoch": 0.98, + "grad_norm": 1.7757829783254058, + "learning_rate": 2.5849975722840537e-06, + "loss": 0.4713, + "step": 1185 + }, + { + "epoch": 0.98, + "grad_norm": 1.7660698291034924, + "learning_rate": 2.58172962333936e-06, + "loss": 0.5198, + "step": 1186 + }, + { + "epoch": 0.98, + "grad_norm": 1.7071465020770178, + "learning_rate": 2.5784615345862963e-06, + "loss": 0.5355, + "step": 1187 + }, + { + "epoch": 0.98, + "grad_norm": 1.6994920599655763, + "learning_rate": 2.5751933116153215e-06, + "loss": 0.4867, + "step": 1188 + }, + { + "epoch": 0.99, + "grad_norm": 1.7891977115774562, + "learning_rate": 2.5719249600171247e-06, + "loss": 0.5071, + "step": 1189 + }, + { + "epoch": 0.99, + "grad_norm": 1.6866451169084888, + "learning_rate": 2.568656485382616e-06, + "loss": 0.4767, + "step": 1190 + }, + { + "epoch": 0.99, + "grad_norm": 1.9106444693405875, + "learning_rate": 2.5653878933029134e-06, + "loss": 0.5063, + "step": 1191 + }, + { + "epoch": 0.99, + "grad_norm": 1.7546015951107552, + "learning_rate": 2.56211918936934e-06, + "loss": 0.5536, + "step": 1192 + }, + { + "epoch": 0.99, + "grad_norm": 1.7866083346923656, + "learning_rate": 2.5588503791734053e-06, + "loss": 0.4738, + "step": 1193 + }, + { + "epoch": 0.99, + "grad_norm": 1.6678313975517949, + "learning_rate": 2.5555814683068058e-06, + "loss": 0.5095, + "step": 1194 + }, + { + "epoch": 0.99, + "grad_norm": 1.694690087625629, + "learning_rate": 2.552312462361405e-06, + "loss": 0.5711, + "step": 1195 + }, + { + "epoch": 0.99, + "grad_norm": 1.7583066556547233, + "learning_rate": 2.5490433669292337e-06, + "loss": 0.5183, + "step": 1196 + }, + { + "epoch": 0.99, + "grad_norm": 1.8259327544569408, + "learning_rate": 2.5457741876024716e-06, + "loss": 0.5129, + "step": 1197 + }, + { + "epoch": 0.99, + "grad_norm": 1.743709458286742, + "learning_rate": 2.542504929973445e-06, + "loss": 0.509, + "step": 1198 + }, + { + "epoch": 0.99, + "grad_norm": 1.8551037168096902, + "learning_rate": 2.5392355996346134e-06, + "loss": 0.4874, + "step": 1199 + }, + { + "epoch": 0.99, + "grad_norm": 1.7705896553689628, + "learning_rate": 2.5359662021785596e-06, + "loss": 0.5102, + "step": 1200 + }, + { + "epoch": 1.0, + "grad_norm": 1.8456154073029885, + "learning_rate": 2.532696743197982e-06, + "loss": 0.5363, + "step": 1201 + }, + { + "epoch": 1.0, + "grad_norm": 1.7341454202963031, + "learning_rate": 2.529427228285686e-06, + "loss": 0.5013, + "step": 1202 + }, + { + "epoch": 1.0, + "grad_norm": 1.7923147732329405, + "learning_rate": 2.526157663034568e-06, + "loss": 0.5191, + "step": 1203 + }, + { + "epoch": 1.0, + "grad_norm": 1.731262319220837, + "learning_rate": 2.522888053037616e-06, + "loss": 0.4889, + "step": 1204 + }, + { + "epoch": 1.0, + "grad_norm": 1.797800368847369, + "learning_rate": 2.5196184038878895e-06, + "loss": 0.4868, + "step": 1205 + }, + { + "epoch": 1.0, + "grad_norm": 1.8182272292135089, + "learning_rate": 2.5163487211785194e-06, + "loss": 0.5159, + "step": 1206 + }, + { + "epoch": 1.0, + "grad_norm": 1.9699143840893472, + "learning_rate": 2.5130790105026908e-06, + "loss": 0.543, + "step": 1207 + }, + { + "epoch": 1.0, + "grad_norm": 1.805587879000798, + "learning_rate": 2.5098092774536397e-06, + "loss": 0.5162, + "step": 1208 + }, + { + "epoch": 1.0, + "grad_norm": 1.966538834153111, + "learning_rate": 2.506539527624637e-06, + "loss": 0.4973, + "step": 1209 + }, + { + "epoch": 1.0, + "grad_norm": 1.7007116827865891, + "learning_rate": 2.5032697666089833e-06, + "loss": 0.5337, + "step": 1210 + }, + { + "epoch": 1.0, + "grad_norm": 1.8200190388383481, + "learning_rate": 2.5e-06, + "loss": 0.492, + "step": 1211 + }, + { + "epoch": 1.0, + "grad_norm": 1.7811733389101785, + "learning_rate": 2.496730233391017e-06, + "loss": 0.533, + "step": 1212 + }, + { + "epoch": 1.01, + "grad_norm": 1.7692852455085013, + "learning_rate": 2.4934604723753636e-06, + "loss": 0.5151, + "step": 1213 + }, + { + "epoch": 1.01, + "grad_norm": 2.0118407638136726, + "learning_rate": 2.4901907225463607e-06, + "loss": 0.566, + "step": 1214 + }, + { + "epoch": 1.01, + "grad_norm": 1.9919699597672162, + "learning_rate": 2.486920989497309e-06, + "loss": 0.5296, + "step": 1215 + }, + { + "epoch": 1.01, + "grad_norm": 1.7399123797451834, + "learning_rate": 2.483651278821481e-06, + "loss": 0.5535, + "step": 1216 + }, + { + "epoch": 1.01, + "grad_norm": 2.0162050634113617, + "learning_rate": 2.4803815961121117e-06, + "loss": 0.5105, + "step": 1217 + }, + { + "epoch": 1.01, + "grad_norm": 1.9472302767468135, + "learning_rate": 2.4771119469623856e-06, + "loss": 0.4829, + "step": 1218 + }, + { + "epoch": 1.01, + "grad_norm": 1.9358326178363474, + "learning_rate": 2.4738423369654327e-06, + "loss": 0.5895, + "step": 1219 + }, + { + "epoch": 1.01, + "grad_norm": 1.8202396491898063, + "learning_rate": 2.470572771714315e-06, + "loss": 0.5159, + "step": 1220 + }, + { + "epoch": 1.01, + "grad_norm": 2.0705540084815652, + "learning_rate": 2.4673032568020183e-06, + "loss": 0.5375, + "step": 1221 + }, + { + "epoch": 1.01, + "grad_norm": 1.9290016818033147, + "learning_rate": 2.464033797821441e-06, + "loss": 0.5328, + "step": 1222 + }, + { + "epoch": 1.01, + "grad_norm": 1.858876842427081, + "learning_rate": 2.460764400365387e-06, + "loss": 0.5246, + "step": 1223 + }, + { + "epoch": 1.01, + "grad_norm": 1.7372257522644121, + "learning_rate": 2.457495070026555e-06, + "loss": 0.5557, + "step": 1224 + }, + { + "epoch": 1.02, + "grad_norm": 2.042578607858068, + "learning_rate": 2.454225812397529e-06, + "loss": 0.5493, + "step": 1225 + }, + { + "epoch": 1.02, + "grad_norm": 1.80578953353184, + "learning_rate": 2.450956633070767e-06, + "loss": 0.4722, + "step": 1226 + }, + { + "epoch": 1.02, + "grad_norm": 1.6245117501883604, + "learning_rate": 2.4476875376385954e-06, + "loss": 0.4861, + "step": 1227 + }, + { + "epoch": 1.0, + "grad_norm": 2.3717275673814986, + "learning_rate": 2.4444185316931955e-06, + "loss": 0.4955, + "step": 1228 + }, + { + "epoch": 1.0, + "grad_norm": 2.789230426976571, + "learning_rate": 2.441149620826595e-06, + "loss": 0.401, + "step": 1229 + }, + { + "epoch": 1.0, + "grad_norm": 2.3165196574538163, + "learning_rate": 2.437880810630661e-06, + "loss": 0.391, + "step": 1230 + }, + { + "epoch": 1.0, + "grad_norm": 3.7748119497874244, + "learning_rate": 2.434612106697087e-06, + "loss": 0.3971, + "step": 1231 + }, + { + "epoch": 1.0, + "grad_norm": 2.516708769328096, + "learning_rate": 2.4313435146173845e-06, + "loss": 0.3677, + "step": 1232 + }, + { + "epoch": 1.0, + "grad_norm": 2.0383812730416593, + "learning_rate": 2.4280750399828757e-06, + "loss": 0.3834, + "step": 1233 + }, + { + "epoch": 1.01, + "grad_norm": 2.388274870254754, + "learning_rate": 2.424806688384679e-06, + "loss": 0.38, + "step": 1234 + }, + { + "epoch": 1.01, + "grad_norm": 2.428758767469847, + "learning_rate": 2.4215384654137037e-06, + "loss": 0.3557, + "step": 1235 + }, + { + "epoch": 1.01, + "grad_norm": 1.9871015940327752, + "learning_rate": 2.41827037666064e-06, + "loss": 0.3742, + "step": 1236 + }, + { + "epoch": 1.01, + "grad_norm": 2.0490853630896595, + "learning_rate": 2.415002427715948e-06, + "loss": 0.4077, + "step": 1237 + }, + { + "epoch": 1.01, + "grad_norm": 2.36022057857035, + "learning_rate": 2.4117346241698457e-06, + "loss": 0.4079, + "step": 1238 + }, + { + "epoch": 1.01, + "grad_norm": 2.4014397498962974, + "learning_rate": 2.408466971612307e-06, + "loss": 0.3783, + "step": 1239 + }, + { + "epoch": 1.01, + "grad_norm": 2.1970209263326246, + "learning_rate": 2.405199475633045e-06, + "loss": 0.4019, + "step": 1240 + }, + { + "epoch": 1.01, + "grad_norm": 1.8747804397851657, + "learning_rate": 2.4019321418215053e-06, + "loss": 0.3657, + "step": 1241 + }, + { + "epoch": 1.01, + "grad_norm": 2.0377029592503666, + "learning_rate": 2.398664975766856e-06, + "loss": 0.3575, + "step": 1242 + }, + { + "epoch": 1.01, + "grad_norm": 2.2162687478729133, + "learning_rate": 2.3953979830579785e-06, + "loss": 0.3891, + "step": 1243 + }, + { + "epoch": 1.01, + "grad_norm": 2.0736112974636605, + "learning_rate": 2.3921311692834577e-06, + "loss": 0.3872, + "step": 1244 + }, + { + "epoch": 1.01, + "grad_norm": 1.8065329023464558, + "learning_rate": 2.3888645400315717e-06, + "loss": 0.3684, + "step": 1245 + }, + { + "epoch": 1.02, + "grad_norm": 2.144863722944226, + "learning_rate": 2.385598100890285e-06, + "loss": 0.3781, + "step": 1246 + }, + { + "epoch": 1.02, + "grad_norm": 2.245173550848138, + "learning_rate": 2.382331857447234e-06, + "loss": 0.3906, + "step": 1247 + }, + { + "epoch": 1.02, + "grad_norm": 2.0580037557233806, + "learning_rate": 2.379065815289723e-06, + "loss": 0.3461, + "step": 1248 + }, + { + "epoch": 1.02, + "grad_norm": 1.754328637936701, + "learning_rate": 2.3757999800047088e-06, + "loss": 0.3626, + "step": 1249 + }, + { + "epoch": 1.02, + "grad_norm": 1.8749369460952616, + "learning_rate": 2.3725343571787974e-06, + "loss": 0.3723, + "step": 1250 + }, + { + "epoch": 1.02, + "grad_norm": 1.9635590762348785, + "learning_rate": 2.36926895239823e-06, + "loss": 0.3506, + "step": 1251 + }, + { + "epoch": 1.02, + "grad_norm": 1.9091295881177242, + "learning_rate": 2.3660037712488758e-06, + "loss": 0.3705, + "step": 1252 + }, + { + "epoch": 1.02, + "grad_norm": 2.0807822077632445, + "learning_rate": 2.36273881931622e-06, + "loss": 0.4083, + "step": 1253 + }, + { + "epoch": 1.02, + "grad_norm": 1.9247801946548893, + "learning_rate": 2.3594741021853565e-06, + "loss": 0.3896, + "step": 1254 + }, + { + "epoch": 1.02, + "grad_norm": 2.003234826375957, + "learning_rate": 2.356209625440977e-06, + "loss": 0.3928, + "step": 1255 + }, + { + "epoch": 1.02, + "grad_norm": 1.9601094488156638, + "learning_rate": 2.352945394667363e-06, + "loss": 0.346, + "step": 1256 + }, + { + "epoch": 1.02, + "grad_norm": 1.835912356231795, + "learning_rate": 2.3496814154483754e-06, + "loss": 0.3268, + "step": 1257 + }, + { + "epoch": 1.03, + "grad_norm": 1.851616138864044, + "learning_rate": 2.346417693367442e-06, + "loss": 0.395, + "step": 1258 + }, + { + "epoch": 1.03, + "grad_norm": 2.017511453982363, + "learning_rate": 2.3431542340075535e-06, + "loss": 0.3989, + "step": 1259 + }, + { + "epoch": 1.03, + "grad_norm": 1.9337327085061278, + "learning_rate": 2.3398910429512516e-06, + "loss": 0.4168, + "step": 1260 + }, + { + "epoch": 1.03, + "grad_norm": 1.8957440589808827, + "learning_rate": 2.3366281257806167e-06, + "loss": 0.3626, + "step": 1261 + }, + { + "epoch": 1.03, + "grad_norm": 1.819897111464585, + "learning_rate": 2.3333654880772622e-06, + "loss": 0.3737, + "step": 1262 + }, + { + "epoch": 1.03, + "grad_norm": 1.9283607336926767, + "learning_rate": 2.3301031354223226e-06, + "loss": 0.3595, + "step": 1263 + }, + { + "epoch": 1.03, + "grad_norm": 1.8049670593502345, + "learning_rate": 2.3268410733964463e-06, + "loss": 0.3645, + "step": 1264 + }, + { + "epoch": 1.03, + "grad_norm": 1.866103990559354, + "learning_rate": 2.3235793075797835e-06, + "loss": 0.391, + "step": 1265 + }, + { + "epoch": 1.03, + "grad_norm": 1.774992664072412, + "learning_rate": 2.3203178435519767e-06, + "loss": 0.3863, + "step": 1266 + }, + { + "epoch": 1.03, + "grad_norm": 1.8431093658964484, + "learning_rate": 2.3170566868921553e-06, + "loss": 0.4175, + "step": 1267 + }, + { + "epoch": 1.03, + "grad_norm": 1.7731154009482526, + "learning_rate": 2.3137958431789175e-06, + "loss": 0.3651, + "step": 1268 + }, + { + "epoch": 1.03, + "grad_norm": 1.980392583405916, + "learning_rate": 2.3105353179903313e-06, + "loss": 0.3919, + "step": 1269 + }, + { + "epoch": 1.04, + "grad_norm": 1.8435910751312221, + "learning_rate": 2.3072751169039183e-06, + "loss": 0.3466, + "step": 1270 + }, + { + "epoch": 1.04, + "grad_norm": 1.88150621693115, + "learning_rate": 2.304015245496645e-06, + "loss": 0.3991, + "step": 1271 + }, + { + "epoch": 1.04, + "grad_norm": 1.9365960105712363, + "learning_rate": 2.300755709344915e-06, + "loss": 0.3675, + "step": 1272 + }, + { + "epoch": 1.04, + "grad_norm": 1.8120924423380202, + "learning_rate": 2.297496514024556e-06, + "loss": 0.389, + "step": 1273 + }, + { + "epoch": 1.04, + "grad_norm": 1.822066570446833, + "learning_rate": 2.2942376651108158e-06, + "loss": 0.3355, + "step": 1274 + }, + { + "epoch": 1.04, + "grad_norm": 1.968043494993567, + "learning_rate": 2.290979168178348e-06, + "loss": 0.3909, + "step": 1275 + }, + { + "epoch": 1.04, + "grad_norm": 1.8571689944285859, + "learning_rate": 2.287721028801204e-06, + "loss": 0.376, + "step": 1276 + }, + { + "epoch": 1.04, + "grad_norm": 2.003415605331929, + "learning_rate": 2.2844632525528245e-06, + "loss": 0.3439, + "step": 1277 + }, + { + "epoch": 1.04, + "grad_norm": 2.248040597881556, + "learning_rate": 2.2812058450060285e-06, + "loss": 0.3789, + "step": 1278 + }, + { + "epoch": 1.04, + "grad_norm": 1.8018969815730068, + "learning_rate": 2.2779488117330032e-06, + "loss": 0.3756, + "step": 1279 + }, + { + "epoch": 1.04, + "grad_norm": 1.90374397055853, + "learning_rate": 2.2746921583052967e-06, + "loss": 0.4126, + "step": 1280 + }, + { + "epoch": 1.04, + "grad_norm": 1.8558365521624263, + "learning_rate": 2.2714358902938073e-06, + "loss": 0.3959, + "step": 1281 + }, + { + "epoch": 1.05, + "grad_norm": 1.8375175796231433, + "learning_rate": 2.268180013268774e-06, + "loss": 0.4048, + "step": 1282 + }, + { + "epoch": 1.05, + "grad_norm": 1.984205865069469, + "learning_rate": 2.2649245327997674e-06, + "loss": 0.4039, + "step": 1283 + }, + { + "epoch": 1.05, + "grad_norm": 1.8933532928718015, + "learning_rate": 2.261669454455679e-06, + "loss": 0.3781, + "step": 1284 + }, + { + "epoch": 1.05, + "grad_norm": 1.9740915743952114, + "learning_rate": 2.2584147838047116e-06, + "loss": 0.4003, + "step": 1285 + }, + { + "epoch": 1.05, + "grad_norm": 1.8808844925592019, + "learning_rate": 2.2551605264143725e-06, + "loss": 0.3449, + "step": 1286 + }, + { + "epoch": 1.05, + "grad_norm": 1.9307797122579196, + "learning_rate": 2.251906687851461e-06, + "loss": 0.4182, + "step": 1287 + }, + { + "epoch": 1.05, + "grad_norm": 1.8492505145939904, + "learning_rate": 2.2486532736820614e-06, + "loss": 0.3736, + "step": 1288 + }, + { + "epoch": 1.05, + "grad_norm": 1.8826597143825838, + "learning_rate": 2.245400289471528e-06, + "loss": 0.3987, + "step": 1289 + }, + { + "epoch": 1.05, + "grad_norm": 1.8696499317715565, + "learning_rate": 2.242147740784484e-06, + "loss": 0.3725, + "step": 1290 + }, + { + "epoch": 1.05, + "grad_norm": 2.0572316139676463, + "learning_rate": 2.2388956331848057e-06, + "loss": 0.3777, + "step": 1291 + }, + { + "epoch": 1.05, + "grad_norm": 1.9916048666817696, + "learning_rate": 2.2356439722356154e-06, + "loss": 0.3435, + "step": 1292 + }, + { + "epoch": 1.05, + "grad_norm": 1.7903849297787813, + "learning_rate": 2.2323927634992706e-06, + "loss": 0.3691, + "step": 1293 + }, + { + "epoch": 1.06, + "grad_norm": 1.8840722711485807, + "learning_rate": 2.2291420125373555e-06, + "loss": 0.3619, + "step": 1294 + }, + { + "epoch": 1.06, + "grad_norm": 1.853222255447046, + "learning_rate": 2.225891724910672e-06, + "loss": 0.3406, + "step": 1295 + }, + { + "epoch": 1.06, + "grad_norm": 1.8075515802139996, + "learning_rate": 2.2226419061792282e-06, + "loss": 0.3775, + "step": 1296 + }, + { + "epoch": 1.06, + "grad_norm": 1.8220733253527324, + "learning_rate": 2.2193925619022323e-06, + "loss": 0.3652, + "step": 1297 + }, + { + "epoch": 1.06, + "grad_norm": 1.9758397782161456, + "learning_rate": 2.2161436976380774e-06, + "loss": 0.3825, + "step": 1298 + }, + { + "epoch": 1.06, + "grad_norm": 2.0469053125573202, + "learning_rate": 2.212895318944338e-06, + "loss": 0.4162, + "step": 1299 + }, + { + "epoch": 1.06, + "grad_norm": 1.8037669439194224, + "learning_rate": 2.2096474313777574e-06, + "loss": 0.3584, + "step": 1300 + }, + { + "epoch": 1.06, + "grad_norm": 1.8852980241376032, + "learning_rate": 2.206400040494238e-06, + "loss": 0.3786, + "step": 1301 + }, + { + "epoch": 1.06, + "grad_norm": 1.8014277477129081, + "learning_rate": 2.2031531518488345e-06, + "loss": 0.4126, + "step": 1302 + }, + { + "epoch": 1.06, + "grad_norm": 1.844230526856602, + "learning_rate": 2.1999067709957407e-06, + "loss": 0.4005, + "step": 1303 + }, + { + "epoch": 1.06, + "grad_norm": 1.9775624321749639, + "learning_rate": 2.1966609034882825e-06, + "loss": 0.4279, + "step": 1304 + }, + { + "epoch": 1.06, + "grad_norm": 1.7752280618538778, + "learning_rate": 2.193415554878907e-06, + "loss": 0.3512, + "step": 1305 + }, + { + "epoch": 1.07, + "grad_norm": 1.8490455260047038, + "learning_rate": 2.1901707307191743e-06, + "loss": 0.3828, + "step": 1306 + }, + { + "epoch": 1.07, + "grad_norm": 5.328150832014928, + "learning_rate": 2.1869264365597477e-06, + "loss": 0.3909, + "step": 1307 + }, + { + "epoch": 1.07, + "grad_norm": 1.8437062886123319, + "learning_rate": 2.1836826779503838e-06, + "loss": 0.37, + "step": 1308 + }, + { + "epoch": 1.07, + "grad_norm": 2.008796830412121, + "learning_rate": 2.1804394604399204e-06, + "loss": 0.4077, + "step": 1309 + }, + { + "epoch": 1.07, + "grad_norm": 1.800679268264127, + "learning_rate": 2.1771967895762736e-06, + "loss": 0.3679, + "step": 1310 + }, + { + "epoch": 1.07, + "grad_norm": 1.8462133413299637, + "learning_rate": 2.173954670906423e-06, + "loss": 0.3602, + "step": 1311 + }, + { + "epoch": 1.07, + "grad_norm": 1.809976917930169, + "learning_rate": 2.1707131099764022e-06, + "loss": 0.3899, + "step": 1312 + }, + { + "epoch": 1.07, + "grad_norm": 1.8544861012991105, + "learning_rate": 2.1674721123312924e-06, + "loss": 0.3747, + "step": 1313 + }, + { + "epoch": 1.07, + "grad_norm": 1.8852269898368, + "learning_rate": 2.1642316835152106e-06, + "loss": 0.4467, + "step": 1314 + }, + { + "epoch": 1.07, + "grad_norm": 1.9122728391881445, + "learning_rate": 2.1609918290713007e-06, + "loss": 0.3402, + "step": 1315 + }, + { + "epoch": 1.07, + "grad_norm": 1.9590310432156601, + "learning_rate": 2.1577525545417254e-06, + "loss": 0.3732, + "step": 1316 + }, + { + "epoch": 1.07, + "grad_norm": 1.8276147883157745, + "learning_rate": 2.1545138654676525e-06, + "loss": 0.3953, + "step": 1317 + }, + { + "epoch": 1.08, + "grad_norm": 1.8133703409989375, + "learning_rate": 2.151275767389252e-06, + "loss": 0.3539, + "step": 1318 + }, + { + "epoch": 1.08, + "grad_norm": 1.8006183709975836, + "learning_rate": 2.148038265845678e-06, + "loss": 0.4006, + "step": 1319 + }, + { + "epoch": 1.08, + "grad_norm": 1.8947220090164194, + "learning_rate": 2.144801366375069e-06, + "loss": 0.4406, + "step": 1320 + }, + { + "epoch": 1.08, + "grad_norm": 1.8280103512099313, + "learning_rate": 2.141565074514531e-06, + "loss": 0.3815, + "step": 1321 + }, + { + "epoch": 1.08, + "grad_norm": 1.8706012819390525, + "learning_rate": 2.138329395800132e-06, + "loss": 0.3445, + "step": 1322 + }, + { + "epoch": 1.08, + "grad_norm": 1.9063701163877025, + "learning_rate": 2.1350943357668905e-06, + "loss": 0.3983, + "step": 1323 + }, + { + "epoch": 1.08, + "grad_norm": 2.033333592395131, + "learning_rate": 2.131859899948765e-06, + "loss": 0.3686, + "step": 1324 + }, + { + "epoch": 1.08, + "grad_norm": 2.0894724502176425, + "learning_rate": 2.1286260938786497e-06, + "loss": 0.3811, + "step": 1325 + }, + { + "epoch": 1.08, + "grad_norm": 1.9145691870270913, + "learning_rate": 2.125392923088358e-06, + "loss": 0.3783, + "step": 1326 + }, + { + "epoch": 1.08, + "grad_norm": 1.941699323344672, + "learning_rate": 2.1221603931086193e-06, + "loss": 0.3842, + "step": 1327 + }, + { + "epoch": 1.08, + "grad_norm": 2.0079800551627565, + "learning_rate": 2.118928509469066e-06, + "loss": 0.3885, + "step": 1328 + }, + { + "epoch": 1.08, + "grad_norm": 1.851351482771633, + "learning_rate": 2.1156972776982238e-06, + "loss": 0.3281, + "step": 1329 + }, + { + "epoch": 1.08, + "grad_norm": 1.9104937018736412, + "learning_rate": 2.112466703323504e-06, + "loss": 0.4231, + "step": 1330 + }, + { + "epoch": 1.09, + "grad_norm": 1.92374307717419, + "learning_rate": 2.1092367918711935e-06, + "loss": 0.3702, + "step": 1331 + }, + { + "epoch": 1.09, + "grad_norm": 1.8725737952655952, + "learning_rate": 2.1060075488664453e-06, + "loss": 0.3591, + "step": 1332 + }, + { + "epoch": 1.09, + "grad_norm": 1.850042908610832, + "learning_rate": 2.1027789798332688e-06, + "loss": 0.3368, + "step": 1333 + }, + { + "epoch": 1.09, + "grad_norm": 1.9324592525287807, + "learning_rate": 2.0995510902945197e-06, + "loss": 0.3676, + "step": 1334 + }, + { + "epoch": 1.09, + "grad_norm": 1.9116116557564555, + "learning_rate": 2.0963238857718934e-06, + "loss": 0.3817, + "step": 1335 + }, + { + "epoch": 1.09, + "grad_norm": 1.9148726445140338, + "learning_rate": 2.0930973717859117e-06, + "loss": 0.3704, + "step": 1336 + }, + { + "epoch": 1.09, + "grad_norm": 1.8376871831619126, + "learning_rate": 2.089871553855915e-06, + "loss": 0.3521, + "step": 1337 + }, + { + "epoch": 1.09, + "grad_norm": 2.069303925978208, + "learning_rate": 2.086646437500054e-06, + "loss": 0.3848, + "step": 1338 + }, + { + "epoch": 1.09, + "grad_norm": 1.876178784774616, + "learning_rate": 2.08342202823528e-06, + "loss": 0.3697, + "step": 1339 + }, + { + "epoch": 1.09, + "grad_norm": 1.8981757166548485, + "learning_rate": 2.0801983315773317e-06, + "loss": 0.3864, + "step": 1340 + }, + { + "epoch": 1.09, + "grad_norm": 1.8313223303972075, + "learning_rate": 2.0769753530407317e-06, + "loss": 0.3768, + "step": 1341 + }, + { + "epoch": 1.09, + "grad_norm": 1.9073767874852925, + "learning_rate": 2.073753098138773e-06, + "loss": 0.3991, + "step": 1342 + }, + { + "epoch": 1.1, + "grad_norm": 1.837313805268737, + "learning_rate": 2.0705315723835116e-06, + "loss": 0.3959, + "step": 1343 + }, + { + "epoch": 1.1, + "grad_norm": 1.9539946764244502, + "learning_rate": 2.067310781285755e-06, + "loss": 0.4305, + "step": 1344 + }, + { + "epoch": 1.1, + "grad_norm": 2.019270181770809, + "learning_rate": 2.0640907303550545e-06, + "loss": 0.3601, + "step": 1345 + }, + { + "epoch": 1.1, + "grad_norm": 2.406213238917182, + "learning_rate": 2.0608714250996954e-06, + "loss": 0.4426, + "step": 1346 + }, + { + "epoch": 1.1, + "grad_norm": 1.9236578073704644, + "learning_rate": 2.0576528710266875e-06, + "loss": 0.4038, + "step": 1347 + }, + { + "epoch": 1.1, + "grad_norm": 2.048182172212149, + "learning_rate": 2.054435073641756e-06, + "loss": 0.3746, + "step": 1348 + }, + { + "epoch": 1.1, + "grad_norm": 1.928863945427719, + "learning_rate": 2.0512180384493306e-06, + "loss": 0.3894, + "step": 1349 + }, + { + "epoch": 1.1, + "grad_norm": 1.8335551339682872, + "learning_rate": 2.0480017709525372e-06, + "loss": 0.3693, + "step": 1350 + }, + { + "epoch": 1.1, + "grad_norm": 1.9647819756067608, + "learning_rate": 2.044786276653189e-06, + "loss": 0.3781, + "step": 1351 + }, + { + "epoch": 1.1, + "grad_norm": 2.12907859222308, + "learning_rate": 2.041571561051777e-06, + "loss": 0.4171, + "step": 1352 + }, + { + "epoch": 1.1, + "grad_norm": 1.9030554994611362, + "learning_rate": 2.0383576296474595e-06, + "loss": 0.3871, + "step": 1353 + }, + { + "epoch": 1.1, + "grad_norm": 1.8482128197200014, + "learning_rate": 2.0351444879380533e-06, + "loss": 0.3801, + "step": 1354 + }, + { + "epoch": 1.11, + "grad_norm": 1.9237098856083394, + "learning_rate": 2.031932141420026e-06, + "loss": 0.397, + "step": 1355 + }, + { + "epoch": 1.11, + "grad_norm": 1.9292461604759314, + "learning_rate": 2.0287205955884812e-06, + "loss": 0.3808, + "step": 1356 + }, + { + "epoch": 1.11, + "grad_norm": 1.905891034454967, + "learning_rate": 2.025509855937156e-06, + "loss": 0.3991, + "step": 1357 + }, + { + "epoch": 1.11, + "grad_norm": 1.8451385574242787, + "learning_rate": 2.0222999279584084e-06, + "loss": 0.3801, + "step": 1358 + }, + { + "epoch": 1.11, + "grad_norm": 1.949400009057099, + "learning_rate": 2.0190908171432073e-06, + "loss": 0.3892, + "step": 1359 + }, + { + "epoch": 1.11, + "grad_norm": 1.9605363810464835, + "learning_rate": 2.0158825289811214e-06, + "loss": 0.3965, + "step": 1360 + }, + { + "epoch": 1.11, + "grad_norm": 1.8606173348780064, + "learning_rate": 2.012675068960315e-06, + "loss": 0.3954, + "step": 1361 + }, + { + "epoch": 1.11, + "grad_norm": 1.894555038278285, + "learning_rate": 2.009468442567537e-06, + "loss": 0.3872, + "step": 1362 + }, + { + "epoch": 1.11, + "grad_norm": 1.8879641436732342, + "learning_rate": 2.006262655288106e-06, + "loss": 0.381, + "step": 1363 + }, + { + "epoch": 1.11, + "grad_norm": 6.804463123370788, + "learning_rate": 2.003057712605908e-06, + "loss": 0.3598, + "step": 1364 + }, + { + "epoch": 1.11, + "grad_norm": 1.9484231062475323, + "learning_rate": 1.9998536200033843e-06, + "loss": 0.387, + "step": 1365 + }, + { + "epoch": 1.11, + "grad_norm": 1.9430636182866459, + "learning_rate": 1.996650382961521e-06, + "loss": 0.3815, + "step": 1366 + }, + { + "epoch": 1.12, + "grad_norm": 1.8099872908810362, + "learning_rate": 1.9934480069598418e-06, + "loss": 0.3931, + "step": 1367 + }, + { + "epoch": 1.12, + "grad_norm": 2.0871498559503583, + "learning_rate": 1.990246497476396e-06, + "loss": 0.3946, + "step": 1368 + }, + { + "epoch": 1.12, + "grad_norm": 1.9534152521538926, + "learning_rate": 1.9870458599877524e-06, + "loss": 0.3998, + "step": 1369 + }, + { + "epoch": 1.12, + "grad_norm": 1.9712355359168434, + "learning_rate": 1.9838460999689854e-06, + "loss": 0.3741, + "step": 1370 + }, + { + "epoch": 1.12, + "grad_norm": 1.8831191819719022, + "learning_rate": 1.980647222893671e-06, + "loss": 0.3758, + "step": 1371 + }, + { + "epoch": 1.12, + "grad_norm": 2.03493312021646, + "learning_rate": 1.977449234233875e-06, + "loss": 0.4066, + "step": 1372 + }, + { + "epoch": 1.12, + "grad_norm": 1.9837157371609282, + "learning_rate": 1.9742521394601413e-06, + "loss": 0.3757, + "step": 1373 + }, + { + "epoch": 1.12, + "grad_norm": 1.9871704920253919, + "learning_rate": 1.9710559440414867e-06, + "loss": 0.3811, + "step": 1374 + }, + { + "epoch": 1.12, + "grad_norm": 1.8609975534569105, + "learning_rate": 1.9678606534453874e-06, + "loss": 0.3709, + "step": 1375 + }, + { + "epoch": 1.12, + "grad_norm": 1.8599855946550903, + "learning_rate": 1.9646662731377737e-06, + "loss": 0.3589, + "step": 1376 + }, + { + "epoch": 1.12, + "grad_norm": 2.0183183444158224, + "learning_rate": 1.9614728085830185e-06, + "loss": 0.3521, + "step": 1377 + }, + { + "epoch": 1.12, + "grad_norm": 1.9976152320569405, + "learning_rate": 1.958280265243927e-06, + "loss": 0.3757, + "step": 1378 + }, + { + "epoch": 1.13, + "grad_norm": 1.9951401325370672, + "learning_rate": 1.9550886485817313e-06, + "loss": 0.3947, + "step": 1379 + }, + { + "epoch": 1.13, + "grad_norm": 1.9553672687038417, + "learning_rate": 1.9518979640560737e-06, + "loss": 0.3473, + "step": 1380 + }, + { + "epoch": 1.13, + "grad_norm": 1.9340367763443969, + "learning_rate": 1.9487082171250057e-06, + "loss": 0.37, + "step": 1381 + }, + { + "epoch": 1.13, + "grad_norm": 1.8996712185125788, + "learning_rate": 1.9455194132449745e-06, + "loss": 0.3924, + "step": 1382 + }, + { + "epoch": 1.13, + "grad_norm": 1.9351658663427442, + "learning_rate": 1.9423315578708126e-06, + "loss": 0.3959, + "step": 1383 + }, + { + "epoch": 1.13, + "grad_norm": 2.0174109611058504, + "learning_rate": 1.939144656455731e-06, + "loss": 0.3987, + "step": 1384 + }, + { + "epoch": 1.13, + "grad_norm": 1.76886531168205, + "learning_rate": 1.9359587144513086e-06, + "loss": 0.4277, + "step": 1385 + }, + { + "epoch": 1.13, + "grad_norm": 2.1774228741508455, + "learning_rate": 1.9327737373074834e-06, + "loss": 0.4474, + "step": 1386 + }, + { + "epoch": 1.13, + "grad_norm": 1.8335022286037221, + "learning_rate": 1.929589730472543e-06, + "loss": 0.3586, + "step": 1387 + }, + { + "epoch": 1.13, + "grad_norm": 1.944762597816562, + "learning_rate": 1.926406699393114e-06, + "loss": 0.3916, + "step": 1388 + }, + { + "epoch": 1.13, + "grad_norm": 1.9158836718088024, + "learning_rate": 1.9232246495141554e-06, + "loss": 0.3471, + "step": 1389 + }, + { + "epoch": 1.13, + "grad_norm": 1.9546368466405357, + "learning_rate": 1.920043586278947e-06, + "loss": 0.3747, + "step": 1390 + }, + { + "epoch": 1.14, + "grad_norm": 1.9070019014660136, + "learning_rate": 1.9168635151290803e-06, + "loss": 0.3524, + "step": 1391 + }, + { + "epoch": 1.14, + "grad_norm": 2.023146490194608, + "learning_rate": 1.9136844415044502e-06, + "loss": 0.3707, + "step": 1392 + }, + { + "epoch": 1.14, + "grad_norm": 1.8809251159178713, + "learning_rate": 1.910506370843246e-06, + "loss": 0.3801, + "step": 1393 + }, + { + "epoch": 1.14, + "grad_norm": 2.0409011175956784, + "learning_rate": 1.9073293085819402e-06, + "loss": 0.373, + "step": 1394 + }, + { + "epoch": 1.14, + "grad_norm": 2.0117643519136315, + "learning_rate": 1.9041532601552804e-06, + "loss": 0.3645, + "step": 1395 + }, + { + "epoch": 1.14, + "grad_norm": 1.9716378326274158, + "learning_rate": 1.9009782309962805e-06, + "loss": 0.3614, + "step": 1396 + }, + { + "epoch": 1.14, + "grad_norm": 1.9329872273189466, + "learning_rate": 1.8978042265362103e-06, + "loss": 0.3551, + "step": 1397 + }, + { + "epoch": 1.14, + "grad_norm": 1.9199554634763143, + "learning_rate": 1.8946312522045874e-06, + "loss": 0.3902, + "step": 1398 + }, + { + "epoch": 1.14, + "grad_norm": 1.9590655710866773, + "learning_rate": 1.891459313429167e-06, + "loss": 0.4142, + "step": 1399 + }, + { + "epoch": 1.14, + "grad_norm": 2.0331664011816972, + "learning_rate": 1.8882884156359324e-06, + "loss": 0.3656, + "step": 1400 + }, + { + "epoch": 1.14, + "grad_norm": 2.0472909494424583, + "learning_rate": 1.8851185642490863e-06, + "loss": 0.3886, + "step": 1401 + }, + { + "epoch": 1.14, + "grad_norm": 1.9929489595454677, + "learning_rate": 1.8819497646910408e-06, + "loss": 0.3672, + "step": 1402 + }, + { + "epoch": 1.15, + "grad_norm": 1.9438211462442658, + "learning_rate": 1.87878202238241e-06, + "loss": 0.3713, + "step": 1403 + }, + { + "epoch": 1.15, + "grad_norm": 1.9090031612890588, + "learning_rate": 1.8756153427419996e-06, + "loss": 0.3806, + "step": 1404 + }, + { + "epoch": 1.15, + "grad_norm": 1.8225379267675694, + "learning_rate": 1.872449731186796e-06, + "loss": 0.3412, + "step": 1405 + }, + { + "epoch": 1.15, + "grad_norm": 1.7944071121109437, + "learning_rate": 1.86928519313196e-06, + "loss": 0.3642, + "step": 1406 + }, + { + "epoch": 1.15, + "grad_norm": 1.9414616279338623, + "learning_rate": 1.8661217339908142e-06, + "loss": 0.3806, + "step": 1407 + }, + { + "epoch": 1.15, + "grad_norm": 1.944356212181711, + "learning_rate": 1.8629593591748374e-06, + "loss": 0.3987, + "step": 1408 + }, + { + "epoch": 1.15, + "grad_norm": 1.857841085738498, + "learning_rate": 1.8597980740936528e-06, + "loss": 0.3899, + "step": 1409 + }, + { + "epoch": 1.15, + "grad_norm": 1.8710356295384132, + "learning_rate": 1.8566378841550205e-06, + "loss": 0.3784, + "step": 1410 + }, + { + "epoch": 1.15, + "grad_norm": 1.8728296119496737, + "learning_rate": 1.8534787947648247e-06, + "loss": 0.3867, + "step": 1411 + }, + { + "epoch": 1.15, + "grad_norm": 1.8738844694805654, + "learning_rate": 1.8503208113270687e-06, + "loss": 0.3696, + "step": 1412 + }, + { + "epoch": 1.15, + "grad_norm": 1.9649370685779552, + "learning_rate": 1.8471639392438648e-06, + "loss": 0.3986, + "step": 1413 + }, + { + "epoch": 1.15, + "grad_norm": 1.7859555369523812, + "learning_rate": 1.8440081839154222e-06, + "loss": 0.3871, + "step": 1414 + }, + { + "epoch": 1.16, + "grad_norm": 1.8610430021362592, + "learning_rate": 1.840853550740041e-06, + "loss": 0.333, + "step": 1415 + }, + { + "epoch": 1.16, + "grad_norm": 1.9871037672382785, + "learning_rate": 1.8377000451141013e-06, + "loss": 0.3655, + "step": 1416 + }, + { + "epoch": 1.16, + "grad_norm": 2.0510993717790544, + "learning_rate": 1.8345476724320549e-06, + "loss": 0.3345, + "step": 1417 + }, + { + "epoch": 1.16, + "grad_norm": 2.022865297999793, + "learning_rate": 1.8313964380864157e-06, + "loss": 0.4238, + "step": 1418 + }, + { + "epoch": 1.16, + "grad_norm": 2.0272213314003786, + "learning_rate": 1.8282463474677485e-06, + "loss": 0.3775, + "step": 1419 + }, + { + "epoch": 1.16, + "grad_norm": 2.006744012043913, + "learning_rate": 1.825097405964665e-06, + "loss": 0.3886, + "step": 1420 + }, + { + "epoch": 1.16, + "grad_norm": 2.0596399522136406, + "learning_rate": 1.8219496189638065e-06, + "loss": 0.4091, + "step": 1421 + }, + { + "epoch": 1.16, + "grad_norm": 1.8816895162930982, + "learning_rate": 1.8188029918498434e-06, + "loss": 0.4065, + "step": 1422 + }, + { + "epoch": 1.16, + "grad_norm": 1.9988370328142775, + "learning_rate": 1.8156575300054607e-06, + "loss": 0.3968, + "step": 1423 + }, + { + "epoch": 1.16, + "grad_norm": 2.0379288149529216, + "learning_rate": 1.8125132388113497e-06, + "loss": 0.3893, + "step": 1424 + }, + { + "epoch": 1.16, + "grad_norm": 1.8764951987892278, + "learning_rate": 1.8093701236461999e-06, + "loss": 0.3757, + "step": 1425 + }, + { + "epoch": 1.16, + "grad_norm": 1.9911843473469748, + "learning_rate": 1.806228189886688e-06, + "loss": 0.3891, + "step": 1426 + }, + { + "epoch": 1.17, + "grad_norm": 1.9631453513585595, + "learning_rate": 1.8030874429074701e-06, + "loss": 0.3969, + "step": 1427 + }, + { + "epoch": 1.17, + "grad_norm": 1.8998526626952037, + "learning_rate": 1.7999478880811735e-06, + "loss": 0.3919, + "step": 1428 + }, + { + "epoch": 1.17, + "grad_norm": 1.8805553933080315, + "learning_rate": 1.7968095307783845e-06, + "loss": 0.3767, + "step": 1429 + }, + { + "epoch": 1.17, + "grad_norm": 1.9958093732421776, + "learning_rate": 1.7936723763676426e-06, + "loss": 0.3861, + "step": 1430 + }, + { + "epoch": 1.17, + "grad_norm": 1.8587137598489651, + "learning_rate": 1.7905364302154264e-06, + "loss": 0.3289, + "step": 1431 + }, + { + "epoch": 1.17, + "grad_norm": 2.0380004642313785, + "learning_rate": 1.7874016976861504e-06, + "loss": 0.3531, + "step": 1432 + }, + { + "epoch": 1.17, + "grad_norm": 1.9171820086465794, + "learning_rate": 1.784268184142154e-06, + "loss": 0.3986, + "step": 1433 + }, + { + "epoch": 1.17, + "grad_norm": 1.95855879390137, + "learning_rate": 1.7811358949436874e-06, + "loss": 0.3402, + "step": 1434 + }, + { + "epoch": 1.17, + "grad_norm": 1.9995990338040457, + "learning_rate": 1.7780048354489101e-06, + "loss": 0.3599, + "step": 1435 + }, + { + "epoch": 1.17, + "grad_norm": 1.9243145774410442, + "learning_rate": 1.7748750110138768e-06, + "loss": 0.4399, + "step": 1436 + }, + { + "epoch": 1.17, + "grad_norm": 2.279285862974166, + "learning_rate": 1.7717464269925288e-06, + "loss": 0.3614, + "step": 1437 + }, + { + "epoch": 1.17, + "grad_norm": 1.9005095716347011, + "learning_rate": 1.7686190887366875e-06, + "loss": 0.3665, + "step": 1438 + }, + { + "epoch": 1.18, + "grad_norm": 1.8076423185524721, + "learning_rate": 1.7654930015960401e-06, + "loss": 0.3408, + "step": 1439 + }, + { + "epoch": 1.18, + "grad_norm": 1.8762893879880087, + "learning_rate": 1.762368170918136e-06, + "loss": 0.39, + "step": 1440 + }, + { + "epoch": 1.18, + "grad_norm": 2.0153368993119556, + "learning_rate": 1.7592446020483762e-06, + "loss": 0.3539, + "step": 1441 + }, + { + "epoch": 1.18, + "grad_norm": 1.9585515808006808, + "learning_rate": 1.7561223003299994e-06, + "loss": 0.3956, + "step": 1442 + }, + { + "epoch": 1.18, + "grad_norm": 2.124848103864915, + "learning_rate": 1.7530012711040794e-06, + "loss": 0.4119, + "step": 1443 + }, + { + "epoch": 1.18, + "grad_norm": 2.012402459921111, + "learning_rate": 1.749881519709514e-06, + "loss": 0.408, + "step": 1444 + }, + { + "epoch": 1.18, + "grad_norm": 1.9649268732755643, + "learning_rate": 1.7467630514830136e-06, + "loss": 0.3283, + "step": 1445 + }, + { + "epoch": 1.18, + "grad_norm": 1.8596310758669552, + "learning_rate": 1.7436458717590931e-06, + "loss": 0.4354, + "step": 1446 + }, + { + "epoch": 1.18, + "grad_norm": 1.9102148486337966, + "learning_rate": 1.7405299858700648e-06, + "loss": 0.3954, + "step": 1447 + }, + { + "epoch": 1.18, + "grad_norm": 1.8553487771224224, + "learning_rate": 1.737415399146027e-06, + "loss": 0.3668, + "step": 1448 + }, + { + "epoch": 1.18, + "grad_norm": 2.1142472778200756, + "learning_rate": 1.7343021169148554e-06, + "loss": 0.3745, + "step": 1449 + }, + { + "epoch": 1.18, + "grad_norm": 1.9058887276269199, + "learning_rate": 1.7311901445021955e-06, + "loss": 0.3818, + "step": 1450 + }, + { + "epoch": 1.19, + "grad_norm": 2.0622661899571666, + "learning_rate": 1.7280794872314499e-06, + "loss": 0.3961, + "step": 1451 + }, + { + "epoch": 1.19, + "grad_norm": 1.8962754770592172, + "learning_rate": 1.7249701504237737e-06, + "loss": 0.3586, + "step": 1452 + }, + { + "epoch": 1.19, + "grad_norm": 1.8165490259194481, + "learning_rate": 1.7218621393980606e-06, + "loss": 0.3311, + "step": 1453 + }, + { + "epoch": 1.19, + "grad_norm": 1.9977375977133494, + "learning_rate": 1.7187554594709396e-06, + "loss": 0.3674, + "step": 1454 + }, + { + "epoch": 1.19, + "grad_norm": 1.8504323227168384, + "learning_rate": 1.7156501159567607e-06, + "loss": 0.3743, + "step": 1455 + }, + { + "epoch": 1.19, + "grad_norm": 1.9541250949627105, + "learning_rate": 1.7125461141675881e-06, + "loss": 0.3812, + "step": 1456 + }, + { + "epoch": 1.19, + "grad_norm": 1.993766367538168, + "learning_rate": 1.7094434594131914e-06, + "loss": 0.355, + "step": 1457 + }, + { + "epoch": 1.19, + "grad_norm": 1.851815452351873, + "learning_rate": 1.7063421570010349e-06, + "loss": 0.3792, + "step": 1458 + }, + { + "epoch": 1.19, + "grad_norm": 1.8699896985814497, + "learning_rate": 1.7032422122362704e-06, + "loss": 0.345, + "step": 1459 + }, + { + "epoch": 1.19, + "grad_norm": 1.941362367589001, + "learning_rate": 1.700143630421727e-06, + "loss": 0.3735, + "step": 1460 + }, + { + "epoch": 1.19, + "grad_norm": 1.844833441576945, + "learning_rate": 1.6970464168579034e-06, + "loss": 0.3883, + "step": 1461 + }, + { + "epoch": 1.19, + "grad_norm": 1.9382330200940399, + "learning_rate": 1.6939505768429548e-06, + "loss": 0.3451, + "step": 1462 + }, + { + "epoch": 1.2, + "grad_norm": 1.9404379114850492, + "learning_rate": 1.6908561156726894e-06, + "loss": 0.3886, + "step": 1463 + }, + { + "epoch": 1.2, + "grad_norm": 1.89967752240511, + "learning_rate": 1.6877630386405567e-06, + "loss": 0.4322, + "step": 1464 + }, + { + "epoch": 1.2, + "grad_norm": 1.9542258627644085, + "learning_rate": 1.6846713510376363e-06, + "loss": 0.4143, + "step": 1465 + }, + { + "epoch": 1.2, + "grad_norm": 2.0224476812069305, + "learning_rate": 1.6815810581526337e-06, + "loss": 0.3885, + "step": 1466 + }, + { + "epoch": 1.2, + "grad_norm": 1.9984358815769925, + "learning_rate": 1.6784921652718666e-06, + "loss": 0.326, + "step": 1467 + }, + { + "epoch": 1.2, + "grad_norm": 1.9112545672749313, + "learning_rate": 1.675404677679259e-06, + "loss": 0.3818, + "step": 1468 + }, + { + "epoch": 1.2, + "grad_norm": 1.8535662369823578, + "learning_rate": 1.6723186006563309e-06, + "loss": 0.348, + "step": 1469 + }, + { + "epoch": 1.2, + "grad_norm": 1.9484817526163822, + "learning_rate": 1.6692339394821877e-06, + "loss": 0.3357, + "step": 1470 + }, + { + "epoch": 1.2, + "grad_norm": 1.898163029912662, + "learning_rate": 1.6661506994335164e-06, + "loss": 0.3755, + "step": 1471 + }, + { + "epoch": 1.2, + "grad_norm": 1.8795795559493234, + "learning_rate": 1.6630688857845678e-06, + "loss": 0.3616, + "step": 1472 + }, + { + "epoch": 1.2, + "grad_norm": 1.9167503410588418, + "learning_rate": 1.6599885038071566e-06, + "loss": 0.3592, + "step": 1473 + }, + { + "epoch": 1.2, + "grad_norm": 1.9765253259894953, + "learning_rate": 1.6569095587706485e-06, + "loss": 0.3953, + "step": 1474 + }, + { + "epoch": 1.21, + "grad_norm": 1.9352433621405845, + "learning_rate": 1.6538320559419488e-06, + "loss": 0.3528, + "step": 1475 + }, + { + "epoch": 1.21, + "grad_norm": 2.0111021011512125, + "learning_rate": 1.6507560005854977e-06, + "loss": 0.407, + "step": 1476 + }, + { + "epoch": 1.21, + "grad_norm": 1.8339393905209536, + "learning_rate": 1.6476813979632589e-06, + "loss": 0.3668, + "step": 1477 + }, + { + "epoch": 1.21, + "grad_norm": 1.9309495145983575, + "learning_rate": 1.6446082533347096e-06, + "loss": 0.4106, + "step": 1478 + }, + { + "epoch": 1.21, + "grad_norm": 1.8708341753950297, + "learning_rate": 1.641536571956835e-06, + "loss": 0.3749, + "step": 1479 + }, + { + "epoch": 1.21, + "grad_norm": 1.8244009733234272, + "learning_rate": 1.6384663590841154e-06, + "loss": 0.3832, + "step": 1480 + }, + { + "epoch": 1.21, + "grad_norm": 1.8878853394194013, + "learning_rate": 1.6353976199685222e-06, + "loss": 0.3539, + "step": 1481 + }, + { + "epoch": 1.21, + "grad_norm": 1.8830734244466278, + "learning_rate": 1.6323303598595006e-06, + "loss": 0.3852, + "step": 1482 + }, + { + "epoch": 1.21, + "grad_norm": 1.866253132730359, + "learning_rate": 1.6292645840039697e-06, + "loss": 0.364, + "step": 1483 + }, + { + "epoch": 1.21, + "grad_norm": 1.977321954101075, + "learning_rate": 1.6262002976463098e-06, + "loss": 0.3866, + "step": 1484 + }, + { + "epoch": 1.21, + "grad_norm": 1.9753878011905568, + "learning_rate": 1.62313750602835e-06, + "loss": 0.3999, + "step": 1485 + }, + { + "epoch": 1.21, + "grad_norm": 1.9461948334927384, + "learning_rate": 1.6200762143893659e-06, + "loss": 0.3769, + "step": 1486 + }, + { + "epoch": 1.22, + "grad_norm": 1.9597078370114984, + "learning_rate": 1.6170164279660656e-06, + "loss": 0.3546, + "step": 1487 + }, + { + "epoch": 1.22, + "grad_norm": 2.0333727955548735, + "learning_rate": 1.6139581519925818e-06, + "loss": 0.3631, + "step": 1488 + }, + { + "epoch": 1.22, + "grad_norm": 1.8957200128798963, + "learning_rate": 1.6109013917004657e-06, + "loss": 0.3738, + "step": 1489 + }, + { + "epoch": 1.22, + "grad_norm": 1.8758015207075704, + "learning_rate": 1.6078461523186722e-06, + "loss": 0.3511, + "step": 1490 + }, + { + "epoch": 1.22, + "grad_norm": 1.9539261883496823, + "learning_rate": 1.6047924390735587e-06, + "loss": 0.4074, + "step": 1491 + }, + { + "epoch": 1.22, + "grad_norm": 2.046216911945662, + "learning_rate": 1.6017402571888677e-06, + "loss": 0.3729, + "step": 1492 + }, + { + "epoch": 1.22, + "grad_norm": 2.0334239477316194, + "learning_rate": 1.5986896118857247e-06, + "loss": 0.3999, + "step": 1493 + }, + { + "epoch": 1.22, + "grad_norm": 2.0768274033669556, + "learning_rate": 1.5956405083826266e-06, + "loss": 0.3982, + "step": 1494 + }, + { + "epoch": 1.22, + "grad_norm": 1.9997134218487143, + "learning_rate": 1.592592951895432e-06, + "loss": 0.4319, + "step": 1495 + }, + { + "epoch": 1.22, + "grad_norm": 1.9000589337955354, + "learning_rate": 1.5895469476373545e-06, + "loss": 0.3813, + "step": 1496 + }, + { + "epoch": 1.22, + "grad_norm": 1.8787692854188953, + "learning_rate": 1.5865025008189501e-06, + "loss": 0.3801, + "step": 1497 + }, + { + "epoch": 1.22, + "grad_norm": 1.8346902202639779, + "learning_rate": 1.5834596166481132e-06, + "loss": 0.3533, + "step": 1498 + }, + { + "epoch": 1.23, + "grad_norm": 1.8993496821666367, + "learning_rate": 1.5804183003300627e-06, + "loss": 0.429, + "step": 1499 + }, + { + "epoch": 1.23, + "grad_norm": 2.342530229905022, + "learning_rate": 1.5773785570673378e-06, + "loss": 0.3356, + "step": 1500 + }, + { + "epoch": 1.23, + "grad_norm": 2.1048882391009127, + "learning_rate": 1.5743403920597856e-06, + "loss": 0.3896, + "step": 1501 + }, + { + "epoch": 1.23, + "grad_norm": 1.8528209728378324, + "learning_rate": 1.5713038105045535e-06, + "loss": 0.3307, + "step": 1502 + }, + { + "epoch": 1.23, + "grad_norm": 1.9057632190431548, + "learning_rate": 1.5682688175960797e-06, + "loss": 0.3806, + "step": 1503 + }, + { + "epoch": 1.23, + "grad_norm": 1.8724905465304538, + "learning_rate": 1.5652354185260848e-06, + "loss": 0.3637, + "step": 1504 + }, + { + "epoch": 1.23, + "grad_norm": 1.8484069152287292, + "learning_rate": 1.5622036184835648e-06, + "loss": 0.3161, + "step": 1505 + }, + { + "epoch": 1.23, + "grad_norm": 1.8399814687678377, + "learning_rate": 1.559173422654778e-06, + "loss": 0.3745, + "step": 1506 + }, + { + "epoch": 1.23, + "grad_norm": 1.8838641942793775, + "learning_rate": 1.5561448362232404e-06, + "loss": 0.3537, + "step": 1507 + }, + { + "epoch": 1.23, + "grad_norm": 1.8623848433104377, + "learning_rate": 1.5531178643697142e-06, + "loss": 0.3624, + "step": 1508 + }, + { + "epoch": 1.23, + "grad_norm": 1.8997144759052735, + "learning_rate": 1.5500925122721988e-06, + "loss": 0.3679, + "step": 1509 + }, + { + "epoch": 1.23, + "grad_norm": 1.8976582272389906, + "learning_rate": 1.5470687851059235e-06, + "loss": 0.3736, + "step": 1510 + }, + { + "epoch": 1.24, + "grad_norm": 1.8750760623537808, + "learning_rate": 1.5440466880433388e-06, + "loss": 0.3735, + "step": 1511 + }, + { + "epoch": 1.24, + "grad_norm": 1.990180186983658, + "learning_rate": 1.5410262262541065e-06, + "loss": 0.3797, + "step": 1512 + }, + { + "epoch": 1.24, + "grad_norm": 1.8820633605632435, + "learning_rate": 1.538007404905089e-06, + "loss": 0.3659, + "step": 1513 + }, + { + "epoch": 1.24, + "grad_norm": 1.9458293982836543, + "learning_rate": 1.5349902291603441e-06, + "loss": 0.4092, + "step": 1514 + }, + { + "epoch": 1.24, + "grad_norm": 1.822097097325058, + "learning_rate": 1.5319747041811158e-06, + "loss": 0.3276, + "step": 1515 + }, + { + "epoch": 1.24, + "grad_norm": 2.0516824372881457, + "learning_rate": 1.528960835125822e-06, + "loss": 0.4232, + "step": 1516 + }, + { + "epoch": 1.24, + "grad_norm": 2.0624060387577816, + "learning_rate": 1.5259486271500489e-06, + "loss": 0.3996, + "step": 1517 + }, + { + "epoch": 1.24, + "grad_norm": 1.9158764361943028, + "learning_rate": 1.522938085406542e-06, + "loss": 0.3728, + "step": 1518 + }, + { + "epoch": 1.24, + "grad_norm": 1.9071590654189663, + "learning_rate": 1.5199292150451956e-06, + "loss": 0.3459, + "step": 1519 + }, + { + "epoch": 1.24, + "grad_norm": 1.9532115896688163, + "learning_rate": 1.5169220212130449e-06, + "loss": 0.3513, + "step": 1520 + }, + { + "epoch": 1.24, + "grad_norm": 1.9901825773245059, + "learning_rate": 1.5139165090542574e-06, + "loss": 0.3468, + "step": 1521 + }, + { + "epoch": 1.24, + "grad_norm": 1.7913388603914477, + "learning_rate": 1.510912683710124e-06, + "loss": 0.3381, + "step": 1522 + }, + { + "epoch": 1.24, + "grad_norm": 1.8270379040698477, + "learning_rate": 1.5079105503190497e-06, + "loss": 0.3873, + "step": 1523 + }, + { + "epoch": 1.25, + "grad_norm": 1.9259224146444094, + "learning_rate": 1.5049101140165453e-06, + "loss": 0.3553, + "step": 1524 + }, + { + "epoch": 1.25, + "grad_norm": 1.7933642267566716, + "learning_rate": 1.501911379935219e-06, + "loss": 0.3928, + "step": 1525 + }, + { + "epoch": 1.25, + "grad_norm": 1.859002957520952, + "learning_rate": 1.498914353204767e-06, + "loss": 0.3331, + "step": 1526 + }, + { + "epoch": 1.25, + "grad_norm": 1.9280095918192017, + "learning_rate": 1.4959190389519646e-06, + "loss": 0.3902, + "step": 1527 + }, + { + "epoch": 1.25, + "grad_norm": 1.9929705610530277, + "learning_rate": 1.492925442300658e-06, + "loss": 0.3765, + "step": 1528 + }, + { + "epoch": 1.25, + "grad_norm": 2.02617558936789, + "learning_rate": 1.4899335683717546e-06, + "loss": 0.3815, + "step": 1529 + }, + { + "epoch": 1.25, + "grad_norm": 1.8532248246777345, + "learning_rate": 1.4869434222832157e-06, + "loss": 0.3998, + "step": 1530 + }, + { + "epoch": 1.25, + "grad_norm": 1.8616511215661515, + "learning_rate": 1.4839550091500464e-06, + "loss": 0.4005, + "step": 1531 + }, + { + "epoch": 1.25, + "grad_norm": 1.9696593290003677, + "learning_rate": 1.4809683340842885e-06, + "loss": 0.4136, + "step": 1532 + }, + { + "epoch": 1.25, + "grad_norm": 1.9439323576237217, + "learning_rate": 1.477983402195008e-06, + "loss": 0.3674, + "step": 1533 + }, + { + "epoch": 1.25, + "grad_norm": 1.8858064066643994, + "learning_rate": 1.475000218588291e-06, + "loss": 0.3505, + "step": 1534 + }, + { + "epoch": 1.25, + "grad_norm": 1.9565923900750009, + "learning_rate": 1.4720187883672337e-06, + "loss": 0.379, + "step": 1535 + }, + { + "epoch": 1.26, + "grad_norm": 1.9482950589580994, + "learning_rate": 1.4690391166319307e-06, + "loss": 0.3962, + "step": 1536 + }, + { + "epoch": 1.26, + "grad_norm": 1.979462387227227, + "learning_rate": 1.4660612084794701e-06, + "loss": 0.3662, + "step": 1537 + }, + { + "epoch": 1.26, + "grad_norm": 1.894203355197371, + "learning_rate": 1.4630850690039221e-06, + "loss": 0.3703, + "step": 1538 + }, + { + "epoch": 1.26, + "grad_norm": 1.8798042105520323, + "learning_rate": 1.460110703296333e-06, + "loss": 0.3631, + "step": 1539 + }, + { + "epoch": 1.26, + "grad_norm": 1.9687008779986372, + "learning_rate": 1.4571381164447137e-06, + "loss": 0.4081, + "step": 1540 + }, + { + "epoch": 1.26, + "grad_norm": 2.043706332156422, + "learning_rate": 1.454167313534031e-06, + "loss": 0.3629, + "step": 1541 + }, + { + "epoch": 1.26, + "grad_norm": 1.9336401989651433, + "learning_rate": 1.4511982996462038e-06, + "loss": 0.4042, + "step": 1542 + }, + { + "epoch": 1.26, + "grad_norm": 1.9550529998108908, + "learning_rate": 1.4482310798600852e-06, + "loss": 0.3768, + "step": 1543 + }, + { + "epoch": 1.26, + "grad_norm": 1.874147928818456, + "learning_rate": 1.4452656592514633e-06, + "loss": 0.4125, + "step": 1544 + }, + { + "epoch": 1.26, + "grad_norm": 1.848295970105597, + "learning_rate": 1.442302042893048e-06, + "loss": 0.3646, + "step": 1545 + }, + { + "epoch": 1.26, + "grad_norm": 1.991422406332833, + "learning_rate": 1.439340235854462e-06, + "loss": 0.3885, + "step": 1546 + }, + { + "epoch": 1.26, + "grad_norm": 1.89855710617557, + "learning_rate": 1.436380243202233e-06, + "loss": 0.3658, + "step": 1547 + }, + { + "epoch": 1.27, + "grad_norm": 1.8657910310229384, + "learning_rate": 1.4334220699997856e-06, + "loss": 0.3659, + "step": 1548 + }, + { + "epoch": 1.27, + "grad_norm": 1.9035891506078888, + "learning_rate": 1.4304657213074314e-06, + "loss": 0.3662, + "step": 1549 + }, + { + "epoch": 1.27, + "grad_norm": 1.9026573701280374, + "learning_rate": 1.4275112021823618e-06, + "loss": 0.3712, + "step": 1550 + }, + { + "epoch": 1.27, + "grad_norm": 1.9342408780305267, + "learning_rate": 1.4245585176786363e-06, + "loss": 0.355, + "step": 1551 + }, + { + "epoch": 1.27, + "grad_norm": 1.8785254217068754, + "learning_rate": 1.4216076728471794e-06, + "loss": 0.3985, + "step": 1552 + }, + { + "epoch": 1.27, + "grad_norm": 1.9602955113202258, + "learning_rate": 1.4186586727357649e-06, + "loss": 0.4063, + "step": 1553 + }, + { + "epoch": 1.27, + "grad_norm": 2.083823151902659, + "learning_rate": 1.4157115223890136e-06, + "loss": 0.4121, + "step": 1554 + }, + { + "epoch": 1.27, + "grad_norm": 1.8676871403375772, + "learning_rate": 1.4127662268483818e-06, + "loss": 0.3912, + "step": 1555 + }, + { + "epoch": 1.27, + "grad_norm": 1.9120128683776039, + "learning_rate": 1.4098227911521523e-06, + "loss": 0.3453, + "step": 1556 + }, + { + "epoch": 1.27, + "grad_norm": 1.844790264464269, + "learning_rate": 1.4068812203354264e-06, + "loss": 0.3666, + "step": 1557 + }, + { + "epoch": 1.27, + "grad_norm": 1.8477236162312085, + "learning_rate": 1.4039415194301159e-06, + "loss": 0.3652, + "step": 1558 + }, + { + "epoch": 1.27, + "grad_norm": 1.9200270211079769, + "learning_rate": 1.4010036934649334e-06, + "loss": 0.3755, + "step": 1559 + }, + { + "epoch": 1.28, + "grad_norm": 1.8353558471804892, + "learning_rate": 1.3980677474653838e-06, + "loss": 0.3653, + "step": 1560 + }, + { + "epoch": 1.28, + "grad_norm": 1.9621989060334357, + "learning_rate": 1.3951336864537572e-06, + "loss": 0.4104, + "step": 1561 + }, + { + "epoch": 1.28, + "grad_norm": 1.8245538722983388, + "learning_rate": 1.3922015154491194e-06, + "loss": 0.3991, + "step": 1562 + }, + { + "epoch": 1.28, + "grad_norm": 1.933539870056334, + "learning_rate": 1.3892712394673002e-06, + "loss": 0.3877, + "step": 1563 + }, + { + "epoch": 1.28, + "grad_norm": 1.8275785324682217, + "learning_rate": 1.3863428635208915e-06, + "loss": 0.3546, + "step": 1564 + }, + { + "epoch": 1.28, + "grad_norm": 2.0450836317829215, + "learning_rate": 1.3834163926192318e-06, + "loss": 0.3847, + "step": 1565 + }, + { + "epoch": 1.28, + "grad_norm": 3.523986698344347, + "learning_rate": 1.380491831768403e-06, + "loss": 0.3502, + "step": 1566 + }, + { + "epoch": 1.28, + "grad_norm": 1.9164812764116064, + "learning_rate": 1.3775691859712193e-06, + "loss": 0.309, + "step": 1567 + }, + { + "epoch": 1.28, + "grad_norm": 2.0951493120042604, + "learning_rate": 1.3746484602272178e-06, + "loss": 0.3678, + "step": 1568 + }, + { + "epoch": 1.28, + "grad_norm": 1.8843177010635455, + "learning_rate": 1.3717296595326527e-06, + "loss": 0.358, + "step": 1569 + }, + { + "epoch": 1.28, + "grad_norm": 1.9562282189438478, + "learning_rate": 1.3688127888804837e-06, + "loss": 0.4021, + "step": 1570 + }, + { + "epoch": 1.28, + "grad_norm": 1.997781626544885, + "learning_rate": 1.36589785326037e-06, + "loss": 0.4158, + "step": 1571 + }, + { + "epoch": 1.29, + "grad_norm": 1.8805954764404564, + "learning_rate": 1.3629848576586604e-06, + "loss": 0.3678, + "step": 1572 + }, + { + "epoch": 1.29, + "grad_norm": 2.037723153555198, + "learning_rate": 1.3600738070583858e-06, + "loss": 0.3611, + "step": 1573 + }, + { + "epoch": 1.29, + "grad_norm": 1.9504207408498462, + "learning_rate": 1.3571647064392467e-06, + "loss": 0.4096, + "step": 1574 + }, + { + "epoch": 1.29, + "grad_norm": 2.0573869926356494, + "learning_rate": 1.3542575607776117e-06, + "loss": 0.3698, + "step": 1575 + }, + { + "epoch": 1.29, + "grad_norm": 1.9648011988919714, + "learning_rate": 1.3513523750465049e-06, + "loss": 0.3557, + "step": 1576 + }, + { + "epoch": 1.29, + "grad_norm": 2.0566628239070077, + "learning_rate": 1.3484491542155941e-06, + "loss": 0.4099, + "step": 1577 + }, + { + "epoch": 1.29, + "grad_norm": 1.840088910062188, + "learning_rate": 1.3455479032511903e-06, + "loss": 0.3759, + "step": 1578 + }, + { + "epoch": 1.29, + "grad_norm": 1.916068103431673, + "learning_rate": 1.3426486271162326e-06, + "loss": 0.36, + "step": 1579 + }, + { + "epoch": 1.29, + "grad_norm": 1.932989091441797, + "learning_rate": 1.3397513307702817e-06, + "loss": 0.3658, + "step": 1580 + }, + { + "epoch": 1.29, + "grad_norm": 1.8629067871512175, + "learning_rate": 1.3368560191695126e-06, + "loss": 0.3562, + "step": 1581 + }, + { + "epoch": 1.29, + "grad_norm": 2.0118302341661307, + "learning_rate": 1.3339626972667048e-06, + "loss": 0.3878, + "step": 1582 + }, + { + "epoch": 1.29, + "grad_norm": 1.9124583307461076, + "learning_rate": 1.3310713700112348e-06, + "loss": 0.3809, + "step": 1583 + }, + { + "epoch": 1.3, + "grad_norm": 1.9774861213509043, + "learning_rate": 1.328182042349065e-06, + "loss": 0.4137, + "step": 1584 + }, + { + "epoch": 1.3, + "grad_norm": 1.9114216906066048, + "learning_rate": 1.3252947192227388e-06, + "loss": 0.3837, + "step": 1585 + }, + { + "epoch": 1.3, + "grad_norm": 1.8560468375199388, + "learning_rate": 1.3224094055713713e-06, + "loss": 0.3603, + "step": 1586 + }, + { + "epoch": 1.3, + "grad_norm": 1.9212128604014926, + "learning_rate": 1.3195261063306381e-06, + "loss": 0.3458, + "step": 1587 + }, + { + "epoch": 1.3, + "grad_norm": 1.9251208352537634, + "learning_rate": 1.316644826432772e-06, + "loss": 0.3844, + "step": 1588 + }, + { + "epoch": 1.3, + "grad_norm": 1.883081065391109, + "learning_rate": 1.313765570806547e-06, + "loss": 0.4208, + "step": 1589 + }, + { + "epoch": 1.3, + "grad_norm": 1.8564972529452957, + "learning_rate": 1.3108883443772779e-06, + "loss": 0.3622, + "step": 1590 + }, + { + "epoch": 1.3, + "grad_norm": 1.9725309818034906, + "learning_rate": 1.3080131520668075e-06, + "loss": 0.3489, + "step": 1591 + }, + { + "epoch": 1.3, + "grad_norm": 1.9747828638689664, + "learning_rate": 1.3051399987934988e-06, + "loss": 0.38, + "step": 1592 + }, + { + "epoch": 1.3, + "grad_norm": 1.8498395134731278, + "learning_rate": 1.3022688894722271e-06, + "loss": 0.3797, + "step": 1593 + }, + { + "epoch": 1.3, + "grad_norm": 1.8845414148933772, + "learning_rate": 1.2993998290143698e-06, + "loss": 0.3335, + "step": 1594 + }, + { + "epoch": 1.3, + "grad_norm": 1.9610318168301932, + "learning_rate": 1.296532822327801e-06, + "loss": 0.3769, + "step": 1595 + }, + { + "epoch": 1.31, + "grad_norm": 1.8917429842068785, + "learning_rate": 1.2936678743168813e-06, + "loss": 0.3981, + "step": 1596 + }, + { + "epoch": 1.31, + "grad_norm": 2.005525949740854, + "learning_rate": 1.29080498988245e-06, + "loss": 0.3789, + "step": 1597 + }, + { + "epoch": 1.31, + "grad_norm": 1.891996578027132, + "learning_rate": 1.2879441739218152e-06, + "loss": 0.3906, + "step": 1598 + }, + { + "epoch": 1.31, + "grad_norm": 2.0224573297517114, + "learning_rate": 1.285085431328748e-06, + "loss": 0.3852, + "step": 1599 + }, + { + "epoch": 1.31, + "grad_norm": 1.9933585122271171, + "learning_rate": 1.282228766993472e-06, + "loss": 0.3811, + "step": 1600 + }, + { + "epoch": 1.31, + "grad_norm": 1.9655771579152717, + "learning_rate": 1.2793741858026565e-06, + "loss": 0.3799, + "step": 1601 + }, + { + "epoch": 1.31, + "grad_norm": 1.8953068551718162, + "learning_rate": 1.2765216926394047e-06, + "loss": 0.3508, + "step": 1602 + }, + { + "epoch": 1.31, + "grad_norm": 1.8702448937265155, + "learning_rate": 1.2736712923832526e-06, + "loss": 0.3427, + "step": 1603 + }, + { + "epoch": 1.31, + "grad_norm": 1.9279047888369216, + "learning_rate": 1.2708229899101505e-06, + "loss": 0.3755, + "step": 1604 + }, + { + "epoch": 1.31, + "grad_norm": 1.8867926377124098, + "learning_rate": 1.2679767900924647e-06, + "loss": 0.3366, + "step": 1605 + }, + { + "epoch": 1.31, + "grad_norm": 1.8256946570291102, + "learning_rate": 1.2651326977989629e-06, + "loss": 0.3419, + "step": 1606 + }, + { + "epoch": 1.31, + "grad_norm": 1.8845001674022432, + "learning_rate": 1.2622907178948074e-06, + "loss": 0.3593, + "step": 1607 + }, + { + "epoch": 1.32, + "grad_norm": 1.8041550297275601, + "learning_rate": 1.2594508552415474e-06, + "loss": 0.3565, + "step": 1608 + }, + { + "epoch": 1.32, + "grad_norm": 1.929162466271085, + "learning_rate": 1.2566131146971105e-06, + "loss": 0.346, + "step": 1609 + }, + { + "epoch": 1.32, + "grad_norm": 1.9783530922620556, + "learning_rate": 1.2537775011157943e-06, + "loss": 0.3655, + "step": 1610 + }, + { + "epoch": 1.32, + "grad_norm": 1.9493980516637623, + "learning_rate": 1.2509440193482564e-06, + "loss": 0.417, + "step": 1611 + }, + { + "epoch": 1.32, + "grad_norm": 1.8895548928491517, + "learning_rate": 1.2481126742415098e-06, + "loss": 0.3731, + "step": 1612 + }, + { + "epoch": 1.32, + "grad_norm": 1.874868433424839, + "learning_rate": 1.2452834706389122e-06, + "loss": 0.3743, + "step": 1613 + }, + { + "epoch": 1.32, + "grad_norm": 1.917114604759422, + "learning_rate": 1.2424564133801553e-06, + "loss": 0.3412, + "step": 1614 + }, + { + "epoch": 1.32, + "grad_norm": 1.9354723425395528, + "learning_rate": 1.2396315073012636e-06, + "loss": 0.3564, + "step": 1615 + }, + { + "epoch": 1.32, + "grad_norm": 1.9621850514310992, + "learning_rate": 1.2368087572345772e-06, + "loss": 0.348, + "step": 1616 + }, + { + "epoch": 1.32, + "grad_norm": 2.058589411316211, + "learning_rate": 1.233988168008751e-06, + "loss": 0.3679, + "step": 1617 + }, + { + "epoch": 1.32, + "grad_norm": 1.9516795286397743, + "learning_rate": 1.2311697444487431e-06, + "loss": 0.3635, + "step": 1618 + }, + { + "epoch": 1.32, + "grad_norm": 1.9233248775745249, + "learning_rate": 1.2283534913758066e-06, + "loss": 0.3957, + "step": 1619 + }, + { + "epoch": 1.33, + "grad_norm": 1.9303786560618386, + "learning_rate": 1.225539413607482e-06, + "loss": 0.3806, + "step": 1620 + }, + { + "epoch": 1.33, + "grad_norm": 2.030744520145863, + "learning_rate": 1.222727515957588e-06, + "loss": 0.4023, + "step": 1621 + }, + { + "epoch": 1.33, + "grad_norm": 1.9537051918570292, + "learning_rate": 1.2199178032362149e-06, + "loss": 0.3808, + "step": 1622 + }, + { + "epoch": 1.33, + "grad_norm": 1.8928085054817043, + "learning_rate": 1.2171102802497148e-06, + "loss": 0.3982, + "step": 1623 + }, + { + "epoch": 1.33, + "grad_norm": 2.0571192296380296, + "learning_rate": 1.2143049518006952e-06, + "loss": 0.4044, + "step": 1624 + }, + { + "epoch": 1.33, + "grad_norm": 1.856402590326006, + "learning_rate": 1.2115018226880063e-06, + "loss": 0.3977, + "step": 1625 + }, + { + "epoch": 1.33, + "grad_norm": 1.927548078890778, + "learning_rate": 1.208700897706739e-06, + "loss": 0.4048, + "step": 1626 + }, + { + "epoch": 1.33, + "grad_norm": 1.9400375481531664, + "learning_rate": 1.205902181648215e-06, + "loss": 0.3605, + "step": 1627 + }, + { + "epoch": 1.33, + "grad_norm": 1.873775466516257, + "learning_rate": 1.2031056792999726e-06, + "loss": 0.3375, + "step": 1628 + }, + { + "epoch": 1.33, + "grad_norm": 1.9913863168589552, + "learning_rate": 1.2003113954457673e-06, + "loss": 0.3964, + "step": 1629 + }, + { + "epoch": 1.33, + "grad_norm": 1.9685736172926571, + "learning_rate": 1.1975193348655584e-06, + "loss": 0.3587, + "step": 1630 + }, + { + "epoch": 1.33, + "grad_norm": 1.8698671252931964, + "learning_rate": 1.1947295023355022e-06, + "loss": 0.3568, + "step": 1631 + }, + { + "epoch": 1.34, + "grad_norm": 1.9615330930141146, + "learning_rate": 1.1919419026279434e-06, + "loss": 0.385, + "step": 1632 + }, + { + "epoch": 1.34, + "grad_norm": 1.8699401980633292, + "learning_rate": 1.189156540511407e-06, + "loss": 0.362, + "step": 1633 + }, + { + "epoch": 1.34, + "grad_norm": 2.054845402143213, + "learning_rate": 1.186373420750592e-06, + "loss": 0.3746, + "step": 1634 + }, + { + "epoch": 1.34, + "grad_norm": 1.828582523525085, + "learning_rate": 1.1835925481063575e-06, + "loss": 0.3915, + "step": 1635 + }, + { + "epoch": 1.34, + "grad_norm": 1.9369510226251998, + "learning_rate": 1.1808139273357232e-06, + "loss": 0.3736, + "step": 1636 + }, + { + "epoch": 1.34, + "grad_norm": 1.9623351823945685, + "learning_rate": 1.1780375631918544e-06, + "loss": 0.3861, + "step": 1637 + }, + { + "epoch": 1.34, + "grad_norm": 2.057951803903781, + "learning_rate": 1.1752634604240565e-06, + "loss": 0.3988, + "step": 1638 + }, + { + "epoch": 1.34, + "grad_norm": 1.926766218075179, + "learning_rate": 1.1724916237777675e-06, + "loss": 0.3526, + "step": 1639 + }, + { + "epoch": 1.34, + "grad_norm": 1.8312750701887877, + "learning_rate": 1.1697220579945466e-06, + "loss": 0.3518, + "step": 1640 + }, + { + "epoch": 1.34, + "grad_norm": 2.025004547929062, + "learning_rate": 1.1669547678120701e-06, + "loss": 0.3651, + "step": 1641 + }, + { + "epoch": 1.34, + "grad_norm": 2.3363123335351874, + "learning_rate": 1.1641897579641221e-06, + "loss": 0.4033, + "step": 1642 + }, + { + "epoch": 1.34, + "grad_norm": 1.8749245234784346, + "learning_rate": 1.1614270331805844e-06, + "loss": 0.3701, + "step": 1643 + }, + { + "epoch": 1.35, + "grad_norm": 1.8332454151429327, + "learning_rate": 1.1586665981874323e-06, + "loss": 0.3911, + "step": 1644 + }, + { + "epoch": 1.35, + "grad_norm": 2.217946505455479, + "learning_rate": 1.1559084577067206e-06, + "loss": 0.3346, + "step": 1645 + }, + { + "epoch": 1.35, + "grad_norm": 1.799776695931742, + "learning_rate": 1.1531526164565816e-06, + "loss": 0.3489, + "step": 1646 + }, + { + "epoch": 1.35, + "grad_norm": 1.9376934559686718, + "learning_rate": 1.150399079151214e-06, + "loss": 0.3721, + "step": 1647 + }, + { + "epoch": 1.35, + "grad_norm": 1.826040524283735, + "learning_rate": 1.1476478505008753e-06, + "loss": 0.3464, + "step": 1648 + }, + { + "epoch": 1.35, + "grad_norm": 1.9007570045973046, + "learning_rate": 1.144898935211874e-06, + "loss": 0.3859, + "step": 1649 + }, + { + "epoch": 1.35, + "grad_norm": 2.1474984005060334, + "learning_rate": 1.1421523379865603e-06, + "loss": 0.3456, + "step": 1650 + }, + { + "epoch": 1.35, + "grad_norm": 1.843189989485683, + "learning_rate": 1.1394080635233204e-06, + "loss": 0.3052, + "step": 1651 + }, + { + "epoch": 1.35, + "grad_norm": 2.009903889503656, + "learning_rate": 1.136666116516567e-06, + "loss": 0.4498, + "step": 1652 + }, + { + "epoch": 1.35, + "grad_norm": 3.0285468769549473, + "learning_rate": 1.1339265016567294e-06, + "loss": 0.3532, + "step": 1653 + }, + { + "epoch": 1.35, + "grad_norm": 1.8725192886740858, + "learning_rate": 1.1311892236302508e-06, + "loss": 0.3685, + "step": 1654 + }, + { + "epoch": 1.35, + "grad_norm": 1.8726862166869487, + "learning_rate": 1.128454287119573e-06, + "loss": 0.3761, + "step": 1655 + }, + { + "epoch": 1.36, + "grad_norm": 1.8883766624211467, + "learning_rate": 1.1257216968031357e-06, + "loss": 0.3574, + "step": 1656 + }, + { + "epoch": 1.36, + "grad_norm": 1.9004020165185547, + "learning_rate": 1.1229914573553641e-06, + "loss": 0.3638, + "step": 1657 + }, + { + "epoch": 1.36, + "grad_norm": 1.8723325311418417, + "learning_rate": 1.1202635734466612e-06, + "loss": 0.3468, + "step": 1658 + }, + { + "epoch": 1.36, + "grad_norm": 1.804021127084218, + "learning_rate": 1.1175380497434022e-06, + "loss": 0.3534, + "step": 1659 + }, + { + "epoch": 1.36, + "grad_norm": 1.8962566852248846, + "learning_rate": 1.1148148909079229e-06, + "loss": 0.3943, + "step": 1660 + }, + { + "epoch": 1.36, + "grad_norm": 1.9982200928541012, + "learning_rate": 1.1120941015985152e-06, + "loss": 0.4224, + "step": 1661 + }, + { + "epoch": 1.36, + "grad_norm": 1.8053179049263286, + "learning_rate": 1.109375686469417e-06, + "loss": 0.3389, + "step": 1662 + }, + { + "epoch": 1.36, + "grad_norm": 1.888467793597335, + "learning_rate": 1.106659650170805e-06, + "loss": 0.387, + "step": 1663 + }, + { + "epoch": 1.36, + "grad_norm": 1.8685159814187862, + "learning_rate": 1.1039459973487876e-06, + "loss": 0.3428, + "step": 1664 + }, + { + "epoch": 1.36, + "grad_norm": 1.972180811818148, + "learning_rate": 1.101234732645393e-06, + "loss": 0.3587, + "step": 1665 + }, + { + "epoch": 1.36, + "grad_norm": 2.252459557872569, + "learning_rate": 1.0985258606985683e-06, + "loss": 0.3684, + "step": 1666 + }, + { + "epoch": 1.36, + "grad_norm": 1.9679034729828595, + "learning_rate": 1.0958193861421634e-06, + "loss": 0.338, + "step": 1667 + }, + { + "epoch": 1.37, + "grad_norm": 1.9117564762331398, + "learning_rate": 1.0931153136059304e-06, + "loss": 0.4016, + "step": 1668 + }, + { + "epoch": 1.37, + "grad_norm": 1.915297031471561, + "learning_rate": 1.0904136477155112e-06, + "loss": 0.3629, + "step": 1669 + }, + { + "epoch": 1.37, + "grad_norm": 1.8376703588677337, + "learning_rate": 1.0877143930924306e-06, + "loss": 0.371, + "step": 1670 + }, + { + "epoch": 1.37, + "grad_norm": 1.9070422380758454, + "learning_rate": 1.085017554354089e-06, + "loss": 0.3533, + "step": 1671 + }, + { + "epoch": 1.37, + "grad_norm": 1.9752631861235486, + "learning_rate": 1.0823231361137543e-06, + "loss": 0.4164, + "step": 1672 + }, + { + "epoch": 1.37, + "grad_norm": 1.885197204563304, + "learning_rate": 1.0796311429805536e-06, + "loss": 0.3929, + "step": 1673 + }, + { + "epoch": 1.37, + "grad_norm": 1.9090106863841916, + "learning_rate": 1.0769415795594659e-06, + "loss": 0.3449, + "step": 1674 + }, + { + "epoch": 1.37, + "grad_norm": 2.022637519082336, + "learning_rate": 1.074254450451314e-06, + "loss": 0.3553, + "step": 1675 + }, + { + "epoch": 1.37, + "grad_norm": 1.942217527277708, + "learning_rate": 1.0715697602527542e-06, + "loss": 0.3936, + "step": 1676 + }, + { + "epoch": 1.37, + "grad_norm": 1.8809306152215932, + "learning_rate": 1.0688875135562738e-06, + "loss": 0.3481, + "step": 1677 + }, + { + "epoch": 1.37, + "grad_norm": 2.0969194462234513, + "learning_rate": 1.0662077149501798e-06, + "loss": 0.3864, + "step": 1678 + }, + { + "epoch": 1.37, + "grad_norm": 1.8365124296835973, + "learning_rate": 1.0635303690185894e-06, + "loss": 0.3778, + "step": 1679 + }, + { + "epoch": 1.38, + "grad_norm": 1.9221630207347382, + "learning_rate": 1.0608554803414256e-06, + "loss": 0.3443, + "step": 1680 + }, + { + "epoch": 1.38, + "grad_norm": 1.9319799829762891, + "learning_rate": 1.0581830534944084e-06, + "loss": 0.3759, + "step": 1681 + }, + { + "epoch": 1.38, + "grad_norm": 2.00532761754314, + "learning_rate": 1.055513093049046e-06, + "loss": 0.373, + "step": 1682 + }, + { + "epoch": 1.38, + "grad_norm": 1.8361577324130107, + "learning_rate": 1.052845603572627e-06, + "loss": 0.3671, + "step": 1683 + }, + { + "epoch": 1.38, + "grad_norm": 1.9246365496147386, + "learning_rate": 1.0501805896282144e-06, + "loss": 0.3888, + "step": 1684 + }, + { + "epoch": 1.38, + "grad_norm": 1.933677406014513, + "learning_rate": 1.047518055774636e-06, + "loss": 0.428, + "step": 1685 + }, + { + "epoch": 1.38, + "grad_norm": 1.8497481971894003, + "learning_rate": 1.0448580065664754e-06, + "loss": 0.339, + "step": 1686 + }, + { + "epoch": 1.38, + "grad_norm": 1.9674163310656592, + "learning_rate": 1.042200446554068e-06, + "loss": 0.3933, + "step": 1687 + }, + { + "epoch": 1.38, + "grad_norm": 1.8703345670634528, + "learning_rate": 1.039545380283491e-06, + "loss": 0.3805, + "step": 1688 + }, + { + "epoch": 1.38, + "grad_norm": 1.8996794102359933, + "learning_rate": 1.0368928122965547e-06, + "loss": 0.3612, + "step": 1689 + }, + { + "epoch": 1.38, + "grad_norm": 1.8163372630466865, + "learning_rate": 1.0342427471307973e-06, + "loss": 0.3631, + "step": 1690 + }, + { + "epoch": 1.38, + "grad_norm": 1.8990581755942872, + "learning_rate": 1.031595189319473e-06, + "loss": 0.4539, + "step": 1691 + }, + { + "epoch": 1.39, + "grad_norm": 1.9101558963616596, + "learning_rate": 1.0289501433915493e-06, + "loss": 0.4649, + "step": 1692 + }, + { + "epoch": 1.39, + "grad_norm": 1.8873611659348446, + "learning_rate": 1.0263076138716962e-06, + "loss": 0.3649, + "step": 1693 + }, + { + "epoch": 1.39, + "grad_norm": 1.823482013352725, + "learning_rate": 1.0236676052802791e-06, + "loss": 0.3648, + "step": 1694 + }, + { + "epoch": 1.39, + "grad_norm": 1.8931382792204232, + "learning_rate": 1.0210301221333512e-06, + "loss": 0.3589, + "step": 1695 + }, + { + "epoch": 1.39, + "grad_norm": 2.0713580311911355, + "learning_rate": 1.0183951689426438e-06, + "loss": 0.3474, + "step": 1696 + }, + { + "epoch": 1.39, + "grad_norm": 1.8607620741027457, + "learning_rate": 1.0157627502155632e-06, + "loss": 0.3773, + "step": 1697 + }, + { + "epoch": 1.39, + "grad_norm": 1.8645944548746636, + "learning_rate": 1.0131328704551782e-06, + "loss": 0.3457, + "step": 1698 + }, + { + "epoch": 1.39, + "grad_norm": 1.852711235772826, + "learning_rate": 1.0105055341602153e-06, + "loss": 0.3559, + "step": 1699 + }, + { + "epoch": 1.39, + "grad_norm": 1.969084245230365, + "learning_rate": 1.00788074582505e-06, + "loss": 0.3786, + "step": 1700 + }, + { + "epoch": 1.39, + "grad_norm": 1.939185809703108, + "learning_rate": 1.005258509939699e-06, + "loss": 0.3649, + "step": 1701 + }, + { + "epoch": 1.39, + "grad_norm": 1.9104875321871906, + "learning_rate": 1.0026388309898132e-06, + "loss": 0.388, + "step": 1702 + }, + { + "epoch": 1.39, + "grad_norm": 2.161662535348609, + "learning_rate": 1.0000217134566694e-06, + "loss": 0.3692, + "step": 1703 + }, + { + "epoch": 1.4, + "grad_norm": 1.8024704245485432, + "learning_rate": 9.974071618171613e-07, + "loss": 0.3751, + "step": 1704 + }, + { + "epoch": 1.4, + "grad_norm": 1.7739470701867779, + "learning_rate": 9.94795180543796e-07, + "loss": 0.3373, + "step": 1705 + }, + { + "epoch": 1.4, + "grad_norm": 1.8188576734630457, + "learning_rate": 9.921857741046806e-07, + "loss": 0.3945, + "step": 1706 + }, + { + "epoch": 1.4, + "grad_norm": 1.9054961265186567, + "learning_rate": 9.895789469635204e-07, + "loss": 0.3518, + "step": 1707 + }, + { + "epoch": 1.4, + "grad_norm": 1.8782724635395873, + "learning_rate": 9.869747035796071e-07, + "loss": 0.37, + "step": 1708 + }, + { + "epoch": 1.4, + "grad_norm": 1.838615529167183, + "learning_rate": 9.843730484078128e-07, + "loss": 0.3376, + "step": 1709 + }, + { + "epoch": 1.4, + "grad_norm": 1.785535753238471, + "learning_rate": 9.817739858985828e-07, + "loss": 0.337, + "step": 1710 + }, + { + "epoch": 1.4, + "grad_norm": 1.8535882977550358, + "learning_rate": 9.791775204979263e-07, + "loss": 0.3391, + "step": 1711 + }, + { + "epoch": 1.4, + "grad_norm": 1.882614515071742, + "learning_rate": 9.765836566474105e-07, + "loss": 0.391, + "step": 1712 + }, + { + "epoch": 1.4, + "grad_norm": 1.8285960302994975, + "learning_rate": 9.739923987841518e-07, + "loss": 0.356, + "step": 1713 + }, + { + "epoch": 1.4, + "grad_norm": 1.8054856197120326, + "learning_rate": 9.714037513408093e-07, + "loss": 0.3623, + "step": 1714 + }, + { + "epoch": 1.4, + "grad_norm": 1.8671208649893825, + "learning_rate": 9.68817718745577e-07, + "loss": 0.3693, + "step": 1715 + }, + { + "epoch": 1.4, + "grad_norm": 1.9004503058230886, + "learning_rate": 9.662343054221743e-07, + "loss": 0.3327, + "step": 1716 + }, + { + "epoch": 1.41, + "grad_norm": 1.9148999919712566, + "learning_rate": 9.636535157898422e-07, + "loss": 0.3618, + "step": 1717 + }, + { + "epoch": 1.41, + "grad_norm": 1.8635582232372712, + "learning_rate": 9.610753542633309e-07, + "loss": 0.3884, + "step": 1718 + }, + { + "epoch": 1.41, + "grad_norm": 1.9383472683274976, + "learning_rate": 9.58499825252897e-07, + "loss": 0.3953, + "step": 1719 + }, + { + "epoch": 1.41, + "grad_norm": 1.946035726357351, + "learning_rate": 9.559269331642937e-07, + "loss": 0.3292, + "step": 1720 + }, + { + "epoch": 1.41, + "grad_norm": 1.8700217872447233, + "learning_rate": 9.533566823987628e-07, + "loss": 0.361, + "step": 1721 + }, + { + "epoch": 1.41, + "grad_norm": 1.8900223904453795, + "learning_rate": 9.507890773530276e-07, + "loss": 0.3349, + "step": 1722 + }, + { + "epoch": 1.41, + "grad_norm": 1.9125828500996216, + "learning_rate": 9.482241224192867e-07, + "loss": 0.3641, + "step": 1723 + }, + { + "epoch": 1.41, + "grad_norm": 1.940533327906808, + "learning_rate": 9.456618219852042e-07, + "loss": 0.4036, + "step": 1724 + }, + { + "epoch": 1.41, + "grad_norm": 2.0712298544333687, + "learning_rate": 9.431021804339047e-07, + "loss": 0.3934, + "step": 1725 + }, + { + "epoch": 1.41, + "grad_norm": 1.8791027421557622, + "learning_rate": 9.40545202143962e-07, + "loss": 0.3507, + "step": 1726 + }, + { + "epoch": 1.41, + "grad_norm": 1.9686923479849525, + "learning_rate": 9.379908914893962e-07, + "loss": 0.3497, + "step": 1727 + }, + { + "epoch": 1.41, + "grad_norm": 2.0437193308441253, + "learning_rate": 9.354392528396638e-07, + "loss": 0.395, + "step": 1728 + }, + { + "epoch": 1.42, + "grad_norm": 1.864988214025856, + "learning_rate": 9.328902905596512e-07, + "loss": 0.379, + "step": 1729 + }, + { + "epoch": 1.42, + "grad_norm": 1.963062444850751, + "learning_rate": 9.303440090096633e-07, + "loss": 0.3565, + "step": 1730 + }, + { + "epoch": 1.42, + "grad_norm": 1.9399170798660286, + "learning_rate": 9.278004125454232e-07, + "loss": 0.415, + "step": 1731 + }, + { + "epoch": 1.42, + "grad_norm": 1.874726297624515, + "learning_rate": 9.252595055180585e-07, + "loss": 0.3606, + "step": 1732 + }, + { + "epoch": 1.42, + "grad_norm": 1.834934005776965, + "learning_rate": 9.227212922740971e-07, + "loss": 0.4104, + "step": 1733 + }, + { + "epoch": 1.42, + "grad_norm": 1.8726418919835732, + "learning_rate": 9.20185777155459e-07, + "loss": 0.3325, + "step": 1734 + }, + { + "epoch": 1.42, + "grad_norm": 1.9432074923657174, + "learning_rate": 9.176529644994481e-07, + "loss": 0.3663, + "step": 1735 + }, + { + "epoch": 1.42, + "grad_norm": 1.7937207452405413, + "learning_rate": 9.151228586387464e-07, + "loss": 0.3225, + "step": 1736 + }, + { + "epoch": 1.42, + "grad_norm": 1.911607719176901, + "learning_rate": 9.125954639014037e-07, + "loss": 0.3491, + "step": 1737 + }, + { + "epoch": 1.42, + "grad_norm": 1.8954594851178048, + "learning_rate": 9.100707846108337e-07, + "loss": 0.3474, + "step": 1738 + }, + { + "epoch": 1.42, + "grad_norm": 1.9081066235083353, + "learning_rate": 9.075488250858047e-07, + "loss": 0.3654, + "step": 1739 + }, + { + "epoch": 1.42, + "grad_norm": 1.9384836973235149, + "learning_rate": 9.050295896404326e-07, + "loss": 0.3519, + "step": 1740 + }, + { + "epoch": 1.43, + "grad_norm": 1.9655302768136176, + "learning_rate": 9.02513082584173e-07, + "loss": 0.3482, + "step": 1741 + }, + { + "epoch": 1.43, + "grad_norm": 1.900218584161994, + "learning_rate": 8.999993082218156e-07, + "loss": 0.3576, + "step": 1742 + }, + { + "epoch": 1.43, + "grad_norm": 2.030742409886431, + "learning_rate": 8.974882708534724e-07, + "loss": 0.3055, + "step": 1743 + }, + { + "epoch": 1.43, + "grad_norm": 1.865959678567607, + "learning_rate": 8.949799747745766e-07, + "loss": 0.3485, + "step": 1744 + }, + { + "epoch": 1.43, + "grad_norm": 1.8300998571759115, + "learning_rate": 8.924744242758707e-07, + "loss": 0.3412, + "step": 1745 + }, + { + "epoch": 1.43, + "grad_norm": 2.3841641123937514, + "learning_rate": 8.899716236434019e-07, + "loss": 0.3484, + "step": 1746 + }, + { + "epoch": 1.43, + "grad_norm": 1.844271076789803, + "learning_rate": 8.874715771585105e-07, + "loss": 0.3762, + "step": 1747 + }, + { + "epoch": 1.43, + "grad_norm": 1.8687696131042617, + "learning_rate": 8.84974289097828e-07, + "loss": 0.402, + "step": 1748 + }, + { + "epoch": 1.43, + "grad_norm": 1.889973499535232, + "learning_rate": 8.824797637332669e-07, + "loss": 0.3566, + "step": 1749 + }, + { + "epoch": 1.43, + "grad_norm": 1.8681107208205963, + "learning_rate": 8.799880053320131e-07, + "loss": 0.4057, + "step": 1750 + }, + { + "epoch": 1.43, + "grad_norm": 1.8928327876139377, + "learning_rate": 8.774990181565201e-07, + "loss": 0.3784, + "step": 1751 + }, + { + "epoch": 1.43, + "grad_norm": 1.931089236577729, + "learning_rate": 8.750128064645002e-07, + "loss": 0.4008, + "step": 1752 + }, + { + "epoch": 1.44, + "grad_norm": 1.9573581859995763, + "learning_rate": 8.725293745089181e-07, + "loss": 0.3486, + "step": 1753 + }, + { + "epoch": 1.44, + "grad_norm": 1.9164746693234396, + "learning_rate": 8.700487265379845e-07, + "loss": 0.3634, + "step": 1754 + }, + { + "epoch": 1.44, + "grad_norm": 1.812159570787973, + "learning_rate": 8.675708667951446e-07, + "loss": 0.3476, + "step": 1755 + }, + { + "epoch": 1.44, + "grad_norm": 2.0355096473340146, + "learning_rate": 8.650957995190784e-07, + "loss": 0.3562, + "step": 1756 + }, + { + "epoch": 1.44, + "grad_norm": 1.8995538618272807, + "learning_rate": 8.626235289436846e-07, + "loss": 0.3767, + "step": 1757 + }, + { + "epoch": 1.44, + "grad_norm": 1.8751894629115184, + "learning_rate": 8.601540592980812e-07, + "loss": 0.3709, + "step": 1758 + }, + { + "epoch": 1.44, + "grad_norm": 1.8772906072081945, + "learning_rate": 8.576873948065931e-07, + "loss": 0.3692, + "step": 1759 + }, + { + "epoch": 1.44, + "grad_norm": 1.855725719743314, + "learning_rate": 8.552235396887479e-07, + "loss": 0.3461, + "step": 1760 + }, + { + "epoch": 1.44, + "grad_norm": 1.9058932387569096, + "learning_rate": 8.52762498159266e-07, + "loss": 0.4035, + "step": 1761 + }, + { + "epoch": 1.44, + "grad_norm": 1.8155999399280405, + "learning_rate": 8.503042744280565e-07, + "loss": 0.3821, + "step": 1762 + }, + { + "epoch": 1.44, + "grad_norm": 1.9191184065214926, + "learning_rate": 8.478488727002062e-07, + "loss": 0.4182, + "step": 1763 + }, + { + "epoch": 1.44, + "grad_norm": 1.8660511914055784, + "learning_rate": 8.453962971759766e-07, + "loss": 0.3936, + "step": 1764 + }, + { + "epoch": 1.45, + "grad_norm": 1.8559359079620885, + "learning_rate": 8.429465520507932e-07, + "loss": 0.3555, + "step": 1765 + }, + { + "epoch": 1.45, + "grad_norm": 1.871625930259135, + "learning_rate": 8.404996415152414e-07, + "loss": 0.3336, + "step": 1766 + }, + { + "epoch": 1.45, + "grad_norm": 1.9146405985810966, + "learning_rate": 8.38055569755055e-07, + "loss": 0.3595, + "step": 1767 + }, + { + "epoch": 1.45, + "grad_norm": 1.8172916285896499, + "learning_rate": 8.356143409511145e-07, + "loss": 0.3763, + "step": 1768 + }, + { + "epoch": 1.45, + "grad_norm": 1.9045338434685268, + "learning_rate": 8.331759592794344e-07, + "loss": 0.3454, + "step": 1769 + }, + { + "epoch": 1.45, + "grad_norm": 1.9019450574908656, + "learning_rate": 8.307404289111618e-07, + "loss": 0.3782, + "step": 1770 + }, + { + "epoch": 1.45, + "grad_norm": 1.8040956687408418, + "learning_rate": 8.283077540125642e-07, + "loss": 0.3397, + "step": 1771 + }, + { + "epoch": 1.45, + "grad_norm": 1.8854623689371994, + "learning_rate": 8.258779387450258e-07, + "loss": 0.3632, + "step": 1772 + }, + { + "epoch": 1.45, + "grad_norm": 1.8703628366355571, + "learning_rate": 8.234509872650381e-07, + "loss": 0.3796, + "step": 1773 + }, + { + "epoch": 1.45, + "grad_norm": 1.8974382562927672, + "learning_rate": 8.210269037241945e-07, + "loss": 0.3577, + "step": 1774 + }, + { + "epoch": 1.45, + "grad_norm": 1.8041564148309792, + "learning_rate": 8.186056922691816e-07, + "loss": 0.3423, + "step": 1775 + }, + { + "epoch": 1.45, + "grad_norm": 1.8871513088592733, + "learning_rate": 8.161873570417742e-07, + "loss": 0.3724, + "step": 1776 + }, + { + "epoch": 1.46, + "grad_norm": 1.7959090299202567, + "learning_rate": 8.137719021788248e-07, + "loss": 0.3514, + "step": 1777 + }, + { + "epoch": 1.46, + "grad_norm": 1.77414937614363, + "learning_rate": 8.113593318122609e-07, + "loss": 0.3655, + "step": 1778 + }, + { + "epoch": 1.46, + "grad_norm": 1.8415138040355723, + "learning_rate": 8.089496500690747e-07, + "loss": 0.3469, + "step": 1779 + }, + { + "epoch": 1.46, + "grad_norm": 1.943916626029921, + "learning_rate": 8.06542861071318e-07, + "loss": 0.3626, + "step": 1780 + }, + { + "epoch": 1.46, + "grad_norm": 1.9699325195709307, + "learning_rate": 8.041389689360921e-07, + "loss": 0.3897, + "step": 1781 + }, + { + "epoch": 1.46, + "grad_norm": 1.8300758832916175, + "learning_rate": 8.01737977775545e-07, + "loss": 0.3528, + "step": 1782 + }, + { + "epoch": 1.46, + "grad_norm": 1.8854405268423242, + "learning_rate": 7.993398916968609e-07, + "loss": 0.3458, + "step": 1783 + }, + { + "epoch": 1.46, + "grad_norm": 1.8610707367327934, + "learning_rate": 7.969447148022555e-07, + "loss": 0.3825, + "step": 1784 + }, + { + "epoch": 1.46, + "grad_norm": 1.8761158349166456, + "learning_rate": 7.945524511889676e-07, + "loss": 0.361, + "step": 1785 + }, + { + "epoch": 1.46, + "grad_norm": 1.8316905966902863, + "learning_rate": 7.921631049492526e-07, + "loss": 0.3791, + "step": 1786 + }, + { + "epoch": 1.46, + "grad_norm": 1.8815617462853849, + "learning_rate": 7.897766801703754e-07, + "loss": 0.3334, + "step": 1787 + }, + { + "epoch": 1.46, + "grad_norm": 1.8069850793814037, + "learning_rate": 7.873931809346022e-07, + "loss": 0.3063, + "step": 1788 + }, + { + "epoch": 1.47, + "grad_norm": 1.877897596569181, + "learning_rate": 7.850126113191961e-07, + "loss": 0.3551, + "step": 1789 + }, + { + "epoch": 1.47, + "grad_norm": 1.933100704380605, + "learning_rate": 7.826349753964083e-07, + "loss": 0.4, + "step": 1790 + }, + { + "epoch": 1.47, + "grad_norm": 1.8588317568608963, + "learning_rate": 7.802602772334719e-07, + "loss": 0.3695, + "step": 1791 + }, + { + "epoch": 1.47, + "grad_norm": 1.75903586927703, + "learning_rate": 7.778885208925943e-07, + "loss": 0.3334, + "step": 1792 + }, + { + "epoch": 1.47, + "grad_norm": 1.847597726088611, + "learning_rate": 7.755197104309512e-07, + "loss": 0.3508, + "step": 1793 + }, + { + "epoch": 1.47, + "grad_norm": 1.8730373365521515, + "learning_rate": 7.731538499006767e-07, + "loss": 0.3727, + "step": 1794 + }, + { + "epoch": 1.47, + "grad_norm": 1.8696875894594878, + "learning_rate": 7.707909433488611e-07, + "loss": 0.3694, + "step": 1795 + }, + { + "epoch": 1.47, + "grad_norm": 1.8224097896476315, + "learning_rate": 7.684309948175414e-07, + "loss": 0.3682, + "step": 1796 + }, + { + "epoch": 1.47, + "grad_norm": 1.8896591788553188, + "learning_rate": 7.660740083436943e-07, + "loss": 0.353, + "step": 1797 + }, + { + "epoch": 1.47, + "grad_norm": 1.8622597363460462, + "learning_rate": 7.637199879592275e-07, + "loss": 0.3835, + "step": 1798 + }, + { + "epoch": 1.47, + "grad_norm": 1.8261440807434144, + "learning_rate": 7.61368937690978e-07, + "loss": 0.3673, + "step": 1799 + }, + { + "epoch": 1.47, + "grad_norm": 1.86324753247062, + "learning_rate": 7.590208615607001e-07, + "loss": 0.3613, + "step": 1800 + }, + { + "epoch": 1.48, + "grad_norm": 1.8704051001710107, + "learning_rate": 7.566757635850608e-07, + "loss": 0.3756, + "step": 1801 + }, + { + "epoch": 1.48, + "grad_norm": 1.8547689419526656, + "learning_rate": 7.543336477756336e-07, + "loss": 0.3557, + "step": 1802 + }, + { + "epoch": 1.48, + "grad_norm": 1.8970591656145008, + "learning_rate": 7.519945181388893e-07, + "loss": 0.3713, + "step": 1803 + }, + { + "epoch": 1.48, + "grad_norm": 2.034710049647413, + "learning_rate": 7.496583786761911e-07, + "loss": 0.379, + "step": 1804 + }, + { + "epoch": 1.48, + "grad_norm": 1.7207339510591724, + "learning_rate": 7.47325233383788e-07, + "loss": 0.324, + "step": 1805 + }, + { + "epoch": 1.48, + "grad_norm": 1.8353430031672993, + "learning_rate": 7.449950862528046e-07, + "loss": 0.3688, + "step": 1806 + }, + { + "epoch": 1.48, + "grad_norm": 1.8248952138910253, + "learning_rate": 7.426679412692403e-07, + "loss": 0.3744, + "step": 1807 + }, + { + "epoch": 1.48, + "grad_norm": 1.8581710166024752, + "learning_rate": 7.403438024139547e-07, + "loss": 0.3591, + "step": 1808 + }, + { + "epoch": 1.48, + "grad_norm": 1.972956887111899, + "learning_rate": 7.380226736626692e-07, + "loss": 0.3786, + "step": 1809 + }, + { + "epoch": 1.48, + "grad_norm": 1.875119757327761, + "learning_rate": 7.357045589859535e-07, + "loss": 0.3924, + "step": 1810 + }, + { + "epoch": 1.48, + "grad_norm": 1.829358260084735, + "learning_rate": 7.333894623492222e-07, + "loss": 0.3489, + "step": 1811 + }, + { + "epoch": 1.48, + "grad_norm": 1.8778690881245192, + "learning_rate": 7.310773877127275e-07, + "loss": 0.3906, + "step": 1812 + }, + { + "epoch": 1.49, + "grad_norm": 1.8807953543932978, + "learning_rate": 7.287683390315514e-07, + "loss": 0.3388, + "step": 1813 + }, + { + "epoch": 1.49, + "grad_norm": 1.8813619639740409, + "learning_rate": 7.264623202556001e-07, + "loss": 0.3678, + "step": 1814 + }, + { + "epoch": 1.49, + "grad_norm": 1.8588744507841983, + "learning_rate": 7.241593353295967e-07, + "loss": 0.3628, + "step": 1815 + }, + { + "epoch": 1.49, + "grad_norm": 1.854298814001063, + "learning_rate": 7.218593881930744e-07, + "loss": 0.3719, + "step": 1816 + }, + { + "epoch": 1.49, + "grad_norm": 1.9792414995864196, + "learning_rate": 7.195624827803704e-07, + "loss": 0.3954, + "step": 1817 + }, + { + "epoch": 1.49, + "grad_norm": 1.9020874510906967, + "learning_rate": 7.172686230206174e-07, + "loss": 0.3501, + "step": 1818 + }, + { + "epoch": 1.49, + "grad_norm": 1.8501064231209523, + "learning_rate": 7.1497781283774e-07, + "loss": 0.3733, + "step": 1819 + }, + { + "epoch": 1.49, + "grad_norm": 1.8611190389122063, + "learning_rate": 7.126900561504435e-07, + "loss": 0.3883, + "step": 1820 + }, + { + "epoch": 1.49, + "grad_norm": 1.8961852046693481, + "learning_rate": 7.104053568722128e-07, + "loss": 0.3524, + "step": 1821 + }, + { + "epoch": 1.49, + "grad_norm": 1.8752890672773106, + "learning_rate": 7.081237189113005e-07, + "loss": 0.3524, + "step": 1822 + }, + { + "epoch": 1.49, + "grad_norm": 1.9058307416670415, + "learning_rate": 7.058451461707239e-07, + "loss": 0.3653, + "step": 1823 + }, + { + "epoch": 1.49, + "grad_norm": 1.9173580029522834, + "learning_rate": 7.035696425482563e-07, + "loss": 0.4105, + "step": 1824 + }, + { + "epoch": 1.5, + "grad_norm": 2.1303031903916674, + "learning_rate": 7.012972119364206e-07, + "loss": 0.354, + "step": 1825 + }, + { + "epoch": 1.5, + "grad_norm": 1.9010891424886316, + "learning_rate": 6.990278582224835e-07, + "loss": 0.3662, + "step": 1826 + }, + { + "epoch": 1.5, + "grad_norm": 1.877460865866335, + "learning_rate": 6.967615852884485e-07, + "loss": 0.3898, + "step": 1827 + }, + { + "epoch": 1.5, + "grad_norm": 1.8440076993566192, + "learning_rate": 6.944983970110475e-07, + "loss": 0.3582, + "step": 1828 + }, + { + "epoch": 1.5, + "grad_norm": 1.8515961219329589, + "learning_rate": 6.922382972617372e-07, + "loss": 0.3653, + "step": 1829 + }, + { + "epoch": 1.5, + "grad_norm": 1.830548627340717, + "learning_rate": 6.899812899066907e-07, + "loss": 0.3387, + "step": 1830 + }, + { + "epoch": 1.5, + "grad_norm": 1.8380839219635814, + "learning_rate": 6.877273788067918e-07, + "loss": 0.352, + "step": 1831 + }, + { + "epoch": 1.5, + "grad_norm": 2.013553404360448, + "learning_rate": 6.854765678176256e-07, + "loss": 0.3702, + "step": 1832 + }, + { + "epoch": 1.5, + "grad_norm": 1.8665688769908853, + "learning_rate": 6.832288607894766e-07, + "loss": 0.3733, + "step": 1833 + }, + { + "epoch": 1.5, + "grad_norm": 1.8644441622970171, + "learning_rate": 6.809842615673179e-07, + "loss": 0.3799, + "step": 1834 + }, + { + "epoch": 1.5, + "grad_norm": 1.8383420889549984, + "learning_rate": 6.787427739908079e-07, + "loss": 0.3667, + "step": 1835 + }, + { + "epoch": 1.5, + "grad_norm": 1.8631784953833668, + "learning_rate": 6.765044018942804e-07, + "loss": 0.3297, + "step": 1836 + }, + { + "epoch": 1.51, + "grad_norm": 1.9010037089689114, + "learning_rate": 6.742691491067419e-07, + "loss": 0.3692, + "step": 1837 + }, + { + "epoch": 1.51, + "grad_norm": 1.8921348077299611, + "learning_rate": 6.720370194518599e-07, + "loss": 0.3456, + "step": 1838 + }, + { + "epoch": 1.51, + "grad_norm": 1.8727807839161668, + "learning_rate": 6.698080167479621e-07, + "loss": 0.3385, + "step": 1839 + }, + { + "epoch": 1.51, + "grad_norm": 1.8815655707023475, + "learning_rate": 6.675821448080261e-07, + "loss": 0.3439, + "step": 1840 + }, + { + "epoch": 1.51, + "grad_norm": 1.9290903087621594, + "learning_rate": 6.653594074396744e-07, + "loss": 0.3283, + "step": 1841 + }, + { + "epoch": 1.51, + "grad_norm": 1.839315528952413, + "learning_rate": 6.631398084451671e-07, + "loss": 0.383, + "step": 1842 + }, + { + "epoch": 1.51, + "grad_norm": 1.888172853861002, + "learning_rate": 6.609233516213955e-07, + "loss": 0.3326, + "step": 1843 + }, + { + "epoch": 1.51, + "grad_norm": 1.9530270141902042, + "learning_rate": 6.58710040759877e-07, + "loss": 0.3629, + "step": 1844 + }, + { + "epoch": 1.51, + "grad_norm": 1.8350634300423698, + "learning_rate": 6.564998796467453e-07, + "loss": 0.3378, + "step": 1845 + }, + { + "epoch": 1.51, + "grad_norm": 1.8651246191841713, + "learning_rate": 6.542928720627478e-07, + "loss": 0.327, + "step": 1846 + }, + { + "epoch": 1.51, + "grad_norm": 1.9096045764816365, + "learning_rate": 6.520890217832373e-07, + "loss": 0.3255, + "step": 1847 + }, + { + "epoch": 1.51, + "grad_norm": 1.919058821000027, + "learning_rate": 6.498883325781658e-07, + "loss": 0.3722, + "step": 1848 + }, + { + "epoch": 1.52, + "grad_norm": 1.7941064379223837, + "learning_rate": 6.476908082120758e-07, + "loss": 0.4079, + "step": 1849 + }, + { + "epoch": 1.52, + "grad_norm": 1.9058204893972528, + "learning_rate": 6.454964524440988e-07, + "loss": 0.3447, + "step": 1850 + }, + { + "epoch": 1.52, + "grad_norm": 1.7622865373821626, + "learning_rate": 6.433052690279443e-07, + "loss": 0.328, + "step": 1851 + }, + { + "epoch": 1.52, + "grad_norm": 2.496163673119719, + "learning_rate": 6.411172617118958e-07, + "loss": 0.3457, + "step": 1852 + }, + { + "epoch": 1.52, + "grad_norm": 1.952829151233476, + "learning_rate": 6.389324342388034e-07, + "loss": 0.3757, + "step": 1853 + }, + { + "epoch": 1.52, + "grad_norm": 1.8737496346585327, + "learning_rate": 6.367507903460782e-07, + "loss": 0.3499, + "step": 1854 + }, + { + "epoch": 1.52, + "grad_norm": 1.789743920699054, + "learning_rate": 6.345723337656845e-07, + "loss": 0.3686, + "step": 1855 + }, + { + "epoch": 1.52, + "grad_norm": 1.814108105986017, + "learning_rate": 6.32397068224136e-07, + "loss": 0.3456, + "step": 1856 + }, + { + "epoch": 1.52, + "grad_norm": 1.8844464079872227, + "learning_rate": 6.302249974424848e-07, + "loss": 0.3408, + "step": 1857 + }, + { + "epoch": 1.52, + "grad_norm": 1.8123689124000075, + "learning_rate": 6.280561251363212e-07, + "loss": 0.3415, + "step": 1858 + }, + { + "epoch": 1.52, + "grad_norm": 1.8581667911676272, + "learning_rate": 6.258904550157616e-07, + "loss": 0.3598, + "step": 1859 + }, + { + "epoch": 1.52, + "grad_norm": 1.8951304560085334, + "learning_rate": 6.23727990785446e-07, + "loss": 0.388, + "step": 1860 + }, + { + "epoch": 1.53, + "grad_norm": 1.9149385859826418, + "learning_rate": 6.215687361445305e-07, + "loss": 0.3218, + "step": 1861 + }, + { + "epoch": 1.53, + "grad_norm": 1.9124781784882163, + "learning_rate": 6.194126947866799e-07, + "loss": 0.3754, + "step": 1862 + }, + { + "epoch": 1.53, + "grad_norm": 1.834455896221774, + "learning_rate": 6.172598704000632e-07, + "loss": 0.3437, + "step": 1863 + }, + { + "epoch": 1.53, + "grad_norm": 1.8282538931136811, + "learning_rate": 6.151102666673461e-07, + "loss": 0.4017, + "step": 1864 + }, + { + "epoch": 1.53, + "grad_norm": 1.9403955088374527, + "learning_rate": 6.129638872656842e-07, + "loss": 0.3945, + "step": 1865 + }, + { + "epoch": 1.53, + "grad_norm": 1.9233473407163304, + "learning_rate": 6.108207358667189e-07, + "loss": 0.3461, + "step": 1866 + }, + { + "epoch": 1.53, + "grad_norm": 1.8541305863089477, + "learning_rate": 6.086808161365685e-07, + "loss": 0.3832, + "step": 1867 + }, + { + "epoch": 1.53, + "grad_norm": 1.9957679209687724, + "learning_rate": 6.065441317358245e-07, + "loss": 0.3689, + "step": 1868 + }, + { + "epoch": 1.53, + "grad_norm": 1.8808330149547094, + "learning_rate": 6.044106863195415e-07, + "loss": 0.3695, + "step": 1869 + }, + { + "epoch": 1.53, + "grad_norm": 1.8104036881057104, + "learning_rate": 6.022804835372364e-07, + "loss": 0.3483, + "step": 1870 + }, + { + "epoch": 1.53, + "grad_norm": 1.862734707805995, + "learning_rate": 6.001535270328768e-07, + "loss": 0.3631, + "step": 1871 + }, + { + "epoch": 1.53, + "grad_norm": 1.7886949265475778, + "learning_rate": 5.980298204448781e-07, + "loss": 0.3427, + "step": 1872 + }, + { + "epoch": 1.54, + "grad_norm": 2.0013909122499585, + "learning_rate": 5.959093674060973e-07, + "loss": 0.3227, + "step": 1873 + }, + { + "epoch": 1.54, + "grad_norm": 1.8454555132680566, + "learning_rate": 5.937921715438242e-07, + "loss": 0.3712, + "step": 1874 + }, + { + "epoch": 1.54, + "grad_norm": 1.7955230940580693, + "learning_rate": 5.916782364797774e-07, + "loss": 0.358, + "step": 1875 + }, + { + "epoch": 1.54, + "grad_norm": 1.981678881628013, + "learning_rate": 5.895675658300981e-07, + "loss": 0.3436, + "step": 1876 + }, + { + "epoch": 1.54, + "grad_norm": 1.8838900630160964, + "learning_rate": 5.874601632053426e-07, + "loss": 0.3672, + "step": 1877 + }, + { + "epoch": 1.54, + "grad_norm": 1.9199361104982893, + "learning_rate": 5.853560322104778e-07, + "loss": 0.369, + "step": 1878 + }, + { + "epoch": 1.54, + "grad_norm": 1.8961860490578715, + "learning_rate": 5.832551764448719e-07, + "loss": 0.3376, + "step": 1879 + }, + { + "epoch": 1.54, + "grad_norm": 1.9335243111005527, + "learning_rate": 5.811575995022925e-07, + "loss": 0.3544, + "step": 1880 + }, + { + "epoch": 1.54, + "grad_norm": 1.9130052171381386, + "learning_rate": 5.790633049708979e-07, + "loss": 0.3491, + "step": 1881 + }, + { + "epoch": 1.54, + "grad_norm": 1.8114767475508076, + "learning_rate": 5.76972296433232e-07, + "loss": 0.349, + "step": 1882 + }, + { + "epoch": 1.54, + "grad_norm": 1.869088387575796, + "learning_rate": 5.748845774662154e-07, + "loss": 0.339, + "step": 1883 + }, + { + "epoch": 1.54, + "grad_norm": 1.8823416706934062, + "learning_rate": 5.728001516411441e-07, + "loss": 0.3344, + "step": 1884 + }, + { + "epoch": 1.55, + "grad_norm": 2.028993532989763, + "learning_rate": 5.707190225236791e-07, + "loss": 0.3315, + "step": 1885 + }, + { + "epoch": 1.55, + "grad_norm": 1.8841725898032713, + "learning_rate": 5.686411936738428e-07, + "loss": 0.3391, + "step": 1886 + }, + { + "epoch": 1.55, + "grad_norm": 1.8925869708717353, + "learning_rate": 5.665666686460119e-07, + "loss": 0.3856, + "step": 1887 + }, + { + "epoch": 1.55, + "grad_norm": 1.7967049620852429, + "learning_rate": 5.644954509889125e-07, + "loss": 0.327, + "step": 1888 + }, + { + "epoch": 1.55, + "grad_norm": 1.8430042776590718, + "learning_rate": 5.624275442456101e-07, + "loss": 0.3796, + "step": 1889 + }, + { + "epoch": 1.55, + "grad_norm": 1.9114914845740119, + "learning_rate": 5.603629519535092e-07, + "loss": 0.3232, + "step": 1890 + }, + { + "epoch": 1.55, + "grad_norm": 1.926168461469419, + "learning_rate": 5.583016776443443e-07, + "loss": 0.3707, + "step": 1891 + }, + { + "epoch": 1.55, + "grad_norm": 1.8385673934837172, + "learning_rate": 5.562437248441727e-07, + "loss": 0.3178, + "step": 1892 + }, + { + "epoch": 1.55, + "grad_norm": 1.822703583469435, + "learning_rate": 5.54189097073371e-07, + "loss": 0.3435, + "step": 1893 + }, + { + "epoch": 1.55, + "grad_norm": 1.9179700429647883, + "learning_rate": 5.52137797846628e-07, + "loss": 0.3822, + "step": 1894 + }, + { + "epoch": 1.55, + "grad_norm": 1.8022936248540935, + "learning_rate": 5.500898306729385e-07, + "loss": 0.3326, + "step": 1895 + }, + { + "epoch": 1.55, + "grad_norm": 1.8093411058422164, + "learning_rate": 5.48045199055596e-07, + "loss": 0.369, + "step": 1896 + }, + { + "epoch": 1.56, + "grad_norm": 1.9644587652808347, + "learning_rate": 5.460039064921901e-07, + "loss": 0.3642, + "step": 1897 + }, + { + "epoch": 1.56, + "grad_norm": 1.894471765138494, + "learning_rate": 5.439659564745975e-07, + "loss": 0.3433, + "step": 1898 + }, + { + "epoch": 1.56, + "grad_norm": 1.813778968949277, + "learning_rate": 5.41931352488978e-07, + "loss": 0.3451, + "step": 1899 + }, + { + "epoch": 1.56, + "grad_norm": 1.912342915517377, + "learning_rate": 5.399000980157657e-07, + "loss": 0.3531, + "step": 1900 + }, + { + "epoch": 1.56, + "grad_norm": 1.9171703337147434, + "learning_rate": 5.378721965296665e-07, + "loss": 0.3319, + "step": 1901 + }, + { + "epoch": 1.56, + "grad_norm": 2.130973697729064, + "learning_rate": 5.35847651499651e-07, + "loss": 0.3455, + "step": 1902 + }, + { + "epoch": 1.56, + "grad_norm": 1.8703910349961417, + "learning_rate": 5.33826466388947e-07, + "loss": 0.3749, + "step": 1903 + }, + { + "epoch": 1.56, + "grad_norm": 1.8325169995594843, + "learning_rate": 5.318086446550352e-07, + "loss": 0.3663, + "step": 1904 + }, + { + "epoch": 1.56, + "grad_norm": 1.9036889163080901, + "learning_rate": 5.297941897496428e-07, + "loss": 0.3592, + "step": 1905 + }, + { + "epoch": 1.56, + "grad_norm": 2.0123341027513892, + "learning_rate": 5.277831051187382e-07, + "loss": 0.4212, + "step": 1906 + }, + { + "epoch": 1.56, + "grad_norm": 1.862718607721721, + "learning_rate": 5.257753942025243e-07, + "loss": 0.3739, + "step": 1907 + }, + { + "epoch": 1.56, + "grad_norm": 1.8541370296695225, + "learning_rate": 5.237710604354313e-07, + "loss": 0.4162, + "step": 1908 + }, + { + "epoch": 1.56, + "grad_norm": 1.8209725179155203, + "learning_rate": 5.217701072461149e-07, + "loss": 0.3288, + "step": 1909 + }, + { + "epoch": 1.57, + "grad_norm": 1.9438902543671965, + "learning_rate": 5.197725380574456e-07, + "loss": 0.3753, + "step": 1910 + }, + { + "epoch": 1.57, + "grad_norm": 1.8631654581464705, + "learning_rate": 5.177783562865066e-07, + "loss": 0.3437, + "step": 1911 + }, + { + "epoch": 1.57, + "grad_norm": 1.8396840995640549, + "learning_rate": 5.157875653445866e-07, + "loss": 0.3669, + "step": 1912 + }, + { + "epoch": 1.57, + "grad_norm": 1.7855189269312173, + "learning_rate": 5.138001686371729e-07, + "loss": 0.3593, + "step": 1913 + }, + { + "epoch": 1.57, + "grad_norm": 1.8497984943634604, + "learning_rate": 5.118161695639479e-07, + "loss": 0.3295, + "step": 1914 + }, + { + "epoch": 1.57, + "grad_norm": 1.8450016226883872, + "learning_rate": 5.0983557151878e-07, + "loss": 0.3514, + "step": 1915 + }, + { + "epoch": 1.57, + "grad_norm": 1.8115984473579492, + "learning_rate": 5.078583778897216e-07, + "loss": 0.3599, + "step": 1916 + }, + { + "epoch": 1.57, + "grad_norm": 1.8768743283745053, + "learning_rate": 5.058845920590008e-07, + "loss": 0.336, + "step": 1917 + }, + { + "epoch": 1.57, + "grad_norm": 1.898132261271154, + "learning_rate": 5.039142174030159e-07, + "loss": 0.3895, + "step": 1918 + }, + { + "epoch": 1.57, + "grad_norm": 1.8621712512957447, + "learning_rate": 5.019472572923307e-07, + "loss": 0.3718, + "step": 1919 + }, + { + "epoch": 1.57, + "grad_norm": 1.9340085582222217, + "learning_rate": 4.999837150916664e-07, + "loss": 0.3633, + "step": 1920 + }, + { + "epoch": 1.57, + "grad_norm": 1.8812999232707277, + "learning_rate": 4.980235941598999e-07, + "loss": 0.3666, + "step": 1921 + }, + { + "epoch": 1.58, + "grad_norm": 1.783816493860109, + "learning_rate": 4.960668978500529e-07, + "loss": 0.3487, + "step": 1922 + }, + { + "epoch": 1.58, + "grad_norm": 1.8845458416987717, + "learning_rate": 4.94113629509291e-07, + "loss": 0.379, + "step": 1923 + }, + { + "epoch": 1.58, + "grad_norm": 2.039608954599768, + "learning_rate": 4.921637924789153e-07, + "loss": 0.3652, + "step": 1924 + }, + { + "epoch": 1.58, + "grad_norm": 1.8867923218874316, + "learning_rate": 4.902173900943564e-07, + "loss": 0.3561, + "step": 1925 + }, + { + "epoch": 1.58, + "grad_norm": 1.8133572570953833, + "learning_rate": 4.882744256851707e-07, + "loss": 0.3718, + "step": 1926 + }, + { + "epoch": 1.58, + "grad_norm": 1.9247010567225666, + "learning_rate": 4.86334902575033e-07, + "loss": 0.3692, + "step": 1927 + }, + { + "epoch": 1.58, + "grad_norm": 1.879349133553964, + "learning_rate": 4.84398824081731e-07, + "loss": 0.3639, + "step": 1928 + }, + { + "epoch": 1.58, + "grad_norm": 1.8500016574014475, + "learning_rate": 4.824661935171613e-07, + "loss": 0.3351, + "step": 1929 + }, + { + "epoch": 1.58, + "grad_norm": 1.8591348368444711, + "learning_rate": 4.805370141873198e-07, + "loss": 0.3901, + "step": 1930 + }, + { + "epoch": 1.58, + "grad_norm": 1.8661345765477586, + "learning_rate": 4.786112893923011e-07, + "loss": 0.3875, + "step": 1931 + }, + { + "epoch": 1.58, + "grad_norm": 1.8849185643314905, + "learning_rate": 4.766890224262896e-07, + "loss": 0.382, + "step": 1932 + }, + { + "epoch": 1.58, + "grad_norm": 1.913717676777297, + "learning_rate": 4.747702165775542e-07, + "loss": 0.3636, + "step": 1933 + }, + { + "epoch": 1.59, + "grad_norm": 1.8488810215590799, + "learning_rate": 4.728548751284448e-07, + "loss": 0.3725, + "step": 1934 + }, + { + "epoch": 1.59, + "grad_norm": 1.8776942493725268, + "learning_rate": 4.7094300135538203e-07, + "loss": 0.345, + "step": 1935 + }, + { + "epoch": 1.59, + "grad_norm": 1.947623892429973, + "learning_rate": 4.690345985288572e-07, + "loss": 0.3454, + "step": 1936 + }, + { + "epoch": 1.59, + "grad_norm": 1.8541251767713411, + "learning_rate": 4.671296699134234e-07, + "loss": 0.3698, + "step": 1937 + }, + { + "epoch": 1.59, + "grad_norm": 1.9190013210275396, + "learning_rate": 4.652282187676907e-07, + "loss": 0.3586, + "step": 1938 + }, + { + "epoch": 1.59, + "grad_norm": 1.975817777540498, + "learning_rate": 4.6333024834432086e-07, + "loss": 0.3656, + "step": 1939 + }, + { + "epoch": 1.59, + "grad_norm": 1.894711831278788, + "learning_rate": 4.6143576189001977e-07, + "loss": 0.3404, + "step": 1940 + }, + { + "epoch": 1.59, + "grad_norm": 1.8817727906924275, + "learning_rate": 4.595447626455354e-07, + "loss": 0.3438, + "step": 1941 + }, + { + "epoch": 1.59, + "grad_norm": 1.926259453987348, + "learning_rate": 4.576572538456503e-07, + "loss": 0.4277, + "step": 1942 + }, + { + "epoch": 1.59, + "grad_norm": 1.8789373169400134, + "learning_rate": 4.557732387191752e-07, + "loss": 0.356, + "step": 1943 + }, + { + "epoch": 1.59, + "grad_norm": 1.8546547967236782, + "learning_rate": 4.5389272048894566e-07, + "loss": 0.3646, + "step": 1944 + }, + { + "epoch": 1.59, + "grad_norm": 1.7952836118848592, + "learning_rate": 4.5201570237181413e-07, + "loss": 0.3527, + "step": 1945 + }, + { + "epoch": 1.6, + "grad_norm": 1.8451716234273645, + "learning_rate": 4.5014218757864714e-07, + "loss": 0.3581, + "step": 1946 + }, + { + "epoch": 1.6, + "grad_norm": 2.057595831083482, + "learning_rate": 4.482721793143166e-07, + "loss": 0.3895, + "step": 1947 + }, + { + "epoch": 1.6, + "grad_norm": 1.807840858302715, + "learning_rate": 4.464056807776973e-07, + "loss": 0.3269, + "step": 1948 + }, + { + "epoch": 1.6, + "grad_norm": 2.056535275565557, + "learning_rate": 4.445426951616605e-07, + "loss": 0.4116, + "step": 1949 + }, + { + "epoch": 1.6, + "grad_norm": 1.8423208982758352, + "learning_rate": 4.4268322565306663e-07, + "loss": 0.3315, + "step": 1950 + }, + { + "epoch": 1.6, + "grad_norm": 1.8618265222451655, + "learning_rate": 4.4082727543276303e-07, + "loss": 0.3594, + "step": 1951 + }, + { + "epoch": 1.6, + "grad_norm": 1.798959848387909, + "learning_rate": 4.3897484767557593e-07, + "loss": 0.4065, + "step": 1952 + }, + { + "epoch": 1.6, + "grad_norm": 1.8468408859615186, + "learning_rate": 4.3712594555030656e-07, + "loss": 0.3339, + "step": 1953 + }, + { + "epoch": 1.6, + "grad_norm": 1.8960514245664648, + "learning_rate": 4.352805722197248e-07, + "loss": 0.3619, + "step": 1954 + }, + { + "epoch": 1.6, + "grad_norm": 1.8201726610868778, + "learning_rate": 4.334387308405641e-07, + "loss": 0.3652, + "step": 1955 + }, + { + "epoch": 1.6, + "grad_norm": 1.8960787128219156, + "learning_rate": 4.316004245635158e-07, + "loss": 0.3482, + "step": 1956 + }, + { + "epoch": 1.6, + "grad_norm": 1.8962249531145783, + "learning_rate": 4.297656565332248e-07, + "loss": 0.3542, + "step": 1957 + }, + { + "epoch": 1.61, + "grad_norm": 1.9355222361840436, + "learning_rate": 4.279344298882834e-07, + "loss": 0.3531, + "step": 1958 + }, + { + "epoch": 1.61, + "grad_norm": 1.8619155316505231, + "learning_rate": 4.2610674776122406e-07, + "loss": 0.3659, + "step": 1959 + }, + { + "epoch": 1.61, + "grad_norm": 1.9040620290695986, + "learning_rate": 4.242826132785188e-07, + "loss": 0.349, + "step": 1960 + }, + { + "epoch": 1.61, + "grad_norm": 1.8614871067348076, + "learning_rate": 4.224620295605683e-07, + "loss": 0.3851, + "step": 1961 + }, + { + "epoch": 1.61, + "grad_norm": 1.92540215370278, + "learning_rate": 4.2064499972170073e-07, + "loss": 0.379, + "step": 1962 + }, + { + "epoch": 1.61, + "grad_norm": 1.7674536324643837, + "learning_rate": 4.188315268701651e-07, + "loss": 0.3226, + "step": 1963 + }, + { + "epoch": 1.61, + "grad_norm": 2.0097846655442146, + "learning_rate": 4.170216141081246e-07, + "loss": 0.3924, + "step": 1964 + }, + { + "epoch": 1.61, + "grad_norm": 1.9454008124232989, + "learning_rate": 4.1521526453165374e-07, + "loss": 0.3599, + "step": 1965 + }, + { + "epoch": 1.61, + "grad_norm": 1.8974492665299996, + "learning_rate": 4.134124812307311e-07, + "loss": 0.3606, + "step": 1966 + }, + { + "epoch": 1.61, + "grad_norm": 1.9289392453232093, + "learning_rate": 4.116132672892345e-07, + "loss": 0.3733, + "step": 1967 + }, + { + "epoch": 1.61, + "grad_norm": 1.9100167242810886, + "learning_rate": 4.098176257849365e-07, + "loss": 0.3502, + "step": 1968 + }, + { + "epoch": 1.61, + "grad_norm": 1.8550217698156015, + "learning_rate": 4.0802555978949804e-07, + "loss": 0.3433, + "step": 1969 + }, + { + "epoch": 1.62, + "grad_norm": 1.8671430861470548, + "learning_rate": 4.06237072368465e-07, + "loss": 0.3739, + "step": 1970 + }, + { + "epoch": 1.62, + "grad_norm": 1.8622163328469568, + "learning_rate": 4.0445216658125896e-07, + "loss": 0.3565, + "step": 1971 + }, + { + "epoch": 1.62, + "grad_norm": 1.957221135499962, + "learning_rate": 4.0267084548117786e-07, + "loss": 0.4121, + "step": 1972 + }, + { + "epoch": 1.62, + "grad_norm": 1.9308316014476776, + "learning_rate": 4.0089311211538473e-07, + "loss": 0.3375, + "step": 1973 + }, + { + "epoch": 1.62, + "grad_norm": 2.0943893420426685, + "learning_rate": 3.9911896952490786e-07, + "loss": 0.3447, + "step": 1974 + }, + { + "epoch": 1.62, + "grad_norm": 1.8555953393062348, + "learning_rate": 3.9734842074463125e-07, + "loss": 0.3423, + "step": 1975 + }, + { + "epoch": 1.62, + "grad_norm": 1.8787468374607186, + "learning_rate": 3.9558146880329246e-07, + "loss": 0.3629, + "step": 1976 + }, + { + "epoch": 1.62, + "grad_norm": 1.9054645054466235, + "learning_rate": 3.9381811672347584e-07, + "loss": 0.347, + "step": 1977 + }, + { + "epoch": 1.62, + "grad_norm": 1.843062330976259, + "learning_rate": 3.920583675216072e-07, + "loss": 0.377, + "step": 1978 + }, + { + "epoch": 1.62, + "grad_norm": 1.867154897712441, + "learning_rate": 3.903022242079499e-07, + "loss": 0.3896, + "step": 1979 + }, + { + "epoch": 1.62, + "grad_norm": 1.9141706940850274, + "learning_rate": 3.885496897865992e-07, + "loss": 0.3807, + "step": 1980 + }, + { + "epoch": 1.62, + "grad_norm": 1.7693227259004407, + "learning_rate": 3.868007672554755e-07, + "loss": 0.3074, + "step": 1981 + }, + { + "epoch": 1.63, + "grad_norm": 1.9095853727200514, + "learning_rate": 3.850554596063219e-07, + "loss": 0.3716, + "step": 1982 + }, + { + "epoch": 1.63, + "grad_norm": 1.9430202747777054, + "learning_rate": 3.833137698246975e-07, + "loss": 0.3624, + "step": 1983 + }, + { + "epoch": 1.63, + "grad_norm": 1.9653761109332166, + "learning_rate": 3.8157570088997257e-07, + "loss": 0.3526, + "step": 1984 + }, + { + "epoch": 1.63, + "grad_norm": 1.7667551472503558, + "learning_rate": 3.798412557753245e-07, + "loss": 0.3568, + "step": 1985 + }, + { + "epoch": 1.63, + "grad_norm": 1.8604050001183414, + "learning_rate": 3.78110437447729e-07, + "loss": 0.3477, + "step": 1986 + }, + { + "epoch": 1.63, + "grad_norm": 1.8184815785613908, + "learning_rate": 3.7638324886796e-07, + "loss": 0.3181, + "step": 1987 + }, + { + "epoch": 1.63, + "grad_norm": 1.8521943071413647, + "learning_rate": 3.7465969299058215e-07, + "loss": 0.317, + "step": 1988 + }, + { + "epoch": 1.63, + "grad_norm": 1.8922496050409547, + "learning_rate": 3.729397727639453e-07, + "loss": 0.3685, + "step": 1989 + }, + { + "epoch": 1.63, + "grad_norm": 1.8919544150457748, + "learning_rate": 3.712234911301807e-07, + "loss": 0.3811, + "step": 1990 + }, + { + "epoch": 1.63, + "grad_norm": 1.7571917950373177, + "learning_rate": 3.6951085102519377e-07, + "loss": 0.3069, + "step": 1991 + }, + { + "epoch": 1.63, + "grad_norm": 1.835944343826793, + "learning_rate": 3.6780185537866275e-07, + "loss": 0.3559, + "step": 1992 + }, + { + "epoch": 1.63, + "grad_norm": 1.924915839626172, + "learning_rate": 3.6609650711403044e-07, + "loss": 0.3736, + "step": 1993 + }, + { + "epoch": 1.64, + "grad_norm": 1.7729901621180486, + "learning_rate": 3.6439480914850057e-07, + "loss": 0.3202, + "step": 1994 + }, + { + "epoch": 1.64, + "grad_norm": 1.7439650920581296, + "learning_rate": 3.6269676439303234e-07, + "loss": 0.3138, + "step": 1995 + }, + { + "epoch": 1.64, + "grad_norm": 1.888200639953336, + "learning_rate": 3.6100237575233647e-07, + "loss": 0.3137, + "step": 1996 + }, + { + "epoch": 1.64, + "grad_norm": 1.9215140175987564, + "learning_rate": 3.593116461248691e-07, + "loss": 0.3618, + "step": 1997 + }, + { + "epoch": 1.64, + "grad_norm": 1.8621195398901151, + "learning_rate": 3.576245784028262e-07, + "loss": 0.3381, + "step": 1998 + }, + { + "epoch": 1.64, + "grad_norm": 1.813427333802716, + "learning_rate": 3.5594117547214064e-07, + "loss": 0.3159, + "step": 1999 + }, + { + "epoch": 1.64, + "grad_norm": 1.8109572860251124, + "learning_rate": 3.542614402124769e-07, + "loss": 0.3431, + "step": 2000 + }, + { + "epoch": 1.64, + "grad_norm": 1.8438243884785575, + "learning_rate": 3.5258537549722334e-07, + "loss": 0.3404, + "step": 2001 + }, + { + "epoch": 1.64, + "grad_norm": 1.8637548681582927, + "learning_rate": 3.5091298419349137e-07, + "loss": 0.3558, + "step": 2002 + }, + { + "epoch": 1.64, + "grad_norm": 1.9054188507583447, + "learning_rate": 3.492442691621073e-07, + "loss": 0.3514, + "step": 2003 + }, + { + "epoch": 1.64, + "grad_norm": 1.8508749827944715, + "learning_rate": 3.4757923325761e-07, + "loss": 0.3919, + "step": 2004 + }, + { + "epoch": 1.64, + "grad_norm": 1.9381031698615256, + "learning_rate": 3.459178793282439e-07, + "loss": 0.3445, + "step": 2005 + }, + { + "epoch": 1.65, + "grad_norm": 1.8965478043748623, + "learning_rate": 3.442602102159548e-07, + "loss": 0.3686, + "step": 2006 + }, + { + "epoch": 1.65, + "grad_norm": 1.8508387871408407, + "learning_rate": 3.4260622875638554e-07, + "loss": 0.3579, + "step": 2007 + }, + { + "epoch": 1.65, + "grad_norm": 1.8672067538829913, + "learning_rate": 3.4095593777887097e-07, + "loss": 0.3311, + "step": 2008 + }, + { + "epoch": 1.65, + "grad_norm": 1.889545394249109, + "learning_rate": 3.393093401064335e-07, + "loss": 0.3903, + "step": 2009 + }, + { + "epoch": 1.65, + "grad_norm": 1.9058059259656173, + "learning_rate": 3.3766643855577514e-07, + "loss": 0.3382, + "step": 2010 + }, + { + "epoch": 1.65, + "grad_norm": 1.827046925655219, + "learning_rate": 3.360272359372785e-07, + "loss": 0.4026, + "step": 2011 + }, + { + "epoch": 1.65, + "grad_norm": 1.8937619782731472, + "learning_rate": 3.3439173505499606e-07, + "loss": 0.3908, + "step": 2012 + }, + { + "epoch": 1.65, + "grad_norm": 1.8634842581354367, + "learning_rate": 3.327599387066499e-07, + "loss": 0.3317, + "step": 2013 + }, + { + "epoch": 1.65, + "grad_norm": 1.8390150223835013, + "learning_rate": 3.3113184968362384e-07, + "loss": 0.3556, + "step": 2014 + }, + { + "epoch": 1.65, + "grad_norm": 1.8764780958956686, + "learning_rate": 3.2950747077096084e-07, + "loss": 0.3517, + "step": 2015 + }, + { + "epoch": 1.65, + "grad_norm": 1.8020390033097442, + "learning_rate": 3.2788680474735687e-07, + "loss": 0.3417, + "step": 2016 + }, + { + "epoch": 1.65, + "grad_norm": 1.9477716924859427, + "learning_rate": 3.262698543851561e-07, + "loss": 0.3596, + "step": 2017 + }, + { + "epoch": 1.66, + "grad_norm": 1.894137770283384, + "learning_rate": 3.2465662245034696e-07, + "loss": 0.4035, + "step": 2018 + }, + { + "epoch": 1.66, + "grad_norm": 1.783142135883036, + "learning_rate": 3.230471117025577e-07, + "loss": 0.3529, + "step": 2019 + }, + { + "epoch": 1.66, + "grad_norm": 1.803387656385472, + "learning_rate": 3.214413248950496e-07, + "loss": 0.3509, + "step": 2020 + }, + { + "epoch": 1.66, + "grad_norm": 1.8805063690885642, + "learning_rate": 3.198392647747159e-07, + "loss": 0.3749, + "step": 2021 + }, + { + "epoch": 1.66, + "grad_norm": 2.0107348664022147, + "learning_rate": 3.182409340820719e-07, + "loss": 0.3485, + "step": 2022 + }, + { + "epoch": 1.66, + "grad_norm": 1.8288895736395419, + "learning_rate": 3.1664633555125615e-07, + "loss": 0.3757, + "step": 2023 + }, + { + "epoch": 1.66, + "grad_norm": 1.8466692854518314, + "learning_rate": 3.1505547191002017e-07, + "loss": 0.3465, + "step": 2024 + }, + { + "epoch": 1.66, + "grad_norm": 1.8190080826897819, + "learning_rate": 3.1346834587972915e-07, + "loss": 0.374, + "step": 2025 + }, + { + "epoch": 1.66, + "grad_norm": 1.741058949057854, + "learning_rate": 3.118849601753529e-07, + "loss": 0.3628, + "step": 2026 + }, + { + "epoch": 1.66, + "grad_norm": 1.9154631726200175, + "learning_rate": 3.1030531750546377e-07, + "loss": 0.3823, + "step": 2027 + }, + { + "epoch": 1.66, + "grad_norm": 1.8222826028897707, + "learning_rate": 3.0872942057223105e-07, + "loss": 0.3857, + "step": 2028 + }, + { + "epoch": 1.66, + "grad_norm": 1.8691052236878156, + "learning_rate": 3.071572720714161e-07, + "loss": 0.3687, + "step": 2029 + }, + { + "epoch": 1.67, + "grad_norm": 1.8329290245063994, + "learning_rate": 3.0558887469236824e-07, + "loss": 0.3459, + "step": 2030 + }, + { + "epoch": 1.67, + "grad_norm": 2.105450216487807, + "learning_rate": 3.040242311180211e-07, + "loss": 0.3497, + "step": 2031 + }, + { + "epoch": 1.67, + "grad_norm": 1.8682164915371384, + "learning_rate": 3.02463344024885e-07, + "loss": 0.3631, + "step": 2032 + }, + { + "epoch": 1.67, + "grad_norm": 1.870413375372597, + "learning_rate": 3.0090621608304586e-07, + "loss": 0.337, + "step": 2033 + }, + { + "epoch": 1.67, + "grad_norm": 1.831129712998651, + "learning_rate": 2.9935284995615874e-07, + "loss": 0.3223, + "step": 2034 + }, + { + "epoch": 1.67, + "grad_norm": 1.9934638606782258, + "learning_rate": 2.978032483014434e-07, + "loss": 0.3776, + "step": 2035 + }, + { + "epoch": 1.67, + "grad_norm": 1.9217667136109262, + "learning_rate": 2.9625741376968107e-07, + "loss": 0.3615, + "step": 2036 + }, + { + "epoch": 1.67, + "grad_norm": 1.8233621789333103, + "learning_rate": 2.947153490052068e-07, + "loss": 0.3745, + "step": 2037 + }, + { + "epoch": 1.67, + "grad_norm": 1.8933236552388286, + "learning_rate": 2.9317705664590857e-07, + "loss": 0.3521, + "step": 2038 + }, + { + "epoch": 1.67, + "grad_norm": 1.8877528361725595, + "learning_rate": 2.9164253932322114e-07, + "loss": 0.3753, + "step": 2039 + }, + { + "epoch": 1.67, + "grad_norm": 1.8701101519376129, + "learning_rate": 2.901117996621214e-07, + "loss": 0.3385, + "step": 2040 + }, + { + "epoch": 1.67, + "grad_norm": 2.022104989564633, + "learning_rate": 2.885848402811242e-07, + "loss": 0.3604, + "step": 2041 + }, + { + "epoch": 1.68, + "grad_norm": 1.981072876489218, + "learning_rate": 2.8706166379227685e-07, + "loss": 0.3844, + "step": 2042 + }, + { + "epoch": 1.68, + "grad_norm": 1.8524158018731827, + "learning_rate": 2.8554227280115673e-07, + "loss": 0.35, + "step": 2043 + }, + { + "epoch": 1.68, + "grad_norm": 1.938845970147165, + "learning_rate": 2.8402666990686526e-07, + "loss": 0.3396, + "step": 2044 + }, + { + "epoch": 1.68, + "grad_norm": 1.830032191851363, + "learning_rate": 2.825148577020237e-07, + "loss": 0.3376, + "step": 2045 + }, + { + "epoch": 1.68, + "grad_norm": 1.815946726661388, + "learning_rate": 2.8100683877276935e-07, + "loss": 0.3186, + "step": 2046 + }, + { + "epoch": 1.68, + "grad_norm": 1.9148486946074896, + "learning_rate": 2.7950261569874987e-07, + "loss": 0.3818, + "step": 2047 + }, + { + "epoch": 1.68, + "grad_norm": 2.0362020767823688, + "learning_rate": 2.7800219105312107e-07, + "loss": 0.3692, + "step": 2048 + }, + { + "epoch": 1.68, + "grad_norm": 1.9516570398785693, + "learning_rate": 2.765055674025388e-07, + "loss": 0.3761, + "step": 2049 + }, + { + "epoch": 1.68, + "grad_norm": 1.878773060905854, + "learning_rate": 2.75012747307159e-07, + "loss": 0.3683, + "step": 2050 + }, + { + "epoch": 1.68, + "grad_norm": 1.7913050498897873, + "learning_rate": 2.735237333206306e-07, + "loss": 0.3561, + "step": 2051 + }, + { + "epoch": 1.68, + "grad_norm": 1.9868033431365109, + "learning_rate": 2.720385279900908e-07, + "loss": 0.4107, + "step": 2052 + }, + { + "epoch": 1.68, + "grad_norm": 1.8593813692288108, + "learning_rate": 2.7055713385616246e-07, + "loss": 0.3705, + "step": 2053 + }, + { + "epoch": 1.69, + "grad_norm": 1.8698518318714625, + "learning_rate": 2.6907955345294864e-07, + "loss": 0.3566, + "step": 2054 + }, + { + "epoch": 1.69, + "grad_norm": 1.9690930287334685, + "learning_rate": 2.6760578930802917e-07, + "loss": 0.3855, + "step": 2055 + }, + { + "epoch": 1.69, + "grad_norm": 1.8559735887870026, + "learning_rate": 2.661358439424552e-07, + "loss": 0.3442, + "step": 2056 + }, + { + "epoch": 1.69, + "grad_norm": 1.7913911949444496, + "learning_rate": 2.6466971987074514e-07, + "loss": 0.3424, + "step": 2057 + }, + { + "epoch": 1.69, + "grad_norm": 1.8619135470758108, + "learning_rate": 2.6320741960088104e-07, + "loss": 0.3666, + "step": 2058 + }, + { + "epoch": 1.69, + "grad_norm": 1.9162607121997142, + "learning_rate": 2.6174894563430365e-07, + "loss": 0.3398, + "step": 2059 + }, + { + "epoch": 1.69, + "grad_norm": 1.8850582422937106, + "learning_rate": 2.602943004659092e-07, + "loss": 0.3267, + "step": 2060 + }, + { + "epoch": 1.69, + "grad_norm": 1.8451237530567572, + "learning_rate": 2.588434865840425e-07, + "loss": 0.3535, + "step": 2061 + }, + { + "epoch": 1.69, + "grad_norm": 1.9560351999911851, + "learning_rate": 2.573965064704964e-07, + "loss": 0.3927, + "step": 2062 + }, + { + "epoch": 1.69, + "grad_norm": 1.9375022045866468, + "learning_rate": 2.5595336260050367e-07, + "loss": 0.345, + "step": 2063 + }, + { + "epoch": 1.69, + "grad_norm": 1.9389910057698705, + "learning_rate": 2.5451405744273684e-07, + "loss": 0.3632, + "step": 2064 + }, + { + "epoch": 1.69, + "grad_norm": 1.9464447650421863, + "learning_rate": 2.5307859345930025e-07, + "loss": 0.3732, + "step": 2065 + }, + { + "epoch": 1.7, + "grad_norm": 1.901436015964627, + "learning_rate": 2.516469731057286e-07, + "loss": 0.3624, + "step": 2066 + }, + { + "epoch": 1.7, + "grad_norm": 1.861052476130716, + "learning_rate": 2.5021919883098043e-07, + "loss": 0.3465, + "step": 2067 + }, + { + "epoch": 1.7, + "grad_norm": 1.8061147604719698, + "learning_rate": 2.4879527307743624e-07, + "loss": 0.3271, + "step": 2068 + }, + { + "epoch": 1.7, + "grad_norm": 1.8600512074627422, + "learning_rate": 2.473751982808925e-07, + "loss": 0.3524, + "step": 2069 + }, + { + "epoch": 1.7, + "grad_norm": 1.9116929096391897, + "learning_rate": 2.459589768705581e-07, + "loss": 0.4268, + "step": 2070 + }, + { + "epoch": 1.7, + "grad_norm": 1.9375533647948024, + "learning_rate": 2.445466112690506e-07, + "loss": 0.4426, + "step": 2071 + }, + { + "epoch": 1.7, + "grad_norm": 1.8187896872609466, + "learning_rate": 2.431381038923922e-07, + "loss": 0.4264, + "step": 2072 + }, + { + "epoch": 1.7, + "grad_norm": 1.8722345082962843, + "learning_rate": 2.4173345715000326e-07, + "loss": 0.3428, + "step": 2073 + }, + { + "epoch": 1.7, + "grad_norm": 1.807831006800025, + "learning_rate": 2.4033267344470256e-07, + "loss": 0.3593, + "step": 2074 + }, + { + "epoch": 1.7, + "grad_norm": 1.8381163414971127, + "learning_rate": 2.389357551726981e-07, + "loss": 0.336, + "step": 2075 + }, + { + "epoch": 1.7, + "grad_norm": 1.8053442770688852, + "learning_rate": 2.3754270472358786e-07, + "loss": 0.36, + "step": 2076 + }, + { + "epoch": 1.7, + "grad_norm": 1.8102492703550743, + "learning_rate": 2.3615352448035228e-07, + "loss": 0.3373, + "step": 2077 + }, + { + "epoch": 1.71, + "grad_norm": 1.8640600559614149, + "learning_rate": 2.3476821681935185e-07, + "loss": 0.3713, + "step": 2078 + }, + { + "epoch": 1.71, + "grad_norm": 1.8259178467876043, + "learning_rate": 2.3338678411032184e-07, + "loss": 0.3812, + "step": 2079 + }, + { + "epoch": 1.71, + "grad_norm": 1.8572135345525957, + "learning_rate": 2.3200922871636973e-07, + "loss": 0.3564, + "step": 2080 + }, + { + "epoch": 1.71, + "grad_norm": 1.8808861682893652, + "learning_rate": 2.3063555299396994e-07, + "loss": 0.344, + "step": 2081 + }, + { + "epoch": 1.71, + "grad_norm": 2.082790897941687, + "learning_rate": 2.292657592929609e-07, + "loss": 0.4149, + "step": 2082 + }, + { + "epoch": 1.71, + "grad_norm": 1.8154127177582888, + "learning_rate": 2.278998499565388e-07, + "loss": 0.3309, + "step": 2083 + }, + { + "epoch": 1.71, + "grad_norm": 1.9281000674444286, + "learning_rate": 2.265378273212565e-07, + "loss": 0.35, + "step": 2084 + }, + { + "epoch": 1.71, + "grad_norm": 1.787806749105191, + "learning_rate": 2.2517969371701808e-07, + "loss": 0.3332, + "step": 2085 + }, + { + "epoch": 1.71, + "grad_norm": 1.8495975031177656, + "learning_rate": 2.2382545146707485e-07, + "loss": 0.3859, + "step": 2086 + }, + { + "epoch": 1.71, + "grad_norm": 2.0959876699169335, + "learning_rate": 2.224751028880215e-07, + "loss": 0.3441, + "step": 2087 + }, + { + "epoch": 1.71, + "grad_norm": 1.8576485516029098, + "learning_rate": 2.2112865028979135e-07, + "loss": 0.4039, + "step": 2088 + }, + { + "epoch": 1.71, + "grad_norm": 1.7715289202721822, + "learning_rate": 2.1978609597565425e-07, + "loss": 0.3379, + "step": 2089 + }, + { + "epoch": 1.72, + "grad_norm": 1.9403112506850888, + "learning_rate": 2.1844744224221115e-07, + "loss": 0.3668, + "step": 2090 + }, + { + "epoch": 1.72, + "grad_norm": 1.8854691153731733, + "learning_rate": 2.1711269137939083e-07, + "loss": 0.3612, + "step": 2091 + }, + { + "epoch": 1.72, + "grad_norm": 1.9371643123801792, + "learning_rate": 2.1578184567044552e-07, + "loss": 0.4192, + "step": 2092 + }, + { + "epoch": 1.72, + "grad_norm": 1.7867188340579265, + "learning_rate": 2.1445490739194663e-07, + "loss": 0.3679, + "step": 2093 + }, + { + "epoch": 1.72, + "grad_norm": 1.7966033645477641, + "learning_rate": 2.1313187881378205e-07, + "loss": 0.3849, + "step": 2094 + }, + { + "epoch": 1.72, + "grad_norm": 1.7756177099499322, + "learning_rate": 2.1181276219915224e-07, + "loss": 0.3277, + "step": 2095 + }, + { + "epoch": 1.72, + "grad_norm": 1.8346072475530897, + "learning_rate": 2.104975598045647e-07, + "loss": 0.3565, + "step": 2096 + }, + { + "epoch": 1.72, + "grad_norm": 2.011276790677077, + "learning_rate": 2.091862738798317e-07, + "loss": 0.3498, + "step": 2097 + }, + { + "epoch": 1.72, + "grad_norm": 1.8618849009320824, + "learning_rate": 2.0787890666806588e-07, + "loss": 0.3457, + "step": 2098 + }, + { + "epoch": 1.72, + "grad_norm": 1.9013266132795015, + "learning_rate": 2.0657546040567688e-07, + "loss": 0.4026, + "step": 2099 + }, + { + "epoch": 1.72, + "grad_norm": 1.8662915733623995, + "learning_rate": 2.0527593732236563e-07, + "loss": 0.3239, + "step": 2100 + }, + { + "epoch": 1.72, + "grad_norm": 1.862742931005612, + "learning_rate": 2.0398033964112386e-07, + "loss": 0.3534, + "step": 2101 + }, + { + "epoch": 1.72, + "grad_norm": 1.9697615787915368, + "learning_rate": 2.0268866957822737e-07, + "loss": 0.4434, + "step": 2102 + }, + { + "epoch": 1.73, + "grad_norm": 1.928940882803443, + "learning_rate": 2.0140092934323286e-07, + "loss": 0.378, + "step": 2103 + }, + { + "epoch": 1.73, + "grad_norm": 2.316878206169073, + "learning_rate": 2.00117121138976e-07, + "loss": 0.3245, + "step": 2104 + }, + { + "epoch": 1.73, + "grad_norm": 1.8290409402278658, + "learning_rate": 1.9883724716156488e-07, + "loss": 0.3167, + "step": 2105 + }, + { + "epoch": 1.73, + "grad_norm": 1.8470715714524868, + "learning_rate": 1.975613096003784e-07, + "loss": 0.3692, + "step": 2106 + }, + { + "epoch": 1.73, + "grad_norm": 1.7589018750118917, + "learning_rate": 1.9628931063806145e-07, + "loss": 0.3428, + "step": 2107 + }, + { + "epoch": 1.73, + "grad_norm": 1.8386934947013096, + "learning_rate": 1.9502125245052184e-07, + "loss": 0.302, + "step": 2108 + }, + { + "epoch": 1.73, + "grad_norm": 2.013634549023632, + "learning_rate": 1.9375713720692578e-07, + "loss": 0.374, + "step": 2109 + }, + { + "epoch": 1.73, + "grad_norm": 1.9630403258780862, + "learning_rate": 1.9249696706969468e-07, + "loss": 0.3639, + "step": 2110 + }, + { + "epoch": 1.73, + "grad_norm": 1.815851928434014, + "learning_rate": 1.9124074419450188e-07, + "loss": 0.3911, + "step": 2111 + }, + { + "epoch": 1.73, + "grad_norm": 1.7873258577827946, + "learning_rate": 1.899884707302671e-07, + "loss": 0.3587, + "step": 2112 + }, + { + "epoch": 1.73, + "grad_norm": 1.7437701243798065, + "learning_rate": 1.8874014881915592e-07, + "loss": 0.3259, + "step": 2113 + }, + { + "epoch": 1.73, + "grad_norm": 1.7944960642642218, + "learning_rate": 1.8749578059657269e-07, + "loss": 0.3244, + "step": 2114 + }, + { + "epoch": 1.74, + "grad_norm": 1.8363546908139476, + "learning_rate": 1.862553681911594e-07, + "loss": 0.3456, + "step": 2115 + }, + { + "epoch": 1.74, + "grad_norm": 1.8197485549623056, + "learning_rate": 1.8501891372479124e-07, + "loss": 0.3283, + "step": 2116 + }, + { + "epoch": 1.74, + "grad_norm": 1.9637296453576303, + "learning_rate": 1.837864193125724e-07, + "loss": 0.3826, + "step": 2117 + }, + { + "epoch": 1.74, + "grad_norm": 1.8379188611549326, + "learning_rate": 1.8255788706283333e-07, + "loss": 0.3479, + "step": 2118 + }, + { + "epoch": 1.74, + "grad_norm": 1.9065317118193474, + "learning_rate": 1.813333190771263e-07, + "loss": 0.389, + "step": 2119 + }, + { + "epoch": 1.74, + "grad_norm": 1.825398324635909, + "learning_rate": 1.8011271745022236e-07, + "loss": 0.3152, + "step": 2120 + }, + { + "epoch": 1.74, + "grad_norm": 1.84277570139951, + "learning_rate": 1.788960842701079e-07, + "loss": 0.3303, + "step": 2121 + }, + { + "epoch": 1.74, + "grad_norm": 2.1819179580807093, + "learning_rate": 1.7768342161798124e-07, + "loss": 0.3643, + "step": 2122 + }, + { + "epoch": 1.74, + "grad_norm": 1.8011890761495493, + "learning_rate": 1.7647473156824635e-07, + "loss": 0.339, + "step": 2123 + }, + { + "epoch": 1.74, + "grad_norm": 1.8221066042168634, + "learning_rate": 1.7527001618851458e-07, + "loss": 0.3569, + "step": 2124 + }, + { + "epoch": 1.74, + "grad_norm": 1.8261396232814944, + "learning_rate": 1.7406927753959635e-07, + "loss": 0.4109, + "step": 2125 + }, + { + "epoch": 1.74, + "grad_norm": 1.805174532249848, + "learning_rate": 1.728725176755e-07, + "loss": 0.3393, + "step": 2126 + }, + { + "epoch": 1.75, + "grad_norm": 1.8848775064893697, + "learning_rate": 1.7167973864342713e-07, + "loss": 0.3496, + "step": 2127 + }, + { + "epoch": 1.75, + "grad_norm": 1.8100491319883585, + "learning_rate": 1.7049094248377028e-07, + "loss": 0.3281, + "step": 2128 + }, + { + "epoch": 1.75, + "grad_norm": 1.8831196622854411, + "learning_rate": 1.6930613123010835e-07, + "loss": 0.3841, + "step": 2129 + }, + { + "epoch": 1.75, + "grad_norm": 1.9209980081592088, + "learning_rate": 1.6812530690920424e-07, + "loss": 0.391, + "step": 2130 + }, + { + "epoch": 1.75, + "grad_norm": 1.9944918420194047, + "learning_rate": 1.669484715409997e-07, + "loss": 0.3926, + "step": 2131 + }, + { + "epoch": 1.75, + "grad_norm": 1.9395008522068116, + "learning_rate": 1.6577562713861407e-07, + "loss": 0.3883, + "step": 2132 + }, + { + "epoch": 1.75, + "grad_norm": 1.8304355795594052, + "learning_rate": 1.646067757083389e-07, + "loss": 0.3389, + "step": 2133 + }, + { + "epoch": 1.75, + "grad_norm": 1.8402459886338345, + "learning_rate": 1.6344191924963476e-07, + "loss": 0.3922, + "step": 2134 + }, + { + "epoch": 1.75, + "grad_norm": 1.8132899531779554, + "learning_rate": 1.622810597551297e-07, + "loss": 0.3469, + "step": 2135 + }, + { + "epoch": 1.75, + "grad_norm": 1.921034055540662, + "learning_rate": 1.6112419921061357e-07, + "loss": 0.3653, + "step": 2136 + }, + { + "epoch": 1.75, + "grad_norm": 1.8056140554264193, + "learning_rate": 1.5997133959503586e-07, + "loss": 0.3415, + "step": 2137 + }, + { + "epoch": 1.75, + "grad_norm": 1.8588044133851904, + "learning_rate": 1.5882248288050212e-07, + "loss": 0.3506, + "step": 2138 + }, + { + "epoch": 1.76, + "grad_norm": 1.8108046119552568, + "learning_rate": 1.5767763103226973e-07, + "loss": 0.388, + "step": 2139 + }, + { + "epoch": 1.76, + "grad_norm": 1.8763098742970408, + "learning_rate": 1.5653678600874579e-07, + "loss": 0.3664, + "step": 2140 + }, + { + "epoch": 1.76, + "grad_norm": 1.8497677696649508, + "learning_rate": 1.553999497614833e-07, + "loss": 0.3639, + "step": 2141 + }, + { + "epoch": 1.76, + "grad_norm": 1.9655979382195377, + "learning_rate": 1.5426712423517786e-07, + "loss": 0.3506, + "step": 2142 + }, + { + "epoch": 1.76, + "grad_norm": 1.790628708967862, + "learning_rate": 1.5313831136766404e-07, + "loss": 0.3406, + "step": 2143 + }, + { + "epoch": 1.76, + "grad_norm": 1.9608335290613008, + "learning_rate": 1.5201351308991224e-07, + "loss": 0.3501, + "step": 2144 + }, + { + "epoch": 1.76, + "grad_norm": 1.7903484977756507, + "learning_rate": 1.50892731326025e-07, + "loss": 0.3676, + "step": 2145 + }, + { + "epoch": 1.76, + "grad_norm": 1.8855435922984911, + "learning_rate": 1.4977596799323535e-07, + "loss": 0.3799, + "step": 2146 + }, + { + "epoch": 1.76, + "grad_norm": 1.8324689872102904, + "learning_rate": 1.4866322500190101e-07, + "loss": 0.3755, + "step": 2147 + }, + { + "epoch": 1.76, + "grad_norm": 1.8514508819914763, + "learning_rate": 1.4755450425550323e-07, + "loss": 0.3347, + "step": 2148 + }, + { + "epoch": 1.76, + "grad_norm": 1.7886393781902339, + "learning_rate": 1.4644980765064265e-07, + "loss": 0.3585, + "step": 2149 + }, + { + "epoch": 1.76, + "grad_norm": 2.0582700711129904, + "learning_rate": 1.45349137077036e-07, + "loss": 0.3524, + "step": 2150 + }, + { + "epoch": 1.77, + "grad_norm": 1.8479484119761578, + "learning_rate": 1.442524944175122e-07, + "loss": 0.3588, + "step": 2151 + }, + { + "epoch": 1.77, + "grad_norm": 1.9799010734714735, + "learning_rate": 1.431598815480112e-07, + "loss": 0.3851, + "step": 2152 + }, + { + "epoch": 1.77, + "grad_norm": 1.8154268548789099, + "learning_rate": 1.4207130033757953e-07, + "loss": 0.3659, + "step": 2153 + }, + { + "epoch": 1.77, + "grad_norm": 1.788826962078112, + "learning_rate": 1.409867526483655e-07, + "loss": 0.3532, + "step": 2154 + }, + { + "epoch": 1.77, + "grad_norm": 1.831394990369725, + "learning_rate": 1.399062403356191e-07, + "loss": 0.3656, + "step": 2155 + }, + { + "epoch": 1.77, + "grad_norm": 1.860812513843525, + "learning_rate": 1.3882976524768694e-07, + "loss": 0.3437, + "step": 2156 + }, + { + "epoch": 1.77, + "grad_norm": 1.8191445465710199, + "learning_rate": 1.3775732922600955e-07, + "loss": 0.3453, + "step": 2157 + }, + { + "epoch": 1.77, + "grad_norm": 1.912203673098316, + "learning_rate": 1.3668893410511752e-07, + "loss": 0.356, + "step": 2158 + }, + { + "epoch": 1.77, + "grad_norm": 1.7634263389685063, + "learning_rate": 1.3562458171262977e-07, + "loss": 0.3364, + "step": 2159 + }, + { + "epoch": 1.77, + "grad_norm": 1.9606772030976105, + "learning_rate": 1.345642738692493e-07, + "loss": 0.4129, + "step": 2160 + }, + { + "epoch": 1.77, + "grad_norm": 1.8903730951236122, + "learning_rate": 1.3350801238876054e-07, + "loss": 0.3552, + "step": 2161 + }, + { + "epoch": 1.77, + "grad_norm": 1.8151033977020858, + "learning_rate": 1.3245579907802647e-07, + "loss": 0.3749, + "step": 2162 + }, + { + "epoch": 1.78, + "grad_norm": 1.798306670267368, + "learning_rate": 1.3140763573698368e-07, + "loss": 0.3579, + "step": 2163 + }, + { + "epoch": 1.78, + "grad_norm": 1.8830967664423854, + "learning_rate": 1.3036352415864317e-07, + "loss": 0.3807, + "step": 2164 + }, + { + "epoch": 1.78, + "grad_norm": 1.8497739705616796, + "learning_rate": 1.2932346612908236e-07, + "loss": 0.3435, + "step": 2165 + }, + { + "epoch": 1.78, + "grad_norm": 1.8775626143477988, + "learning_rate": 1.2828746342744642e-07, + "loss": 0.3694, + "step": 2166 + }, + { + "epoch": 1.78, + "grad_norm": 1.86888174731765, + "learning_rate": 1.2725551782594297e-07, + "loss": 0.3773, + "step": 2167 + }, + { + "epoch": 1.78, + "grad_norm": 1.8106535392312983, + "learning_rate": 1.2622763108983943e-07, + "loss": 0.3658, + "step": 2168 + }, + { + "epoch": 1.78, + "grad_norm": 1.7565946139355706, + "learning_rate": 1.2520380497745955e-07, + "loss": 0.3327, + "step": 2169 + }, + { + "epoch": 1.78, + "grad_norm": 1.8337917759239957, + "learning_rate": 1.2418404124018152e-07, + "loss": 0.3581, + "step": 2170 + }, + { + "epoch": 1.78, + "grad_norm": 1.8611220399958432, + "learning_rate": 1.2316834162243385e-07, + "loss": 0.3157, + "step": 2171 + }, + { + "epoch": 1.78, + "grad_norm": 1.838068997080928, + "learning_rate": 1.2215670786169365e-07, + "loss": 0.3784, + "step": 2172 + }, + { + "epoch": 1.78, + "grad_norm": 1.8073457370585166, + "learning_rate": 1.2114914168848247e-07, + "loss": 0.3601, + "step": 2173 + }, + { + "epoch": 1.78, + "grad_norm": 1.7903431751846044, + "learning_rate": 1.20145644826363e-07, + "loss": 0.3387, + "step": 2174 + }, + { + "epoch": 1.79, + "grad_norm": 1.8158848144806372, + "learning_rate": 1.1914621899193762e-07, + "loss": 0.34, + "step": 2175 + }, + { + "epoch": 1.79, + "grad_norm": 1.766558225950262, + "learning_rate": 1.181508658948452e-07, + "loss": 0.3226, + "step": 2176 + }, + { + "epoch": 1.79, + "grad_norm": 1.794967554546229, + "learning_rate": 1.1715958723775706e-07, + "loss": 0.3798, + "step": 2177 + }, + { + "epoch": 1.79, + "grad_norm": 1.7695727585693561, + "learning_rate": 1.1617238471637455e-07, + "loss": 0.3678, + "step": 2178 + }, + { + "epoch": 1.79, + "grad_norm": 1.8143490780194036, + "learning_rate": 1.1518926001942655e-07, + "loss": 0.3237, + "step": 2179 + }, + { + "epoch": 1.79, + "grad_norm": 1.8739602172417291, + "learning_rate": 1.142102148286664e-07, + "loss": 0.3639, + "step": 2180 + }, + { + "epoch": 1.79, + "grad_norm": 1.7739231697525675, + "learning_rate": 1.1323525081886888e-07, + "loss": 0.3159, + "step": 2181 + }, + { + "epoch": 1.79, + "grad_norm": 2.105039856872307, + "learning_rate": 1.1226436965782767e-07, + "loss": 0.432, + "step": 2182 + }, + { + "epoch": 1.79, + "grad_norm": 1.8072327831095898, + "learning_rate": 1.112975730063523e-07, + "loss": 0.3792, + "step": 2183 + }, + { + "epoch": 1.79, + "grad_norm": 1.8941794154335998, + "learning_rate": 1.1033486251826403e-07, + "loss": 0.3968, + "step": 2184 + }, + { + "epoch": 1.79, + "grad_norm": 2.1193960588145755, + "learning_rate": 1.0937623984039552e-07, + "loss": 0.3529, + "step": 2185 + }, + { + "epoch": 1.79, + "grad_norm": 1.9461657576343148, + "learning_rate": 1.0842170661258672e-07, + "loss": 0.3957, + "step": 2186 + }, + { + "epoch": 1.8, + "grad_norm": 1.7950594098824268, + "learning_rate": 1.0747126446768147e-07, + "loss": 0.3405, + "step": 2187 + }, + { + "epoch": 1.8, + "grad_norm": 1.8279881694837754, + "learning_rate": 1.065249150315259e-07, + "loss": 0.3341, + "step": 2188 + }, + { + "epoch": 1.8, + "grad_norm": 1.7886544153404058, + "learning_rate": 1.0558265992296451e-07, + "loss": 0.3536, + "step": 2189 + }, + { + "epoch": 1.8, + "grad_norm": 1.8349909807030522, + "learning_rate": 1.0464450075383825e-07, + "loss": 0.3485, + "step": 2190 + }, + { + "epoch": 1.8, + "grad_norm": 1.801301802497824, + "learning_rate": 1.0371043912898144e-07, + "loss": 0.3438, + "step": 2191 + }, + { + "epoch": 1.8, + "grad_norm": 2.2727551395393815, + "learning_rate": 1.0278047664621927e-07, + "loss": 0.3638, + "step": 2192 + }, + { + "epoch": 1.8, + "grad_norm": 1.8256549405277607, + "learning_rate": 1.0185461489636422e-07, + "loss": 0.3825, + "step": 2193 + }, + { + "epoch": 1.8, + "grad_norm": 1.8324272080700041, + "learning_rate": 1.0093285546321496e-07, + "loss": 0.359, + "step": 2194 + }, + { + "epoch": 1.8, + "grad_norm": 1.9572385811446105, + "learning_rate": 1.0001519992355158e-07, + "loss": 0.3805, + "step": 2195 + }, + { + "epoch": 1.8, + "grad_norm": 1.8745619608955748, + "learning_rate": 9.910164984713477e-08, + "loss": 0.3922, + "step": 2196 + }, + { + "epoch": 1.8, + "grad_norm": 1.8621961607473747, + "learning_rate": 9.819220679670172e-08, + "loss": 0.3448, + "step": 2197 + }, + { + "epoch": 1.8, + "grad_norm": 1.900350756763101, + "learning_rate": 9.728687232796463e-08, + "loss": 0.394, + "step": 2198 + }, + { + "epoch": 1.81, + "grad_norm": 1.8728312451342384, + "learning_rate": 9.638564798960748e-08, + "loss": 0.3457, + "step": 2199 + }, + { + "epoch": 1.81, + "grad_norm": 1.8715935041572054, + "learning_rate": 9.548853532328261e-08, + "loss": 0.345, + "step": 2200 + }, + { + "epoch": 1.81, + "grad_norm": 1.8278526441263347, + "learning_rate": 9.459553586360998e-08, + "loss": 0.3418, + "step": 2201 + }, + { + "epoch": 1.81, + "grad_norm": 1.9429462293638, + "learning_rate": 9.370665113817206e-08, + "loss": 0.3986, + "step": 2202 + }, + { + "epoch": 1.81, + "grad_norm": 1.8143835995774138, + "learning_rate": 9.282188266751341e-08, + "loss": 0.3367, + "step": 2203 + }, + { + "epoch": 1.81, + "grad_norm": 2.0092496115099596, + "learning_rate": 9.194123196513776e-08, + "loss": 0.345, + "step": 2204 + }, + { + "epoch": 1.81, + "grad_norm": 1.9645251631663325, + "learning_rate": 9.106470053750371e-08, + "loss": 0.3701, + "step": 2205 + }, + { + "epoch": 1.81, + "grad_norm": 1.866996771603958, + "learning_rate": 9.019228988402406e-08, + "loss": 0.3366, + "step": 2206 + }, + { + "epoch": 1.81, + "grad_norm": 1.9147717656924788, + "learning_rate": 8.932400149706227e-08, + "loss": 0.3575, + "step": 2207 + }, + { + "epoch": 1.81, + "grad_norm": 1.9482208231738272, + "learning_rate": 8.84598368619305e-08, + "loss": 0.3414, + "step": 2208 + }, + { + "epoch": 1.81, + "grad_norm": 1.8233257333540858, + "learning_rate": 8.759979745688623e-08, + "loss": 0.3318, + "step": 2209 + }, + { + "epoch": 1.81, + "grad_norm": 1.8579605341403027, + "learning_rate": 8.674388475313073e-08, + "loss": 0.3771, + "step": 2210 + }, + { + "epoch": 1.82, + "grad_norm": 1.8725714643585427, + "learning_rate": 8.589210021480581e-08, + "loss": 0.406, + "step": 2211 + }, + { + "epoch": 1.82, + "grad_norm": 1.7878785317641386, + "learning_rate": 8.504444529899153e-08, + "loss": 0.341, + "step": 2212 + }, + { + "epoch": 1.82, + "grad_norm": 1.9092348427688413, + "learning_rate": 8.420092145570408e-08, + "loss": 0.3627, + "step": 2213 + }, + { + "epoch": 1.82, + "grad_norm": 1.931178088082703, + "learning_rate": 8.3361530127892e-08, + "loss": 0.3368, + "step": 2214 + }, + { + "epoch": 1.82, + "grad_norm": 1.8737770088221923, + "learning_rate": 8.252627275143587e-08, + "loss": 0.3306, + "step": 2215 + }, + { + "epoch": 1.82, + "grad_norm": 1.868328247002196, + "learning_rate": 8.16951507551439e-08, + "loss": 0.339, + "step": 2216 + }, + { + "epoch": 1.82, + "grad_norm": 1.7549812055184346, + "learning_rate": 8.086816556075045e-08, + "loss": 0.3324, + "step": 2217 + }, + { + "epoch": 1.82, + "grad_norm": 1.9136663871575121, + "learning_rate": 8.00453185829131e-08, + "loss": 0.3783, + "step": 2218 + }, + { + "epoch": 1.82, + "grad_norm": 1.804706334856571, + "learning_rate": 7.922661122921116e-08, + "loss": 0.3293, + "step": 2219 + }, + { + "epoch": 1.82, + "grad_norm": 1.8409617616372291, + "learning_rate": 7.841204490014215e-08, + "loss": 0.3381, + "step": 2220 + }, + { + "epoch": 1.82, + "grad_norm": 1.8083991705398357, + "learning_rate": 7.760162098911978e-08, + "loss": 0.3463, + "step": 2221 + }, + { + "epoch": 1.82, + "grad_norm": 22.560759057837398, + "learning_rate": 7.679534088247231e-08, + "loss": 0.4184, + "step": 2222 + }, + { + "epoch": 1.83, + "grad_norm": 3.0344748163153143, + "learning_rate": 7.599320595943815e-08, + "loss": 0.3611, + "step": 2223 + }, + { + "epoch": 1.83, + "grad_norm": 1.8265541046231377, + "learning_rate": 7.519521759216691e-08, + "loss": 0.3513, + "step": 2224 + }, + { + "epoch": 1.83, + "grad_norm": 1.8696576868008075, + "learning_rate": 7.440137714571277e-08, + "loss": 0.3466, + "step": 2225 + }, + { + "epoch": 1.83, + "grad_norm": 1.8962707956349882, + "learning_rate": 7.361168597803614e-08, + "loss": 0.3665, + "step": 2226 + }, + { + "epoch": 1.83, + "grad_norm": 1.845514555585491, + "learning_rate": 7.282614543999867e-08, + "loss": 0.344, + "step": 2227 + }, + { + "epoch": 1.83, + "grad_norm": 1.87201142236449, + "learning_rate": 7.204475687536238e-08, + "loss": 0.3986, + "step": 2228 + }, + { + "epoch": 1.83, + "grad_norm": 1.8581673469597215, + "learning_rate": 7.126752162078643e-08, + "loss": 0.3234, + "step": 2229 + }, + { + "epoch": 1.83, + "grad_norm": 1.8315260718739734, + "learning_rate": 7.049444100582503e-08, + "loss": 0.3214, + "step": 2230 + }, + { + "epoch": 1.83, + "grad_norm": 1.7806104471214492, + "learning_rate": 6.972551635292618e-08, + "loss": 0.3229, + "step": 2231 + }, + { + "epoch": 1.83, + "grad_norm": 1.7994027257586662, + "learning_rate": 6.896074897742827e-08, + "loss": 0.3583, + "step": 2232 + }, + { + "epoch": 1.83, + "grad_norm": 1.7550904179180664, + "learning_rate": 6.820014018755761e-08, + "loss": 0.3191, + "step": 2233 + }, + { + "epoch": 1.83, + "grad_norm": 1.937456980479283, + "learning_rate": 6.744369128442785e-08, + "loss": 0.3771, + "step": 2234 + }, + { + "epoch": 1.84, + "grad_norm": 1.7707253123670128, + "learning_rate": 6.66914035620353e-08, + "loss": 0.3286, + "step": 2235 + }, + { + "epoch": 1.84, + "grad_norm": 1.8227176243965777, + "learning_rate": 6.594327830725916e-08, + "loss": 0.3634, + "step": 2236 + }, + { + "epoch": 1.84, + "grad_norm": 1.8234900630660766, + "learning_rate": 6.51993167998577e-08, + "loss": 0.3655, + "step": 2237 + }, + { + "epoch": 1.84, + "grad_norm": 1.9640311575750178, + "learning_rate": 6.445952031246678e-08, + "loss": 0.3801, + "step": 2238 + }, + { + "epoch": 1.84, + "grad_norm": 1.8231576961256921, + "learning_rate": 6.372389011059743e-08, + "loss": 0.349, + "step": 2239 + }, + { + "epoch": 1.84, + "grad_norm": 1.8533851518368958, + "learning_rate": 6.299242745263445e-08, + "loss": 0.354, + "step": 2240 + }, + { + "epoch": 1.84, + "grad_norm": 1.799650055342726, + "learning_rate": 6.226513358983166e-08, + "loss": 0.3854, + "step": 2241 + }, + { + "epoch": 1.84, + "grad_norm": 1.832735461763389, + "learning_rate": 6.154200976631358e-08, + "loss": 0.3495, + "step": 2242 + }, + { + "epoch": 1.84, + "grad_norm": 2.0635623794685487, + "learning_rate": 6.082305721907044e-08, + "loss": 0.4287, + "step": 2243 + }, + { + "epoch": 1.84, + "grad_norm": 1.8920812885996825, + "learning_rate": 6.010827717795736e-08, + "loss": 0.3444, + "step": 2244 + }, + { + "epoch": 1.84, + "grad_norm": 1.8850298512782395, + "learning_rate": 5.9397670865691813e-08, + "loss": 0.3713, + "step": 2245 + }, + { + "epoch": 1.84, + "grad_norm": 1.9351828186998583, + "learning_rate": 5.8691239497851436e-08, + "loss": 0.406, + "step": 2246 + }, + { + "epoch": 1.85, + "grad_norm": 1.784816624778513, + "learning_rate": 5.7988984282872085e-08, + "loss": 0.3607, + "step": 2247 + }, + { + "epoch": 1.85, + "grad_norm": 1.826165368757773, + "learning_rate": 5.729090642204615e-08, + "loss": 0.3238, + "step": 2248 + }, + { + "epoch": 1.85, + "grad_norm": 1.8992022791618013, + "learning_rate": 5.659700710951982e-08, + "loss": 0.3437, + "step": 2249 + }, + { + "epoch": 1.85, + "grad_norm": 1.8504942451770303, + "learning_rate": 5.5907287532291354e-08, + "loss": 0.344, + "step": 2250 + }, + { + "epoch": 1.85, + "grad_norm": 1.8238313969289717, + "learning_rate": 5.5221748870209756e-08, + "loss": 0.3805, + "step": 2251 + }, + { + "epoch": 1.85, + "grad_norm": 1.8659566459192327, + "learning_rate": 5.4540392295971136e-08, + "loss": 0.3322, + "step": 2252 + }, + { + "epoch": 1.85, + "grad_norm": 1.8525482310150154, + "learning_rate": 5.386321897511787e-08, + "loss": 0.3229, + "step": 2253 + }, + { + "epoch": 1.85, + "grad_norm": 1.8318342126706884, + "learning_rate": 5.319023006603668e-08, + "loss": 0.3552, + "step": 2254 + }, + { + "epoch": 1.85, + "grad_norm": 1.9361724490090373, + "learning_rate": 5.252142671995669e-08, + "loss": 0.3742, + "step": 2255 + }, + { + "epoch": 1.85, + "grad_norm": 1.9026081066119354, + "learning_rate": 5.185681008094579e-08, + "loss": 0.3313, + "step": 2256 + }, + { + "epoch": 1.85, + "grad_norm": 1.9824621364268147, + "learning_rate": 5.119638128591148e-08, + "loss": 0.3723, + "step": 2257 + }, + { + "epoch": 1.85, + "grad_norm": 1.8800250495832693, + "learning_rate": 5.0540141464596185e-08, + "loss": 0.3695, + "step": 2258 + }, + { + "epoch": 1.86, + "grad_norm": 1.7729891099483797, + "learning_rate": 4.988809173957804e-08, + "loss": 0.3487, + "step": 2259 + }, + { + "epoch": 1.86, + "grad_norm": 1.814679974377573, + "learning_rate": 4.924023322626592e-08, + "loss": 0.3689, + "step": 2260 + }, + { + "epoch": 1.86, + "grad_norm": 1.8464925551585676, + "learning_rate": 4.8596567032900274e-08, + "loss": 0.3539, + "step": 2261 + }, + { + "epoch": 1.86, + "grad_norm": 1.8903105581775805, + "learning_rate": 4.7957094260549784e-08, + "loss": 0.364, + "step": 2262 + }, + { + "epoch": 1.86, + "grad_norm": 1.805466964295455, + "learning_rate": 4.7321816003109424e-08, + "loss": 0.3628, + "step": 2263 + }, + { + "epoch": 1.86, + "grad_norm": 1.8143326259659651, + "learning_rate": 4.6690733347299624e-08, + "loss": 0.3661, + "step": 2264 + }, + { + "epoch": 1.86, + "grad_norm": 1.843803583791867, + "learning_rate": 4.6063847372662676e-08, + "loss": 0.3458, + "step": 2265 + }, + { + "epoch": 1.86, + "grad_norm": 1.908507563592907, + "learning_rate": 4.5441159151563275e-08, + "loss": 0.4031, + "step": 2266 + }, + { + "epoch": 1.86, + "grad_norm": 1.826011543168382, + "learning_rate": 4.4822669749184364e-08, + "loss": 0.3152, + "step": 2267 + }, + { + "epoch": 1.86, + "grad_norm": 1.8153710037318442, + "learning_rate": 4.420838022352631e-08, + "loss": 0.3342, + "step": 2268 + }, + { + "epoch": 1.86, + "grad_norm": 1.9759369609394648, + "learning_rate": 4.359829162540574e-08, + "loss": 0.3566, + "step": 2269 + }, + { + "epoch": 1.86, + "grad_norm": 1.836606089281238, + "learning_rate": 4.2992404998452867e-08, + "loss": 0.3373, + "step": 2270 + }, + { + "epoch": 1.87, + "grad_norm": 1.7419439850247866, + "learning_rate": 4.2390721379109434e-08, + "loss": 0.3258, + "step": 2271 + }, + { + "epoch": 1.87, + "grad_norm": 1.8113632431171702, + "learning_rate": 4.1793241796627694e-08, + "loss": 0.3396, + "step": 2272 + }, + { + "epoch": 1.87, + "grad_norm": 1.782592709936038, + "learning_rate": 4.119996727306896e-08, + "loss": 0.3166, + "step": 2273 + }, + { + "epoch": 1.87, + "grad_norm": 1.8240157047778767, + "learning_rate": 4.0610898823300605e-08, + "loss": 0.3698, + "step": 2274 + }, + { + "epoch": 1.87, + "grad_norm": 1.9054984374125201, + "learning_rate": 4.0026037454995446e-08, + "loss": 0.3836, + "step": 2275 + }, + { + "epoch": 1.87, + "grad_norm": 1.7353865384456557, + "learning_rate": 3.9445384168628474e-08, + "loss": 0.3253, + "step": 2276 + }, + { + "epoch": 1.87, + "grad_norm": 1.9177471239749773, + "learning_rate": 3.88689399574782e-08, + "loss": 0.3705, + "step": 2277 + }, + { + "epoch": 1.87, + "grad_norm": 1.767304846703738, + "learning_rate": 3.8296705807621124e-08, + "loss": 0.3394, + "step": 2278 + }, + { + "epoch": 1.87, + "grad_norm": 1.8307581850341796, + "learning_rate": 3.772868269793312e-08, + "loss": 0.3742, + "step": 2279 + }, + { + "epoch": 1.87, + "grad_norm": 2.2094772282591357, + "learning_rate": 3.716487160008608e-08, + "loss": 0.3737, + "step": 2280 + }, + { + "epoch": 1.87, + "grad_norm": 1.831225952922472, + "learning_rate": 3.660527347854687e-08, + "loss": 0.3355, + "step": 2281 + }, + { + "epoch": 1.87, + "grad_norm": 1.8903588912458535, + "learning_rate": 3.604988929057529e-08, + "loss": 0.3504, + "step": 2282 + }, + { + "epoch": 1.88, + "grad_norm": 1.831527380801516, + "learning_rate": 3.549871998622334e-08, + "loss": 0.3622, + "step": 2283 + }, + { + "epoch": 1.88, + "grad_norm": 1.8110423459241198, + "learning_rate": 3.4951766508332377e-08, + "loss": 0.3578, + "step": 2284 + }, + { + "epoch": 1.88, + "grad_norm": 1.8576409618357517, + "learning_rate": 3.440902979253202e-08, + "loss": 0.3385, + "step": 2285 + }, + { + "epoch": 1.88, + "grad_norm": 1.7451272854809423, + "learning_rate": 3.387051076723907e-08, + "loss": 0.3799, + "step": 2286 + }, + { + "epoch": 1.88, + "grad_norm": 1.8297163144764026, + "learning_rate": 3.333621035365525e-08, + "loss": 0.3593, + "step": 2287 + }, + { + "epoch": 1.88, + "grad_norm": 1.85432730355764, + "learning_rate": 3.280612946576556e-08, + "loss": 0.3639, + "step": 2288 + }, + { + "epoch": 1.88, + "grad_norm": 1.7735697250412514, + "learning_rate": 3.2280269010337427e-08, + "loss": 0.3617, + "step": 2289 + }, + { + "epoch": 1.88, + "grad_norm": 1.7625079229313618, + "learning_rate": 3.175862988691852e-08, + "loss": 0.329, + "step": 2290 + }, + { + "epoch": 1.88, + "grad_norm": 2.105587141461978, + "learning_rate": 3.1241212987835614e-08, + "loss": 0.4002, + "step": 2291 + }, + { + "epoch": 1.88, + "grad_norm": 1.8530942331843046, + "learning_rate": 3.072801919819235e-08, + "loss": 0.3574, + "step": 2292 + }, + { + "epoch": 1.88, + "grad_norm": 1.8187789742996616, + "learning_rate": 3.021904939586873e-08, + "loss": 0.3385, + "step": 2293 + }, + { + "epoch": 1.88, + "grad_norm": 1.8408002748586405, + "learning_rate": 2.971430445151885e-08, + "loss": 0.3443, + "step": 2294 + }, + { + "epoch": 1.88, + "grad_norm": 1.8528110336406811, + "learning_rate": 2.9213785228569823e-08, + "loss": 0.3298, + "step": 2295 + }, + { + "epoch": 1.89, + "grad_norm": 1.7149723027801804, + "learning_rate": 2.8717492583220095e-08, + "loss": 0.3607, + "step": 2296 + }, + { + "epoch": 1.89, + "grad_norm": 1.9360256295655223, + "learning_rate": 2.8225427364438063e-08, + "loss": 0.3833, + "step": 2297 + }, + { + "epoch": 1.89, + "grad_norm": 1.8530219977927616, + "learning_rate": 2.773759041396068e-08, + "loss": 0.3414, + "step": 2298 + }, + { + "epoch": 1.89, + "grad_norm": 1.7555751947021212, + "learning_rate": 2.7253982566291525e-08, + "loss": 0.376, + "step": 2299 + }, + { + "epoch": 1.89, + "grad_norm": 1.8257207937080222, + "learning_rate": 2.677460464870024e-08, + "loss": 0.3375, + "step": 2300 + }, + { + "epoch": 1.89, + "grad_norm": 1.808041093231935, + "learning_rate": 2.629945748122087e-08, + "loss": 0.3259, + "step": 2301 + }, + { + "epoch": 1.89, + "grad_norm": 1.844719556713403, + "learning_rate": 2.5828541876649628e-08, + "loss": 0.32, + "step": 2302 + }, + { + "epoch": 1.89, + "grad_norm": 1.8627811857138301, + "learning_rate": 2.5361858640544357e-08, + "loss": 0.3467, + "step": 2303 + }, + { + "epoch": 1.89, + "grad_norm": 1.775658122498733, + "learning_rate": 2.489940857122314e-08, + "loss": 0.3438, + "step": 2304 + }, + { + "epoch": 1.89, + "grad_norm": 2.0652341897824873, + "learning_rate": 2.4441192459762342e-08, + "loss": 0.4325, + "step": 2305 + }, + { + "epoch": 1.89, + "grad_norm": 1.8808828610296175, + "learning_rate": 2.3987211089996075e-08, + "loss": 0.3599, + "step": 2306 + }, + { + "epoch": 1.89, + "grad_norm": 1.8324673797834097, + "learning_rate": 2.3537465238513966e-08, + "loss": 0.3293, + "step": 2307 + }, + { + "epoch": 1.9, + "grad_norm": 1.788314595093782, + "learning_rate": 2.3091955674660606e-08, + "loss": 0.374, + "step": 2308 + }, + { + "epoch": 1.9, + "grad_norm": 1.9054450644102343, + "learning_rate": 2.265068316053387e-08, + "loss": 0.3817, + "step": 2309 + }, + { + "epoch": 1.9, + "grad_norm": 1.7959481618901236, + "learning_rate": 2.2213648450983284e-08, + "loss": 0.3293, + "step": 2310 + }, + { + "epoch": 1.9, + "grad_norm": 1.829622659307099, + "learning_rate": 2.178085229360999e-08, + "loss": 0.3623, + "step": 2311 + }, + { + "epoch": 1.9, + "grad_norm": 1.7576464280597102, + "learning_rate": 2.1352295428763435e-08, + "loss": 0.318, + "step": 2312 + }, + { + "epoch": 1.9, + "grad_norm": 1.7741716877498643, + "learning_rate": 2.092797858954193e-08, + "loss": 0.3561, + "step": 2313 + }, + { + "epoch": 1.9, + "grad_norm": 1.7225396697993693, + "learning_rate": 2.050790250179041e-08, + "loss": 0.3088, + "step": 2314 + }, + { + "epoch": 1.9, + "grad_norm": 1.783671372833311, + "learning_rate": 2.0092067884100175e-08, + "loss": 0.3378, + "step": 2315 + }, + { + "epoch": 1.9, + "grad_norm": 1.878137104399538, + "learning_rate": 1.9680475447805826e-08, + "loss": 0.3528, + "step": 2316 + }, + { + "epoch": 1.9, + "grad_norm": 1.7570308196145663, + "learning_rate": 1.9273125896986378e-08, + "loss": 0.3266, + "step": 2317 + }, + { + "epoch": 1.9, + "grad_norm": 1.8198609878561622, + "learning_rate": 1.8870019928461936e-08, + "loss": 0.3866, + "step": 2318 + }, + { + "epoch": 1.9, + "grad_norm": 1.9491640718315564, + "learning_rate": 1.8471158231793962e-08, + "loss": 0.3731, + "step": 2319 + }, + { + "epoch": 1.91, + "grad_norm": 2.0157828598711407, + "learning_rate": 1.807654148928334e-08, + "loss": 0.3225, + "step": 2320 + }, + { + "epoch": 1.91, + "grad_norm": 1.830799493750694, + "learning_rate": 1.7686170375969813e-08, + "loss": 0.3378, + "step": 2321 + }, + { + "epoch": 1.91, + "grad_norm": 1.776821773854751, + "learning_rate": 1.7300045559630053e-08, + "loss": 0.3426, + "step": 2322 + }, + { + "epoch": 1.91, + "grad_norm": 1.8331771417191418, + "learning_rate": 1.691816770077709e-08, + "loss": 0.3879, + "step": 2323 + }, + { + "epoch": 1.91, + "grad_norm": 1.8367134813568278, + "learning_rate": 1.654053745265921e-08, + "loss": 0.3577, + "step": 2324 + }, + { + "epoch": 1.91, + "grad_norm": 1.8655517772206949, + "learning_rate": 1.6167155461258298e-08, + "loss": 0.3106, + "step": 2325 + }, + { + "epoch": 1.91, + "grad_norm": 1.7984794341036208, + "learning_rate": 1.5798022365289544e-08, + "loss": 0.3339, + "step": 2326 + }, + { + "epoch": 1.91, + "grad_norm": 1.8046596260708332, + "learning_rate": 1.5433138796198954e-08, + "loss": 0.3514, + "step": 2327 + }, + { + "epoch": 1.91, + "grad_norm": 1.80328251848771, + "learning_rate": 1.5072505378164182e-08, + "loss": 0.3813, + "step": 2328 + }, + { + "epoch": 1.91, + "grad_norm": 1.8143267874334106, + "learning_rate": 1.4716122728092586e-08, + "loss": 0.3705, + "step": 2329 + }, + { + "epoch": 1.91, + "grad_norm": 1.8115674095692667, + "learning_rate": 1.4363991455619008e-08, + "loss": 0.358, + "step": 2330 + }, + { + "epoch": 1.91, + "grad_norm": 1.9202718564659147, + "learning_rate": 1.401611216310661e-08, + "loss": 0.4294, + "step": 2331 + }, + { + "epoch": 1.92, + "grad_norm": 1.900298805533325, + "learning_rate": 1.36724854456452e-08, + "loss": 0.3429, + "step": 2332 + }, + { + "epoch": 1.92, + "grad_norm": 1.8195483339075496, + "learning_rate": 1.3333111891049023e-08, + "loss": 0.3788, + "step": 2333 + }, + { + "epoch": 1.92, + "grad_norm": 1.7931061906479377, + "learning_rate": 1.2997992079858135e-08, + "loss": 0.3483, + "step": 2334 + }, + { + "epoch": 1.92, + "grad_norm": 1.740931180103152, + "learning_rate": 1.266712658533481e-08, + "loss": 0.353, + "step": 2335 + }, + { + "epoch": 1.92, + "grad_norm": 1.8508839449199255, + "learning_rate": 1.2340515973464917e-08, + "loss": 0.4318, + "step": 2336 + }, + { + "epoch": 1.92, + "grad_norm": 1.7123271515142926, + "learning_rate": 1.2018160802954592e-08, + "loss": 0.3511, + "step": 2337 + }, + { + "epoch": 1.92, + "grad_norm": 1.8655323178391352, + "learning_rate": 1.170006162523163e-08, + "loss": 0.3334, + "step": 2338 + }, + { + "epoch": 1.92, + "grad_norm": 1.8116660117809809, + "learning_rate": 1.1386218984443253e-08, + "loss": 0.3517, + "step": 2339 + }, + { + "epoch": 1.92, + "grad_norm": 1.7942153746632152, + "learning_rate": 1.1076633417454463e-08, + "loss": 0.3172, + "step": 2340 + }, + { + "epoch": 1.92, + "grad_norm": 1.7673253216996463, + "learning_rate": 1.0771305453849134e-08, + "loss": 0.3634, + "step": 2341 + }, + { + "epoch": 1.92, + "grad_norm": 1.8407979709727775, + "learning_rate": 1.0470235615927526e-08, + "loss": 0.3381, + "step": 2342 + }, + { + "epoch": 1.92, + "grad_norm": 1.8972203790540076, + "learning_rate": 1.0173424418705724e-08, + "loss": 0.4329, + "step": 2343 + }, + { + "epoch": 1.93, + "grad_norm": 1.7916957004045877, + "learning_rate": 9.880872369915362e-09, + "loss": 0.319, + "step": 2344 + }, + { + "epoch": 1.93, + "grad_norm": 1.8307160856502611, + "learning_rate": 9.592579970001404e-09, + "loss": 0.3661, + "step": 2345 + }, + { + "epoch": 1.93, + "grad_norm": 1.8705249856696158, + "learning_rate": 9.30854771212325e-09, + "loss": 0.346, + "step": 2346 + }, + { + "epoch": 1.93, + "grad_norm": 1.9255943662811519, + "learning_rate": 9.028776082152246e-09, + "loss": 0.3957, + "step": 2347 + }, + { + "epoch": 1.93, + "grad_norm": 1.8466026013927237, + "learning_rate": 8.75326555867112e-09, + "loss": 0.3208, + "step": 2348 + }, + { + "epoch": 1.93, + "grad_norm": 2.188566840717273, + "learning_rate": 8.482016612974265e-09, + "loss": 0.3949, + "step": 2349 + }, + { + "epoch": 1.93, + "grad_norm": 1.8661101622885223, + "learning_rate": 8.215029709065515e-09, + "loss": 0.3837, + "step": 2350 + }, + { + "epoch": 1.93, + "grad_norm": 1.8861120420225161, + "learning_rate": 7.952305303658147e-09, + "loss": 0.3582, + "step": 2351 + }, + { + "epoch": 1.93, + "grad_norm": 1.813038172439941, + "learning_rate": 7.693843846174055e-09, + "loss": 0.3784, + "step": 2352 + }, + { + "epoch": 1.93, + "grad_norm": 1.732272473538418, + "learning_rate": 7.439645778742344e-09, + "loss": 0.3083, + "step": 2353 + }, + { + "epoch": 1.93, + "grad_norm": 1.7583227773146088, + "learning_rate": 7.189711536199906e-09, + "loss": 0.3407, + "step": 2354 + }, + { + "epoch": 1.93, + "grad_norm": 1.9598993214652383, + "learning_rate": 6.944041546088909e-09, + "loss": 0.3442, + "step": 2355 + }, + { + "epoch": 1.94, + "grad_norm": 1.920153009242741, + "learning_rate": 6.702636228657911e-09, + "loss": 0.3507, + "step": 2356 + }, + { + "epoch": 1.94, + "grad_norm": 1.93057180027477, + "learning_rate": 6.465495996859639e-09, + "loss": 0.3528, + "step": 2357 + }, + { + "epoch": 1.94, + "grad_norm": 1.757464657460147, + "learning_rate": 6.2326212563507126e-09, + "loss": 0.31, + "step": 2358 + }, + { + "epoch": 1.94, + "grad_norm": 1.793945156305048, + "learning_rate": 6.004012405492199e-09, + "loss": 0.3255, + "step": 2359 + }, + { + "epoch": 1.94, + "grad_norm": 1.8164806685160202, + "learning_rate": 5.77966983534628e-09, + "loss": 0.3363, + "step": 2360 + }, + { + "epoch": 1.94, + "grad_norm": 1.7782525894382621, + "learning_rate": 5.5595939296784755e-09, + "loss": 0.3416, + "step": 2361 + }, + { + "epoch": 1.94, + "grad_norm": 1.832295574318913, + "learning_rate": 5.343785064954865e-09, + "loss": 0.3628, + "step": 2362 + }, + { + "epoch": 1.94, + "grad_norm": 1.729926329285944, + "learning_rate": 5.132243610342924e-09, + "loss": 0.3362, + "step": 2363 + }, + { + "epoch": 1.94, + "grad_norm": 1.8177287184944615, + "learning_rate": 4.9249699277093e-09, + "loss": 0.344, + "step": 2364 + }, + { + "epoch": 1.94, + "grad_norm": 1.8317448819287787, + "learning_rate": 4.721964371620924e-09, + "loss": 0.4136, + "step": 2365 + }, + { + "epoch": 1.94, + "grad_norm": 1.7302394089099904, + "learning_rate": 4.523227289343068e-09, + "loss": 0.3212, + "step": 2366 + }, + { + "epoch": 1.94, + "grad_norm": 1.8449995563751413, + "learning_rate": 4.328759020839624e-09, + "loss": 0.3686, + "step": 2367 + }, + { + "epoch": 1.95, + "grad_norm": 1.8177433334210171, + "learning_rate": 4.138559898771988e-09, + "loss": 0.3923, + "step": 2368 + }, + { + "epoch": 1.95, + "grad_norm": 1.8895824040710925, + "learning_rate": 3.95263024849879e-09, + "loss": 0.4414, + "step": 2369 + }, + { + "epoch": 1.95, + "grad_norm": 1.8778393603343977, + "learning_rate": 3.7709703880747795e-09, + "loss": 0.351, + "step": 2370 + }, + { + "epoch": 1.95, + "grad_norm": 1.8123035152551727, + "learning_rate": 3.5935806282511032e-09, + "loss": 0.3596, + "step": 2371 + }, + { + "epoch": 1.95, + "grad_norm": 1.8483506706663495, + "learning_rate": 3.4204612724744736e-09, + "loss": 0.3666, + "step": 2372 + }, + { + "epoch": 1.95, + "grad_norm": 1.9665278586805264, + "learning_rate": 3.2516126168866123e-09, + "loss": 0.3447, + "step": 2373 + }, + { + "epoch": 1.95, + "grad_norm": 1.9266775035122918, + "learning_rate": 3.0870349503231424e-09, + "loss": 0.3763, + "step": 2374 + }, + { + "epoch": 1.95, + "grad_norm": 1.8202142350172061, + "learning_rate": 2.92672855431414e-09, + "loss": 0.3275, + "step": 2375 + }, + { + "epoch": 1.95, + "grad_norm": 1.8013001627180567, + "learning_rate": 2.7706937030827495e-09, + "loss": 0.3557, + "step": 2376 + }, + { + "epoch": 1.95, + "grad_norm": 1.906524152202692, + "learning_rate": 2.6189306635460156e-09, + "loss": 0.3548, + "step": 2377 + }, + { + "epoch": 1.95, + "grad_norm": 1.8346090500701675, + "learning_rate": 2.471439695312383e-09, + "loss": 0.3672, + "step": 2378 + }, + { + "epoch": 1.95, + "grad_norm": 1.823882086144735, + "learning_rate": 2.328221050683088e-09, + "loss": 0.3349, + "step": 2379 + }, + { + "epoch": 1.96, + "grad_norm": 1.783279345835604, + "learning_rate": 2.189274974650768e-09, + "loss": 0.3651, + "step": 2380 + }, + { + "epoch": 1.96, + "grad_norm": 1.8063215302334035, + "learning_rate": 2.0546017048994615e-09, + "loss": 0.3829, + "step": 2381 + }, + { + "epoch": 1.96, + "grad_norm": 1.9347786435697172, + "learning_rate": 1.924201471804332e-09, + "loss": 0.3535, + "step": 2382 + }, + { + "epoch": 1.96, + "grad_norm": 1.7349198388308216, + "learning_rate": 1.79807449843028e-09, + "loss": 0.3374, + "step": 2383 + }, + { + "epoch": 1.96, + "grad_norm": 1.8780406239663383, + "learning_rate": 1.6762210005330515e-09, + "loss": 0.3509, + "step": 2384 + }, + { + "epoch": 1.96, + "grad_norm": 1.8146143727655357, + "learning_rate": 1.558641186557297e-09, + "loss": 0.3994, + "step": 2385 + }, + { + "epoch": 1.96, + "grad_norm": 1.8397391761113984, + "learning_rate": 1.4453352576379587e-09, + "loss": 0.354, + "step": 2386 + }, + { + "epoch": 1.96, + "grad_norm": 1.7893297095614948, + "learning_rate": 1.3363034075980496e-09, + "loss": 0.3669, + "step": 2387 + }, + { + "epoch": 1.96, + "grad_norm": 1.7642311756811444, + "learning_rate": 1.2315458229500422e-09, + "loss": 0.3613, + "step": 2388 + }, + { + "epoch": 1.96, + "grad_norm": 1.9684442048645372, + "learning_rate": 1.1310626828942017e-09, + "loss": 0.3317, + "step": 2389 + }, + { + "epoch": 1.96, + "grad_norm": 1.7590264201884553, + "learning_rate": 1.034854159319143e-09, + "loss": 0.3164, + "step": 2390 + }, + { + "epoch": 1.96, + "grad_norm": 1.769739988978357, + "learning_rate": 9.42920416801274e-10, + "loss": 0.3105, + "step": 2391 + }, + { + "epoch": 1.97, + "grad_norm": 1.8187497681660625, + "learning_rate": 8.552616126039637e-10, + "loss": 0.3491, + "step": 2392 + }, + { + "epoch": 1.97, + "grad_norm": 1.8697839343972154, + "learning_rate": 7.718778966783746e-10, + "loss": 0.3381, + "step": 2393 + }, + { + "epoch": 1.97, + "grad_norm": 1.8195517409055602, + "learning_rate": 6.927694116623529e-10, + "loss": 0.3579, + "step": 2394 + }, + { + "epoch": 1.97, + "grad_norm": 1.957748954418363, + "learning_rate": 6.179362928804278e-10, + "loss": 0.3428, + "step": 2395 + }, + { + "epoch": 1.97, + "grad_norm": 1.7764755140707773, + "learning_rate": 5.473786683440896e-10, + "loss": 0.3572, + "step": 2396 + }, + { + "epoch": 1.97, + "grad_norm": 1.8254010648782029, + "learning_rate": 4.810966587501242e-10, + "loss": 0.3385, + "step": 2397 + }, + { + "epoch": 1.97, + "grad_norm": 2.398100665461426, + "learning_rate": 4.1909037748227856e-10, + "loss": 0.3683, + "step": 2398 + }, + { + "epoch": 1.97, + "grad_norm": 1.7370690524344994, + "learning_rate": 3.613599306093174e-10, + "loss": 0.3453, + "step": 2399 + }, + { + "epoch": 1.97, + "grad_norm": 1.9603879188454718, + "learning_rate": 3.079054168866891e-10, + "loss": 0.3446, + "step": 2400 + }, + { + "epoch": 1.97, + "grad_norm": 1.8244354138147691, + "learning_rate": 2.5872692775430474e-10, + "loss": 0.3107, + "step": 2401 + }, + { + "epoch": 1.97, + "grad_norm": 1.842040045340226, + "learning_rate": 2.1382454733848147e-10, + "loss": 0.3794, + "step": 2402 + }, + { + "epoch": 1.97, + "grad_norm": 1.8958611045361276, + "learning_rate": 1.7319835244944405e-10, + "loss": 0.3499, + "step": 2403 + }, + { + "epoch": 1.98, + "grad_norm": 1.795028879660856, + "learning_rate": 1.3684841258354564e-10, + "loss": 0.3571, + "step": 2404 + }, + { + "epoch": 1.98, + "grad_norm": 1.9239030192337772, + "learning_rate": 1.0477478992187984e-10, + "loss": 0.3801, + "step": 2405 + }, + { + "epoch": 1.98, + "grad_norm": 1.7943896830711028, + "learning_rate": 7.697753933000318e-11, + "loss": 0.3379, + "step": 2406 + }, + { + "epoch": 1.98, + "grad_norm": 1.8578795406276247, + "learning_rate": 5.345670835849026e-11, + "loss": 0.3335, + "step": 2407 + }, + { + "epoch": 1.98, + "grad_norm": 1.7907314686158868, + "learning_rate": 3.4212337242656156e-11, + "loss": 0.3253, + "step": 2408 + }, + { + "epoch": 1.98, + "grad_norm": 1.9623684204229361, + "learning_rate": 1.9244458902278884e-11, + "loss": 0.3983, + "step": 2409 + }, + { + "epoch": 1.98, + "grad_norm": 1.753382755747083, + "learning_rate": 8.553098941876947e-12, + "loss": 0.3433, + "step": 2410 + }, + { + "epoch": 1.98, + "grad_norm": 1.7839093116025138, + "learning_rate": 2.1382756498766753e-12, + "loss": 0.3743, + "step": 2411 + }, + { + "epoch": 1.98, + "grad_norm": 1.8247664664090246, + "learning_rate": 0.0, + "loss": 0.3419, + "step": 2412 + } + ], + "logging_steps": 1, + "max_steps": 2412, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 603, + "total_flos": 1136185615319040.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2412/training_args.bin b/checkpoint-2412/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e --- /dev/null +++ b/checkpoint-2412/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26 +size 7352 diff --git a/checkpoint-2412/zero_to_fp32.py b/checkpoint-2412/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b --- /dev/null +++ b/checkpoint-2412/zero_to_fp32.py @@ -0,0 +1,592 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/checkpoint-603/config.json b/checkpoint-603/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76 --- /dev/null +++ b/checkpoint-603/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "alpindale/Mistral-7B-v0.2-hf", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": false, + "vocab_size": 32002 +} diff --git a/checkpoint-603/generation_config.json b/checkpoint-603/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc --- /dev/null +++ b/checkpoint-603/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "transformers_version": "4.38.2" +} diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..448632be36b94fce9a238803682d75dec7ae51e9 --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59370e22148d3a8a1d2cd4624d7946cbd7616b76121082a13fe7734a7d5d02c5 +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83b19732a78854dc40d38802c3954036ef62416e --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53042893025c005ef60ddc8f3397c4183eccf3bb8d43ca6fe90b937586c5be3 +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6b1e266d33c0a472ceec67b3a3d494980842efd --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cef9cf818255a83cde92a08ece9ca1eff5aaaf294200103287c9446b18ba83cb +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca29e9a415c45ec9fb56a1a89ec2690bb2c3c3ac --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76a96afb8918f45759eb193305769f1d407427f79b37364fd4066fb832c4cf2b +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e7dccae6b525c60c58488c3a74da353d6da2766 --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f6f1b8a0c4a54dc38c3811a0160fe52885a7f82982e78eea9dbe8ec5be1799 +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e945055499c5d9be0813f06293afcf5a98002f01 --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af7f387005cd25db76e0929d2bdbe6eec2be1a8b08e284fffba3a2c6b3c71bb5 +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..438bb41c33684f1698f9f0afda965d75656e0e5a --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06d83b7316728ec5932771c7f7b3b8108c5ce6affa4f06d2f1909b490e0d58e +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..02def1d19549a11b77e6bd136f7cfe49743eb305 --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d58b1f6f3a57acccdd9de93ee37604f78994e878101c8d31e0b54ee51667710f +size 4831623435 diff --git a/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt b/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bce01e08b82b8e35be03f675f745a3edf7065967 --- /dev/null +++ b/checkpoint-603/global_step603/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e25c081ab69293c7624657196aa101e5f93da28fd53ce9c0ed7c2cde1e7bda5 +size 4831623435 diff --git a/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc86df6dbf064e329b44f0edd948821b403844b0 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f068ea32760e0b48dac2d6af9eac8a70cb1e844c7f3df607dba31b5b1a65bb +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbfe195ff56e36f822ae14568491e633e545b11b --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f012302b249642f3647f99b5acf92cbab815fb99a79368a02ab1aa7147d9cc9 +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb1befd2ac756cfc6de31025ff60c1b41f937042 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aac9dbbcd4643c54be734b072c8a79493529ef0c76048fb783945075bdd0f4a +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fec34785df33dbcc972ce4ec8fc10cda57918a6 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1df7d9f001aa84b5743fd10d6afb2a15e032818f728d246a9a55f62a90f78eb3 +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f877c57ebc122f734c9d8e6cb6dd6046f9e2b020 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_4_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549ad2aae080a1d987992d584bb537308f73c6734215531882ce33c6e440a523 +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..94f29c4943b63db704960aa3e56927c1bbeb28f2 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_5_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43eb9c0d5739614a0e52559181b5107c8b4a52fe8f2823b3a5fe055a536a2d27 +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e31e537ad39d7343e213fc51911cfe0044593db --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_6_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eedbfca9cde614f50defef9af4967b35a4f4ccf27e94df4ad2ff4be358848b54 +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c115461d604c2025df1f904be40e4931eccb289a --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_7_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada9a3962cbc0dd20da5343bc7f4aa055b4e752ab514423e555f7cbdbc2b3c2c +size 153829 diff --git a/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt b/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cf3a0cf0990a86c554e5e80f59085fe79d62d75 --- /dev/null +++ b/checkpoint-603/global_step603/zero_pp_rank_8_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40ba5b0013f52277f2af3d3822c99d8fdae8a69c1c3c7c3794949261d198b86 +size 153829 diff --git a/checkpoint-603/latest b/checkpoint-603/latest new file mode 100644 index 0000000000000000000000000000000000000000..79c0a3e5ad0a79110e06067b151d8c6b9a0aacd6 --- /dev/null +++ b/checkpoint-603/latest @@ -0,0 +1 @@ +global_step603 \ No newline at end of file diff --git a/checkpoint-603/model-00001-of-00003.safetensors b/checkpoint-603/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..125cf68fd8372f547ac2a310463684c5286d6917 --- /dev/null +++ b/checkpoint-603/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25433fce8850cf2c8bfb942d2859f8b1ae795d54153015bfda50c860636f2d33 +size 4943178720 diff --git a/checkpoint-603/model-00002-of-00003.safetensors b/checkpoint-603/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29f87439dd45776ba27a6ffd102bc711eec7c73d --- /dev/null +++ b/checkpoint-603/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4ab548de5173dd2f658babb32a5dc829cbec02780d3349d8fe38ea754d6e0e +size 4999819336 diff --git a/checkpoint-603/model-00003-of-00003.safetensors b/checkpoint-603/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..652488552dc46b5266ea673729d104b6c83f758e --- /dev/null +++ b/checkpoint-603/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:647792278f578fc90f83abfaef9375524293e96a59884f35e68895f545856809 +size 4540532728 diff --git a/checkpoint-603/model.safetensors.index.json b/checkpoint-603/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2 --- /dev/null +++ b/checkpoint-603/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 14483496960 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/checkpoint-603/rng_state_0.pth b/checkpoint-603/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ae643fef71bb5468722e041971c4fd10143dcde --- /dev/null +++ b/checkpoint-603/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78df38122b8b51b69a3cce1a8d8cb0f7d8684196dde8fb6d174ef0fd3440d89 +size 16240 diff --git a/checkpoint-603/rng_state_1.pth b/checkpoint-603/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dec857bd06d8263dc0d1f195ea4d4288bad4641 --- /dev/null +++ b/checkpoint-603/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499f46e15237a5856de1a8f0582d02e4319721d83140e01c31e9e1db92da7108 +size 16240 diff --git a/checkpoint-603/rng_state_2.pth b/checkpoint-603/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d57f4b1f904b392ef605de094c7e5171fced622 --- /dev/null +++ b/checkpoint-603/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b32ec8b414a3886bf179af827449dee557e95bfa64a7c20f26c186df2659c9f +size 16240 diff --git a/checkpoint-603/rng_state_3.pth b/checkpoint-603/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..4c8bebc9d459d1ed2d1ab4f27d7ec2da721d0445 --- /dev/null +++ b/checkpoint-603/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82765e3b8fb57ca7779e75617b51182226eed278593e6441a31510115950353d +size 16240 diff --git a/checkpoint-603/rng_state_4.pth b/checkpoint-603/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..71f7ca7b0554bc7702f1e276ae0cd3924ffba0d2 --- /dev/null +++ b/checkpoint-603/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2c24e041054b45b5bf8c50512ea8c4552e5f2e877fe798759dec7a7f3aae1 +size 16240 diff --git a/checkpoint-603/rng_state_5.pth b/checkpoint-603/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..2393f7d616bfb4cf0ab81957f29d35b455685a54 --- /dev/null +++ b/checkpoint-603/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b3e1210264272a2020cbcb79f6ade48528f5682dadcecb7a94805779548161 +size 16240 diff --git a/checkpoint-603/rng_state_6.pth b/checkpoint-603/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..46f8e8cc8551391d67e345af829445ad610b17a4 --- /dev/null +++ b/checkpoint-603/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:556ec0b910e14a1a5ab8fb6a1a16d525b89e31c69dd9b6cd8d4a4cccad65b546 +size 16240 diff --git a/checkpoint-603/rng_state_7.pth b/checkpoint-603/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..b0723b7d69eb2d78f3ee4bdd7f838269f3f845d1 --- /dev/null +++ b/checkpoint-603/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e830dc416886fe1aafeacfa75da6baacdbe9a61c66d2f1fbc11417753a516513 +size 16240 diff --git a/checkpoint-603/rng_state_8.pth b/checkpoint-603/rng_state_8.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9da906954a171d52c0afc8baea75914a9bb9a62 --- /dev/null +++ b/checkpoint-603/rng_state_8.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d7cb0002af3e22c063c6751b91836d7e06c4267f7ba8e1912c42d6867e4885 +size 16240 diff --git a/checkpoint-603/scheduler.pt b/checkpoint-603/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca7b6c97f0d9d22ad737fa4ce94633a5d89d4b35 --- /dev/null +++ b/checkpoint-603/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf517ae91e21a035522e0b4a4fedb4101eafa6a9cc5b1728a258fae8d83e6cb +size 1064 diff --git a/checkpoint-603/trainer_state.json b/checkpoint-603/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..14e538bf4467f81121288a2533b546025d0f9a26 --- /dev/null +++ b/checkpoint-603/trainer_state.json @@ -0,0 +1,4242 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4998963730569948, + "eval_steps": 500, + "global_step": 603, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 27.81778461909011, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7993, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 28.63833175363421, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.9056, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 25.646828828014854, + "learning_rate": 1.5e-06, + "loss": 0.8473, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.834124771941388, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8192, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 10.558095859980105, + "learning_rate": 2.5e-06, + "loss": 0.7943, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 7.905789045775758, + "learning_rate": 3e-06, + "loss": 0.7075, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.259519170268483, + "learning_rate": 3.5e-06, + "loss": 0.7537, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 6.639042051048664, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7471, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 8.515070932390074, + "learning_rate": 4.5e-06, + "loss": 0.7689, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 8.916410424632533, + "learning_rate": 5e-06, + "loss": 0.7194, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 4.835046497413255, + "learning_rate": 4.9999978617243506e-06, + "loss": 0.6949, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 10.065648500649479, + "learning_rate": 4.9999914469010585e-06, + "loss": 0.7039, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 5.299372887839679, + "learning_rate": 4.999980755541098e-06, + "loss": 0.7067, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 5.693110837094718, + "learning_rate": 4.999965787662758e-06, + "loss": 0.7126, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 2.983869635716314, + "learning_rate": 4.999946543291642e-06, + "loss": 0.6496, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 4.2561193962441175, + "learning_rate": 4.999923022460671e-06, + "loss": 0.7036, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 3.011772824968437, + "learning_rate": 4.999895225210079e-06, + "loss": 0.7009, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 3.386638415717137, + "learning_rate": 4.9998631515874165e-06, + "loss": 0.6624, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 3.764658092125165, + "learning_rate": 4.999826801647551e-06, + "loss": 0.6687, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 2.3982096117966614, + "learning_rate": 4.999786175452662e-06, + "loss": 0.706, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 2.8051633678260193, + "learning_rate": 4.999741273072246e-06, + "loss": 0.7031, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 3.1177784624332614, + "learning_rate": 4.999692094583114e-06, + "loss": 0.7525, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 2.2533819675617806, + "learning_rate": 4.9996386400693906e-06, + "loss": 0.6767, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 2.61893793162573, + "learning_rate": 4.999580909622518e-06, + "loss": 0.6432, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 2.76057623723569, + "learning_rate": 4.999518903341251e-06, + "loss": 0.6809, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 2.27983032069553, + "learning_rate": 4.999452621331657e-06, + "loss": 0.6798, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 2.501904568120582, + "learning_rate": 4.99938206370712e-06, + "loss": 0.6412, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 2.819229290729669, + "learning_rate": 4.999307230588338e-06, + "loss": 0.6188, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 2.1233212322022212, + "learning_rate": 4.9992281221033224e-06, + "loss": 0.6378, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 2.7806911906686755, + "learning_rate": 4.999144738387396e-06, + "loss": 0.6653, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 2.4045490257014563, + "learning_rate": 4.999057079583199e-06, + "loss": 0.6377, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 2.3803717769210446, + "learning_rate": 4.998965145840681e-06, + "loss": 0.6855, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 2.3976652879633473, + "learning_rate": 4.998868937317106e-06, + "loss": 0.6284, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 2.2958541157119727, + "learning_rate": 4.998768454177051e-06, + "loss": 0.6521, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 2.1925196833696154, + "learning_rate": 4.998663696592403e-06, + "loss": 0.6619, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 2.361006042901851, + "learning_rate": 4.998554664742362e-06, + "loss": 0.6155, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 2.1577758143653614, + "learning_rate": 4.998441358813443e-06, + "loss": 0.6398, + "step": 37 + }, + { + "epoch": 0.03, + "grad_norm": 2.219872074512664, + "learning_rate": 4.998323778999467e-06, + "loss": 0.6051, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 2.2907501521408546, + "learning_rate": 4.9982019255015705e-06, + "loss": 0.6337, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 2.1769862324666183, + "learning_rate": 4.9980757985281955e-06, + "loss": 0.6606, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 2.4252479779661607, + "learning_rate": 4.997945398295101e-06, + "loss": 0.6685, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 2.3929541982084657, + "learning_rate": 4.99781072502535e-06, + "loss": 0.6084, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 1.932539969840091, + "learning_rate": 4.997671778949318e-06, + "loss": 0.6123, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 2.191742541327873, + "learning_rate": 4.997528560304688e-06, + "loss": 0.6247, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 2.423376784566499, + "learning_rate": 4.997381069336455e-06, + "loss": 0.7024, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 2.0599055392481076, + "learning_rate": 4.997229306296918e-06, + "loss": 0.6612, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 2.16832922087532, + "learning_rate": 4.997073271445686e-06, + "loss": 0.5949, + "step": 47 + }, + { + "epoch": 0.04, + "grad_norm": 2.0483598654319453, + "learning_rate": 4.9969129650496775e-06, + "loss": 0.6406, + "step": 48 + }, + { + "epoch": 0.04, + "grad_norm": 1.963056609139284, + "learning_rate": 4.996748387383113e-06, + "loss": 0.6361, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 2.2094923844269307, + "learning_rate": 4.996579538727527e-06, + "loss": 0.5901, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 2.1088153449411857, + "learning_rate": 4.996406419371749e-06, + "loss": 0.6458, + "step": 51 + }, + { + "epoch": 0.04, + "grad_norm": 2.093448940617732, + "learning_rate": 4.996229029611926e-06, + "loss": 0.6509, + "step": 52 + }, + { + "epoch": 0.04, + "grad_norm": 2.075116207412987, + "learning_rate": 4.996047369751502e-06, + "loss": 0.6295, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 2.138141165277684, + "learning_rate": 4.995861440101229e-06, + "loss": 0.6088, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 2.186316382848445, + "learning_rate": 4.995671240979161e-06, + "loss": 0.6307, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 2.2513741083982195, + "learning_rate": 4.995476772710657e-06, + "loss": 0.6175, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 2.0827167336870596, + "learning_rate": 4.995278035628379e-06, + "loss": 0.5935, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 2.117977588574442, + "learning_rate": 4.995075030072291e-06, + "loss": 0.5998, + "step": 58 + }, + { + "epoch": 0.05, + "grad_norm": 2.0996940200235485, + "learning_rate": 4.994867756389658e-06, + "loss": 0.6159, + "step": 59 + }, + { + "epoch": 0.05, + "grad_norm": 2.141096165691323, + "learning_rate": 4.994656214935045e-06, + "loss": 0.6294, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 2.022748830058395, + "learning_rate": 4.994440406070323e-06, + "loss": 0.6315, + "step": 61 + }, + { + "epoch": 0.05, + "grad_norm": 2.209132168720991, + "learning_rate": 4.994220330164654e-06, + "loss": 0.5645, + "step": 62 + }, + { + "epoch": 0.05, + "grad_norm": 2.0994557317862674, + "learning_rate": 4.993995987594509e-06, + "loss": 0.6272, + "step": 63 + }, + { + "epoch": 0.05, + "grad_norm": 2.204220831053169, + "learning_rate": 4.99376737874365e-06, + "loss": 0.6379, + "step": 64 + }, + { + "epoch": 0.05, + "grad_norm": 2.127733932186697, + "learning_rate": 4.993534504003141e-06, + "loss": 0.622, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 2.1338506582034316, + "learning_rate": 4.993297363771342e-06, + "loss": 0.6259, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 2.104802764460729, + "learning_rate": 4.993055958453912e-06, + "loss": 0.6414, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 2.0889535347771675, + "learning_rate": 4.9928102884638004e-06, + "loss": 0.6466, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 2.252225316694296, + "learning_rate": 4.992560354221258e-06, + "loss": 0.6167, + "step": 69 + }, + { + "epoch": 0.06, + "grad_norm": 2.015392533516649, + "learning_rate": 4.992306156153827e-06, + "loss": 0.5958, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 2.151741408948778, + "learning_rate": 4.992047694696343e-06, + "loss": 0.5875, + "step": 71 + }, + { + "epoch": 0.06, + "grad_norm": 2.0351299117412696, + "learning_rate": 4.991784970290935e-06, + "loss": 0.5935, + "step": 72 + }, + { + "epoch": 0.06, + "grad_norm": 2.0000962363827983, + "learning_rate": 4.991517983387026e-06, + "loss": 0.6091, + "step": 73 + }, + { + "epoch": 0.06, + "grad_norm": 2.202881736102415, + "learning_rate": 4.99124673444133e-06, + "loss": 0.6122, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 2.015074773396151, + "learning_rate": 4.990971223917848e-06, + "loss": 0.6134, + "step": 75 + }, + { + "epoch": 0.06, + "grad_norm": 2.009305960567766, + "learning_rate": 4.990691452287877e-06, + "loss": 0.6308, + "step": 76 + }, + { + "epoch": 0.06, + "grad_norm": 1.9967884756310221, + "learning_rate": 4.990407420029999e-06, + "loss": 0.6098, + "step": 77 + }, + { + "epoch": 0.06, + "grad_norm": 2.0858738033925905, + "learning_rate": 4.990119127630085e-06, + "loss": 0.6344, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 1.9427707561903895, + "learning_rate": 4.989826575581295e-06, + "loss": 0.6049, + "step": 79 + }, + { + "epoch": 0.07, + "grad_norm": 2.157150584766853, + "learning_rate": 4.989529764384073e-06, + "loss": 0.5965, + "step": 80 + }, + { + "epoch": 0.07, + "grad_norm": 2.0303527419352583, + "learning_rate": 4.989228694546151e-06, + "loss": 0.6524, + "step": 81 + }, + { + "epoch": 0.07, + "grad_norm": 2.128799919475717, + "learning_rate": 4.988923366582546e-06, + "loss": 0.5524, + "step": 82 + }, + { + "epoch": 0.07, + "grad_norm": 2.0122786280510696, + "learning_rate": 4.988613781015557e-06, + "loss": 0.6268, + "step": 83 + }, + { + "epoch": 0.07, + "grad_norm": 2.104580177719229, + "learning_rate": 4.988299938374769e-06, + "loss": 0.6229, + "step": 84 + }, + { + "epoch": 0.07, + "grad_norm": 2.3894843860356834, + "learning_rate": 4.9879818391970455e-06, + "loss": 0.6194, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 1.9615211372441477, + "learning_rate": 4.9876594840265355e-06, + "loss": 0.6355, + "step": 86 + }, + { + "epoch": 0.07, + "grad_norm": 2.4509852093141937, + "learning_rate": 4.987332873414666e-06, + "loss": 0.6405, + "step": 87 + }, + { + "epoch": 0.07, + "grad_norm": 2.178942375285086, + "learning_rate": 4.987002007920142e-06, + "loss": 0.5593, + "step": 88 + }, + { + "epoch": 0.07, + "grad_norm": 2.2625634345900445, + "learning_rate": 4.9866668881089515e-06, + "loss": 0.6133, + "step": 89 + }, + { + "epoch": 0.07, + "grad_norm": 2.363092638811143, + "learning_rate": 4.986327514554356e-06, + "loss": 0.6298, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 2.0401982492138546, + "learning_rate": 4.985983887836894e-06, + "loss": 0.6276, + "step": 91 + }, + { + "epoch": 0.08, + "grad_norm": 2.276956647922478, + "learning_rate": 4.985636008544381e-06, + "loss": 0.5691, + "step": 92 + }, + { + "epoch": 0.08, + "grad_norm": 2.1072762844110233, + "learning_rate": 4.985283877271908e-06, + "loss": 0.6175, + "step": 93 + }, + { + "epoch": 0.08, + "grad_norm": 2.2931866879442637, + "learning_rate": 4.984927494621836e-06, + "loss": 0.6419, + "step": 94 + }, + { + "epoch": 0.08, + "grad_norm": 2.112474101166308, + "learning_rate": 4.984566861203801e-06, + "loss": 0.607, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 2.1816059679212634, + "learning_rate": 4.984201977634711e-06, + "loss": 0.6136, + "step": 96 + }, + { + "epoch": 0.08, + "grad_norm": 2.0620776369966554, + "learning_rate": 4.9838328445387415e-06, + "loss": 0.6372, + "step": 97 + }, + { + "epoch": 0.08, + "grad_norm": 2.147592836641578, + "learning_rate": 4.983459462547341e-06, + "loss": 0.606, + "step": 98 + }, + { + "epoch": 0.08, + "grad_norm": 2.1808001877062453, + "learning_rate": 4.983081832299224e-06, + "loss": 0.6019, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 2.3751999527114087, + "learning_rate": 4.98269995444037e-06, + "loss": 0.6021, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 1.8769470206406913, + "learning_rate": 4.98231382962403e-06, + "loss": 0.6082, + "step": 101 + }, + { + "epoch": 0.08, + "grad_norm": 2.3060925784921347, + "learning_rate": 4.981923458510717e-06, + "loss": 0.6174, + "step": 102 + }, + { + "epoch": 0.09, + "grad_norm": 2.1543176832473683, + "learning_rate": 4.981528841768206e-06, + "loss": 0.6092, + "step": 103 + }, + { + "epoch": 0.09, + "grad_norm": 2.1558689520522547, + "learning_rate": 4.981129980071538e-06, + "loss": 0.587, + "step": 104 + }, + { + "epoch": 0.09, + "grad_norm": 2.3830532005188383, + "learning_rate": 4.980726874103014e-06, + "loss": 0.6518, + "step": 105 + }, + { + "epoch": 0.09, + "grad_norm": 2.3333119576634767, + "learning_rate": 4.980319524552195e-06, + "loss": 0.6096, + "step": 106 + }, + { + "epoch": 0.09, + "grad_norm": 2.1135146855324214, + "learning_rate": 4.9799079321159e-06, + "loss": 0.5728, + "step": 107 + }, + { + "epoch": 0.09, + "grad_norm": 2.2300463384326394, + "learning_rate": 4.9794920974982095e-06, + "loss": 0.6563, + "step": 108 + }, + { + "epoch": 0.09, + "grad_norm": 2.1745234017525443, + "learning_rate": 4.979072021410458e-06, + "loss": 0.5968, + "step": 109 + }, + { + "epoch": 0.09, + "grad_norm": 2.1536586182562334, + "learning_rate": 4.978647704571237e-06, + "loss": 0.6189, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 2.193809374687326, + "learning_rate": 4.97821914770639e-06, + "loss": 0.5864, + "step": 111 + }, + { + "epoch": 0.09, + "grad_norm": 2.0525896373682047, + "learning_rate": 4.977786351549017e-06, + "loss": 0.6101, + "step": 112 + }, + { + "epoch": 0.09, + "grad_norm": 2.216099286618384, + "learning_rate": 4.977349316839467e-06, + "loss": 0.5984, + "step": 113 + }, + { + "epoch": 0.09, + "grad_norm": 2.155122255962579, + "learning_rate": 4.97690804432534e-06, + "loss": 0.6311, + "step": 114 + }, + { + "epoch": 0.1, + "grad_norm": 2.2972101190291374, + "learning_rate": 4.976462534761487e-06, + "loss": 0.5813, + "step": 115 + }, + { + "epoch": 0.1, + "grad_norm": 1.9925413745245948, + "learning_rate": 4.9760127889100044e-06, + "loss": 0.6157, + "step": 116 + }, + { + "epoch": 0.1, + "grad_norm": 2.2802548684036568, + "learning_rate": 4.975558807540238e-06, + "loss": 0.6079, + "step": 117 + }, + { + "epoch": 0.1, + "grad_norm": 2.048888007394621, + "learning_rate": 4.9751005914287775e-06, + "loss": 0.6467, + "step": 118 + }, + { + "epoch": 0.1, + "grad_norm": 2.28661640438254, + "learning_rate": 4.974638141359456e-06, + "loss": 0.6029, + "step": 119 + }, + { + "epoch": 0.1, + "grad_norm": 2.004056683755783, + "learning_rate": 4.974171458123351e-06, + "loss": 0.6289, + "step": 120 + }, + { + "epoch": 0.1, + "grad_norm": 2.1628470048067667, + "learning_rate": 4.97370054251878e-06, + "loss": 0.6139, + "step": 121 + }, + { + "epoch": 0.1, + "grad_norm": 2.056119895466544, + "learning_rate": 4.9732253953513e-06, + "loss": 0.5798, + "step": 122 + }, + { + "epoch": 0.1, + "grad_norm": 2.1716513163164275, + "learning_rate": 4.972746017433709e-06, + "loss": 0.6085, + "step": 123 + }, + { + "epoch": 0.1, + "grad_norm": 2.255856676525811, + "learning_rate": 4.97226240958604e-06, + "loss": 0.6342, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 2.1049280498075373, + "learning_rate": 4.971774572635563e-06, + "loss": 0.6197, + "step": 125 + }, + { + "epoch": 0.1, + "grad_norm": 2.133349390995361, + "learning_rate": 4.97128250741678e-06, + "loss": 0.5751, + "step": 126 + }, + { + "epoch": 0.11, + "grad_norm": 2.2044887467317578, + "learning_rate": 4.97078621477143e-06, + "loss": 0.6611, + "step": 127 + }, + { + "epoch": 0.11, + "grad_norm": 2.1413863795698145, + "learning_rate": 4.970285695548481e-06, + "loss": 0.625, + "step": 128 + }, + { + "epoch": 0.11, + "grad_norm": 2.0229587336296615, + "learning_rate": 4.969780950604132e-06, + "loss": 0.5989, + "step": 129 + }, + { + "epoch": 0.11, + "grad_norm": 2.0983599595244247, + "learning_rate": 4.969271980801808e-06, + "loss": 0.5747, + "step": 130 + }, + { + "epoch": 0.11, + "grad_norm": 2.1059041140010786, + "learning_rate": 4.9687587870121645e-06, + "loss": 0.5869, + "step": 131 + }, + { + "epoch": 0.11, + "grad_norm": 1.8967441614595046, + "learning_rate": 4.9682413701130815e-06, + "loss": 0.6272, + "step": 132 + }, + { + "epoch": 0.11, + "grad_norm": 1.9976164993621088, + "learning_rate": 4.967719730989663e-06, + "loss": 0.6282, + "step": 133 + }, + { + "epoch": 0.11, + "grad_norm": 1.8719131324952145, + "learning_rate": 4.967193870534235e-06, + "loss": 0.6052, + "step": 134 + }, + { + "epoch": 0.11, + "grad_norm": 2.071702997476533, + "learning_rate": 4.9666637896463455e-06, + "loss": 0.5785, + "step": 135 + }, + { + "epoch": 0.11, + "grad_norm": 1.9549455320048341, + "learning_rate": 4.966129489232762e-06, + "loss": 0.5739, + "step": 136 + }, + { + "epoch": 0.11, + "grad_norm": 2.0656898626759315, + "learning_rate": 4.9655909702074684e-06, + "loss": 0.6651, + "step": 137 + }, + { + "epoch": 0.11, + "grad_norm": 2.1185948604203038, + "learning_rate": 4.965048233491669e-06, + "loss": 0.5759, + "step": 138 + }, + { + "epoch": 0.12, + "grad_norm": 2.08566019272993, + "learning_rate": 4.964501280013777e-06, + "loss": 0.6271, + "step": 139 + }, + { + "epoch": 0.12, + "grad_norm": 2.117420903965419, + "learning_rate": 4.963950110709425e-06, + "loss": 0.5968, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 1.9784944143818486, + "learning_rate": 4.963394726521453e-06, + "loss": 0.6112, + "step": 141 + }, + { + "epoch": 0.12, + "grad_norm": 2.077292948039572, + "learning_rate": 4.9628351283999144e-06, + "loss": 0.5636, + "step": 142 + }, + { + "epoch": 0.12, + "grad_norm": 2.223803520245629, + "learning_rate": 4.962271317302068e-06, + "loss": 0.6658, + "step": 143 + }, + { + "epoch": 0.12, + "grad_norm": 2.039369072186367, + "learning_rate": 4.9617032941923796e-06, + "loss": 0.5853, + "step": 144 + }, + { + "epoch": 0.12, + "grad_norm": 2.071470113085907, + "learning_rate": 4.961131060042522e-06, + "loss": 0.601, + "step": 145 + }, + { + "epoch": 0.12, + "grad_norm": 2.437470272347474, + "learning_rate": 4.960554615831372e-06, + "loss": 0.6593, + "step": 146 + }, + { + "epoch": 0.12, + "grad_norm": 2.178684122927139, + "learning_rate": 4.959973962545005e-06, + "loss": 0.607, + "step": 147 + }, + { + "epoch": 0.12, + "grad_norm": 2.097006749956471, + "learning_rate": 4.9593891011767e-06, + "loss": 0.5873, + "step": 148 + }, + { + "epoch": 0.12, + "grad_norm": 1.9801202541822784, + "learning_rate": 4.958800032726931e-06, + "loss": 0.5877, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 2.30001951085656, + "learning_rate": 4.958206758203373e-06, + "loss": 0.6368, + "step": 150 + }, + { + "epoch": 0.13, + "grad_norm": 1.990094260131078, + "learning_rate": 4.957609278620891e-06, + "loss": 0.59, + "step": 151 + }, + { + "epoch": 0.13, + "grad_norm": 2.262163752076628, + "learning_rate": 4.957007595001548e-06, + "loss": 0.5779, + "step": 152 + }, + { + "epoch": 0.13, + "grad_norm": 2.1970152093220983, + "learning_rate": 4.956401708374595e-06, + "loss": 0.5894, + "step": 153 + }, + { + "epoch": 0.13, + "grad_norm": 2.220825872684071, + "learning_rate": 4.9557916197764745e-06, + "loss": 0.6528, + "step": 154 + }, + { + "epoch": 0.13, + "grad_norm": 2.099472677591387, + "learning_rate": 4.955177330250817e-06, + "loss": 0.5798, + "step": 155 + }, + { + "epoch": 0.13, + "grad_norm": 2.159203936881569, + "learning_rate": 4.954558840848437e-06, + "loss": 0.6206, + "step": 156 + }, + { + "epoch": 0.13, + "grad_norm": 2.185152414039555, + "learning_rate": 4.953936152627338e-06, + "loss": 0.5624, + "step": 157 + }, + { + "epoch": 0.13, + "grad_norm": 2.0679748168992624, + "learning_rate": 4.953309266652701e-06, + "loss": 0.5859, + "step": 158 + }, + { + "epoch": 0.13, + "grad_norm": 2.327237187255128, + "learning_rate": 4.952678183996891e-06, + "loss": 0.5632, + "step": 159 + }, + { + "epoch": 0.13, + "grad_norm": 2.2865519679977417, + "learning_rate": 4.952042905739451e-06, + "loss": 0.6965, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 2.523435408018699, + "learning_rate": 4.9514034329671e-06, + "loss": 0.6217, + "step": 161 + }, + { + "epoch": 0.13, + "grad_norm": 2.4992653226709636, + "learning_rate": 4.950759766773734e-06, + "loss": 0.6175, + "step": 162 + }, + { + "epoch": 0.14, + "grad_norm": 2.432752824777114, + "learning_rate": 4.950111908260423e-06, + "loss": 0.5862, + "step": 163 + }, + { + "epoch": 0.14, + "grad_norm": 2.137500912204061, + "learning_rate": 4.949459858535404e-06, + "loss": 0.6124, + "step": 164 + }, + { + "epoch": 0.14, + "grad_norm": 2.2226376224120474, + "learning_rate": 4.94880361871409e-06, + "loss": 0.5891, + "step": 165 + }, + { + "epoch": 0.14, + "grad_norm": 2.3821839805775165, + "learning_rate": 4.9481431899190544e-06, + "loss": 0.6008, + "step": 166 + }, + { + "epoch": 0.14, + "grad_norm": 2.306242834684614, + "learning_rate": 4.947478573280044e-06, + "loss": 0.6159, + "step": 167 + }, + { + "epoch": 0.14, + "grad_norm": 2.3298092236851518, + "learning_rate": 4.946809769933963e-06, + "loss": 0.5809, + "step": 168 + }, + { + "epoch": 0.14, + "grad_norm": 2.364296499621558, + "learning_rate": 4.946136781024883e-06, + "loss": 0.5895, + "step": 169 + }, + { + "epoch": 0.14, + "grad_norm": 2.237241095609228, + "learning_rate": 4.945459607704029e-06, + "loss": 0.6144, + "step": 170 + }, + { + "epoch": 0.14, + "grad_norm": 2.4027419761972264, + "learning_rate": 4.9447782511297905e-06, + "loss": 0.5985, + "step": 171 + }, + { + "epoch": 0.14, + "grad_norm": 2.1547059182244284, + "learning_rate": 4.944092712467709e-06, + "loss": 0.5763, + "step": 172 + }, + { + "epoch": 0.14, + "grad_norm": 2.1530221667047984, + "learning_rate": 4.9434029928904805e-06, + "loss": 0.5692, + "step": 173 + }, + { + "epoch": 0.14, + "grad_norm": 2.228588593294869, + "learning_rate": 4.942709093577954e-06, + "loss": 0.5896, + "step": 174 + }, + { + "epoch": 0.15, + "grad_norm": 2.1597295307130198, + "learning_rate": 4.942011015717129e-06, + "loss": 0.5864, + "step": 175 + }, + { + "epoch": 0.15, + "grad_norm": 2.321140955498194, + "learning_rate": 4.941308760502149e-06, + "loss": 0.6089, + "step": 176 + }, + { + "epoch": 0.15, + "grad_norm": 2.220124736460707, + "learning_rate": 4.940602329134309e-06, + "loss": 0.5786, + "step": 177 + }, + { + "epoch": 0.15, + "grad_norm": 2.1698038563080417, + "learning_rate": 4.939891722822043e-06, + "loss": 0.5749, + "step": 178 + }, + { + "epoch": 0.15, + "grad_norm": 2.244425969121411, + "learning_rate": 4.93917694278093e-06, + "loss": 0.5877, + "step": 179 + }, + { + "epoch": 0.15, + "grad_norm": 2.143920008069458, + "learning_rate": 4.938457990233687e-06, + "loss": 0.6024, + "step": 180 + }, + { + "epoch": 0.15, + "grad_norm": 2.1786040820345813, + "learning_rate": 4.937734866410169e-06, + "loss": 0.5845, + "step": 181 + }, + { + "epoch": 0.15, + "grad_norm": 2.301832824481007, + "learning_rate": 4.9370075725473665e-06, + "loss": 0.6182, + "step": 182 + }, + { + "epoch": 0.15, + "grad_norm": 2.3748033727083997, + "learning_rate": 4.936276109889403e-06, + "loss": 0.6073, + "step": 183 + }, + { + "epoch": 0.15, + "grad_norm": 2.476334487382023, + "learning_rate": 4.935540479687534e-06, + "loss": 0.5793, + "step": 184 + }, + { + "epoch": 0.15, + "grad_norm": 2.2509466352322494, + "learning_rate": 4.934800683200143e-06, + "loss": 0.6133, + "step": 185 + }, + { + "epoch": 0.15, + "grad_norm": 2.8391697547684873, + "learning_rate": 4.934056721692742e-06, + "loss": 0.5967, + "step": 186 + }, + { + "epoch": 0.16, + "grad_norm": 2.4492364225391765, + "learning_rate": 4.933308596437965e-06, + "loss": 0.5676, + "step": 187 + }, + { + "epoch": 0.16, + "grad_norm": 2.685548141821295, + "learning_rate": 4.932556308715573e-06, + "loss": 0.6069, + "step": 188 + }, + { + "epoch": 0.16, + "grad_norm": 2.261217637824808, + "learning_rate": 4.931799859812443e-06, + "loss": 0.6411, + "step": 189 + }, + { + "epoch": 0.16, + "grad_norm": 2.3838284395200966, + "learning_rate": 4.931039251022573e-06, + "loss": 0.5745, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 2.2550921344466164, + "learning_rate": 4.930274483647074e-06, + "loss": 0.5989, + "step": 191 + }, + { + "epoch": 0.16, + "grad_norm": 2.078406234527636, + "learning_rate": 4.929505558994175e-06, + "loss": 0.5998, + "step": 192 + }, + { + "epoch": 0.16, + "grad_norm": 2.592864566091496, + "learning_rate": 4.928732478379214e-06, + "loss": 0.5842, + "step": 193 + }, + { + "epoch": 0.16, + "grad_norm": 2.092752299259724, + "learning_rate": 4.927955243124638e-06, + "loss": 0.5789, + "step": 194 + }, + { + "epoch": 0.16, + "grad_norm": 2.3799311595696966, + "learning_rate": 4.927173854560002e-06, + "loss": 0.6265, + "step": 195 + }, + { + "epoch": 0.16, + "grad_norm": 2.246876688010602, + "learning_rate": 4.926388314021964e-06, + "loss": 0.6126, + "step": 196 + }, + { + "epoch": 0.16, + "grad_norm": 2.1409898276704578, + "learning_rate": 4.925598622854287e-06, + "loss": 0.6073, + "step": 197 + }, + { + "epoch": 0.16, + "grad_norm": 2.5946158421875385, + "learning_rate": 4.924804782407834e-06, + "loss": 0.6154, + "step": 198 + }, + { + "epoch": 0.16, + "grad_norm": 2.1225494320427982, + "learning_rate": 4.924006794040562e-06, + "loss": 0.583, + "step": 199 + }, + { + "epoch": 0.17, + "grad_norm": 2.1971323526291338, + "learning_rate": 4.923204659117528e-06, + "loss": 0.6078, + "step": 200 + }, + { + "epoch": 0.17, + "grad_norm": 2.289185506404785, + "learning_rate": 4.92239837901088e-06, + "loss": 0.6127, + "step": 201 + }, + { + "epoch": 0.17, + "grad_norm": 2.0071007751625354, + "learning_rate": 4.921587955099858e-06, + "loss": 0.5804, + "step": 202 + }, + { + "epoch": 0.17, + "grad_norm": 2.2981840149068247, + "learning_rate": 4.920773388770789e-06, + "loss": 0.6027, + "step": 203 + }, + { + "epoch": 0.17, + "grad_norm": 2.236179116886702, + "learning_rate": 4.919954681417087e-06, + "loss": 0.6179, + "step": 204 + }, + { + "epoch": 0.17, + "grad_norm": 2.007422589251611, + "learning_rate": 4.91913183443925e-06, + "loss": 0.5647, + "step": 205 + }, + { + "epoch": 0.17, + "grad_norm": 2.1402813555735483, + "learning_rate": 4.918304849244857e-06, + "loss": 0.5841, + "step": 206 + }, + { + "epoch": 0.17, + "grad_norm": 2.0456415785177104, + "learning_rate": 4.917473727248565e-06, + "loss": 0.5524, + "step": 207 + }, + { + "epoch": 0.17, + "grad_norm": 1.9673558126020942, + "learning_rate": 4.916638469872109e-06, + "loss": 0.5698, + "step": 208 + }, + { + "epoch": 0.17, + "grad_norm": 2.015111672496819, + "learning_rate": 4.9157990785442964e-06, + "loss": 0.5957, + "step": 209 + }, + { + "epoch": 0.17, + "grad_norm": 1.9502065547578398, + "learning_rate": 4.9149555547010086e-06, + "loss": 0.5592, + "step": 210 + }, + { + "epoch": 0.17, + "grad_norm": 2.167936522558899, + "learning_rate": 4.9141078997851945e-06, + "loss": 0.5705, + "step": 211 + }, + { + "epoch": 0.18, + "grad_norm": 2.2066587458997935, + "learning_rate": 4.91325611524687e-06, + "loss": 0.5526, + "step": 212 + }, + { + "epoch": 0.18, + "grad_norm": 1.9132995625903553, + "learning_rate": 4.9124002025431136e-06, + "loss": 0.5767, + "step": 213 + }, + { + "epoch": 0.18, + "grad_norm": 2.0097281107801277, + "learning_rate": 4.91154016313807e-06, + "loss": 0.6185, + "step": 214 + }, + { + "epoch": 0.18, + "grad_norm": 2.023532008241332, + "learning_rate": 4.910675998502938e-06, + "loss": 0.6005, + "step": 215 + }, + { + "epoch": 0.18, + "grad_norm": 1.9253831001776973, + "learning_rate": 4.909807710115977e-06, + "loss": 0.5769, + "step": 216 + }, + { + "epoch": 0.18, + "grad_norm": 2.066862408842564, + "learning_rate": 4.908935299462497e-06, + "loss": 0.5671, + "step": 217 + }, + { + "epoch": 0.18, + "grad_norm": 1.9412704290792853, + "learning_rate": 4.908058768034862e-06, + "loss": 0.5568, + "step": 218 + }, + { + "epoch": 0.18, + "grad_norm": 2.185994457097553, + "learning_rate": 4.907178117332487e-06, + "loss": 0.5621, + "step": 219 + }, + { + "epoch": 0.18, + "grad_norm": 2.021517127546353, + "learning_rate": 4.906293348861829e-06, + "loss": 0.5672, + "step": 220 + }, + { + "epoch": 0.18, + "grad_norm": 2.099703967072734, + "learning_rate": 4.905404464136391e-06, + "loss": 0.5366, + "step": 221 + }, + { + "epoch": 0.18, + "grad_norm": 2.030197056583618, + "learning_rate": 4.904511464676718e-06, + "loss": 0.6064, + "step": 222 + }, + { + "epoch": 0.18, + "grad_norm": 2.4170102988954896, + "learning_rate": 4.903614352010393e-06, + "loss": 0.5919, + "step": 223 + }, + { + "epoch": 0.19, + "grad_norm": 2.0819468873015476, + "learning_rate": 4.9027131276720355e-06, + "loss": 0.5366, + "step": 224 + }, + { + "epoch": 0.19, + "grad_norm": 2.148008018153629, + "learning_rate": 4.901807793203299e-06, + "loss": 0.597, + "step": 225 + }, + { + "epoch": 0.19, + "grad_norm": 2.0303725862017186, + "learning_rate": 4.900898350152866e-06, + "loss": 0.6394, + "step": 226 + }, + { + "epoch": 0.19, + "grad_norm": 2.1598989214704334, + "learning_rate": 4.899984800076449e-06, + "loss": 0.5932, + "step": 227 + }, + { + "epoch": 0.19, + "grad_norm": 2.0816312637185255, + "learning_rate": 4.899067144536786e-06, + "loss": 0.5909, + "step": 228 + }, + { + "epoch": 0.19, + "grad_norm": 1.9024067197329315, + "learning_rate": 4.8981453851036365e-06, + "loss": 0.5463, + "step": 229 + }, + { + "epoch": 0.19, + "grad_norm": 2.1830926868871043, + "learning_rate": 4.897219523353781e-06, + "loss": 0.5821, + "step": 230 + }, + { + "epoch": 0.19, + "grad_norm": 2.1156269612794016, + "learning_rate": 4.8962895608710195e-06, + "loss": 0.5993, + "step": 231 + }, + { + "epoch": 0.19, + "grad_norm": 1.9653407654210864, + "learning_rate": 4.895355499246162e-06, + "loss": 0.5525, + "step": 232 + }, + { + "epoch": 0.19, + "grad_norm": 2.367769051061897, + "learning_rate": 4.894417340077036e-06, + "loss": 0.5683, + "step": 233 + }, + { + "epoch": 0.19, + "grad_norm": 2.078327064466567, + "learning_rate": 4.893475084968474e-06, + "loss": 0.6184, + "step": 234 + }, + { + "epoch": 0.19, + "grad_norm": 2.1661882731589475, + "learning_rate": 4.8925287355323195e-06, + "loss": 0.6321, + "step": 235 + }, + { + "epoch": 0.2, + "grad_norm": 2.182760952002799, + "learning_rate": 4.891578293387413e-06, + "loss": 0.6254, + "step": 236 + }, + { + "epoch": 0.2, + "grad_norm": 1.998723579962691, + "learning_rate": 4.890623760159605e-06, + "loss": 0.5371, + "step": 237 + }, + { + "epoch": 0.2, + "grad_norm": 2.319922346931926, + "learning_rate": 4.8896651374817365e-06, + "loss": 0.5941, + "step": 238 + }, + { + "epoch": 0.2, + "grad_norm": 2.090735197217999, + "learning_rate": 4.888702426993648e-06, + "loss": 0.577, + "step": 239 + }, + { + "epoch": 0.2, + "grad_norm": 2.1247199987228558, + "learning_rate": 4.887735630342173e-06, + "loss": 0.5928, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 2.33151114429804, + "learning_rate": 4.8867647491811315e-06, + "loss": 0.5838, + "step": 241 + }, + { + "epoch": 0.2, + "grad_norm": 2.1570026356289147, + "learning_rate": 4.885789785171334e-06, + "loss": 0.5642, + "step": 242 + }, + { + "epoch": 0.2, + "grad_norm": 2.049571197047368, + "learning_rate": 4.884810739980575e-06, + "loss": 0.6684, + "step": 243 + }, + { + "epoch": 0.2, + "grad_norm": 1.9810062424466381, + "learning_rate": 4.883827615283626e-06, + "loss": 0.5942, + "step": 244 + }, + { + "epoch": 0.2, + "grad_norm": 2.145869663660159, + "learning_rate": 4.882840412762244e-06, + "loss": 0.6356, + "step": 245 + }, + { + "epoch": 0.2, + "grad_norm": 2.19290302186514, + "learning_rate": 4.881849134105156e-06, + "loss": 0.6189, + "step": 246 + }, + { + "epoch": 0.2, + "grad_norm": 2.0561043419872984, + "learning_rate": 4.880853781008062e-06, + "loss": 0.5563, + "step": 247 + }, + { + "epoch": 0.21, + "grad_norm": 1.8831183793224635, + "learning_rate": 4.879854355173638e-06, + "loss": 0.5522, + "step": 248 + }, + { + "epoch": 0.21, + "grad_norm": 2.020981606684741, + "learning_rate": 4.878850858311518e-06, + "loss": 0.5548, + "step": 249 + }, + { + "epoch": 0.21, + "grad_norm": 2.060242570493272, + "learning_rate": 4.877843292138307e-06, + "loss": 0.5715, + "step": 250 + }, + { + "epoch": 0.21, + "grad_norm": 2.082455778933014, + "learning_rate": 4.8768316583775665e-06, + "loss": 0.5959, + "step": 251 + }, + { + "epoch": 0.21, + "grad_norm": 1.9830929719438626, + "learning_rate": 4.875815958759819e-06, + "loss": 0.5813, + "step": 252 + }, + { + "epoch": 0.21, + "grad_norm": 1.9772267506828567, + "learning_rate": 4.8747961950225406e-06, + "loss": 0.539, + "step": 253 + }, + { + "epoch": 0.21, + "grad_norm": 2.1492561995002104, + "learning_rate": 4.873772368910161e-06, + "loss": 0.6059, + "step": 254 + }, + { + "epoch": 0.21, + "grad_norm": 2.253757247139787, + "learning_rate": 4.872744482174058e-06, + "loss": 0.5897, + "step": 255 + }, + { + "epoch": 0.21, + "grad_norm": 2.3282624851882496, + "learning_rate": 4.8717125365725545e-06, + "loss": 0.5675, + "step": 256 + }, + { + "epoch": 0.21, + "grad_norm": 2.15573581133063, + "learning_rate": 4.8706765338709185e-06, + "loss": 0.5958, + "step": 257 + }, + { + "epoch": 0.21, + "grad_norm": 2.073289220218241, + "learning_rate": 4.869636475841358e-06, + "loss": 0.6052, + "step": 258 + }, + { + "epoch": 0.21, + "grad_norm": 2.293714090249444, + "learning_rate": 4.8685923642630165e-06, + "loss": 0.5786, + "step": 259 + }, + { + "epoch": 0.22, + "grad_norm": 1.9496544276539172, + "learning_rate": 4.867544200921974e-06, + "loss": 0.6163, + "step": 260 + }, + { + "epoch": 0.22, + "grad_norm": 2.5267016753690132, + "learning_rate": 4.866491987611239e-06, + "loss": 0.6223, + "step": 261 + }, + { + "epoch": 0.22, + "grad_norm": 1.8731249445320794, + "learning_rate": 4.865435726130751e-06, + "loss": 0.5632, + "step": 262 + }, + { + "epoch": 0.22, + "grad_norm": 2.3586331105798863, + "learning_rate": 4.86437541828737e-06, + "loss": 0.5769, + "step": 263 + }, + { + "epoch": 0.22, + "grad_norm": 2.0258106914510585, + "learning_rate": 4.863311065894883e-06, + "loss": 0.6103, + "step": 264 + }, + { + "epoch": 0.22, + "grad_norm": 2.2543614390885955, + "learning_rate": 4.862242670773991e-06, + "loss": 0.5844, + "step": 265 + }, + { + "epoch": 0.22, + "grad_norm": 1.9440299381244668, + "learning_rate": 4.861170234752314e-06, + "loss": 0.5559, + "step": 266 + }, + { + "epoch": 0.22, + "grad_norm": 2.254538268495492, + "learning_rate": 4.8600937596643815e-06, + "loss": 0.5709, + "step": 267 + }, + { + "epoch": 0.22, + "grad_norm": 2.007651746385687, + "learning_rate": 4.8590132473516346e-06, + "loss": 0.573, + "step": 268 + }, + { + "epoch": 0.22, + "grad_norm": 2.0735253118288837, + "learning_rate": 4.857928699662421e-06, + "loss": 0.5954, + "step": 269 + }, + { + "epoch": 0.22, + "grad_norm": 2.024775417101569, + "learning_rate": 4.856840118451989e-06, + "loss": 0.5992, + "step": 270 + }, + { + "epoch": 0.22, + "grad_norm": 2.1043310699945814, + "learning_rate": 4.855747505582488e-06, + "loss": 0.6507, + "step": 271 + }, + { + "epoch": 0.23, + "grad_norm": 2.0386353328313214, + "learning_rate": 4.854650862922965e-06, + "loss": 0.5666, + "step": 272 + }, + { + "epoch": 0.23, + "grad_norm": 1.978698841367705, + "learning_rate": 4.853550192349358e-06, + "loss": 0.5593, + "step": 273 + }, + { + "epoch": 0.23, + "grad_norm": 1.9386534247633986, + "learning_rate": 4.852445495744497e-06, + "loss": 0.5735, + "step": 274 + }, + { + "epoch": 0.23, + "grad_norm": 2.049346245018599, + "learning_rate": 4.8513367749981e-06, + "loss": 0.5415, + "step": 275 + }, + { + "epoch": 0.23, + "grad_norm": 2.1051969521216605, + "learning_rate": 4.850224032006765e-06, + "loss": 0.5532, + "step": 276 + }, + { + "epoch": 0.23, + "grad_norm": 2.2006792558872315, + "learning_rate": 4.849107268673975e-06, + "loss": 0.5696, + "step": 277 + }, + { + "epoch": 0.23, + "grad_norm": 2.0460787736353647, + "learning_rate": 4.847986486910088e-06, + "loss": 0.5658, + "step": 278 + }, + { + "epoch": 0.23, + "grad_norm": 2.1161843259225406, + "learning_rate": 4.846861688632336e-06, + "loss": 0.583, + "step": 279 + }, + { + "epoch": 0.23, + "grad_norm": 1.8882198480393542, + "learning_rate": 4.8457328757648224e-06, + "loss": 0.5693, + "step": 280 + }, + { + "epoch": 0.23, + "grad_norm": 2.1578413701109596, + "learning_rate": 4.844600050238517e-06, + "loss": 0.5409, + "step": 281 + }, + { + "epoch": 0.23, + "grad_norm": 2.03912467778954, + "learning_rate": 4.843463213991255e-06, + "loss": 0.5908, + "step": 282 + }, + { + "epoch": 0.23, + "grad_norm": 2.2333462480826247, + "learning_rate": 4.842322368967731e-06, + "loss": 0.6088, + "step": 283 + }, + { + "epoch": 0.24, + "grad_norm": 2.06698702157327, + "learning_rate": 4.8411775171194986e-06, + "loss": 0.5953, + "step": 284 + }, + { + "epoch": 0.24, + "grad_norm": 2.1433923121572045, + "learning_rate": 4.840028660404964e-06, + "loss": 0.5851, + "step": 285 + }, + { + "epoch": 0.24, + "grad_norm": 2.214858780835041, + "learning_rate": 4.838875800789386e-06, + "loss": 0.5913, + "step": 286 + }, + { + "epoch": 0.24, + "grad_norm": 2.038128612492624, + "learning_rate": 4.837718940244871e-06, + "loss": 0.5827, + "step": 287 + }, + { + "epoch": 0.24, + "grad_norm": 1.9894065096959768, + "learning_rate": 4.836558080750365e-06, + "loss": 0.5769, + "step": 288 + }, + { + "epoch": 0.24, + "grad_norm": 2.1711590153285822, + "learning_rate": 4.835393224291662e-06, + "loss": 0.654, + "step": 289 + }, + { + "epoch": 0.24, + "grad_norm": 2.105004451988696, + "learning_rate": 4.834224372861386e-06, + "loss": 0.6158, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 1.9554568023729102, + "learning_rate": 4.833051528459001e-06, + "loss": 0.5807, + "step": 291 + }, + { + "epoch": 0.24, + "grad_norm": 2.2693917834500312, + "learning_rate": 4.831874693090797e-06, + "loss": 0.5557, + "step": 292 + }, + { + "epoch": 0.24, + "grad_norm": 1.9081391627126192, + "learning_rate": 4.830693868769892e-06, + "loss": 0.6057, + "step": 293 + }, + { + "epoch": 0.24, + "grad_norm": 2.2133664110768585, + "learning_rate": 4.82950905751623e-06, + "loss": 0.6103, + "step": 294 + }, + { + "epoch": 0.24, + "grad_norm": 2.015392814211589, + "learning_rate": 4.8283202613565735e-06, + "loss": 0.5578, + "step": 295 + }, + { + "epoch": 0.25, + "grad_norm": 2.142124020349717, + "learning_rate": 4.8271274823245e-06, + "loss": 0.5675, + "step": 296 + }, + { + "epoch": 0.25, + "grad_norm": 1.981611826462286, + "learning_rate": 4.825930722460405e-06, + "loss": 0.5696, + "step": 297 + }, + { + "epoch": 0.25, + "grad_norm": 1.966759748348117, + "learning_rate": 4.824729983811486e-06, + "loss": 0.58, + "step": 298 + }, + { + "epoch": 0.25, + "grad_norm": 2.0117040369769397, + "learning_rate": 4.823525268431754e-06, + "loss": 0.6005, + "step": 299 + }, + { + "epoch": 0.25, + "grad_norm": 1.9579664917991193, + "learning_rate": 4.822316578382019e-06, + "loss": 0.5472, + "step": 300 + }, + { + "epoch": 0.25, + "grad_norm": 1.9075723479635032, + "learning_rate": 4.821103915729892e-06, + "loss": 0.5834, + "step": 301 + }, + { + "epoch": 0.25, + "grad_norm": 2.289340229011896, + "learning_rate": 4.819887282549777e-06, + "loss": 0.6088, + "step": 302 + }, + { + "epoch": 0.25, + "grad_norm": 2.0410700553735235, + "learning_rate": 4.818666680922874e-06, + "loss": 0.5449, + "step": 303 + }, + { + "epoch": 0.25, + "grad_norm": 2.074434792511819, + "learning_rate": 4.8174421129371675e-06, + "loss": 0.5826, + "step": 304 + }, + { + "epoch": 0.25, + "grad_norm": 2.1377170527698865, + "learning_rate": 4.816213580687428e-06, + "loss": 0.6262, + "step": 305 + }, + { + "epoch": 0.25, + "grad_norm": 2.060340839248083, + "learning_rate": 4.814981086275209e-06, + "loss": 0.5479, + "step": 306 + }, + { + "epoch": 0.25, + "grad_norm": 2.007036467413588, + "learning_rate": 4.813744631808841e-06, + "loss": 0.5642, + "step": 307 + }, + { + "epoch": 0.26, + "grad_norm": 2.016779606220332, + "learning_rate": 4.8125042194034285e-06, + "loss": 0.5503, + "step": 308 + }, + { + "epoch": 0.26, + "grad_norm": 1.930004252757651, + "learning_rate": 4.811259851180845e-06, + "loss": 0.582, + "step": 309 + }, + { + "epoch": 0.26, + "grad_norm": 1.9179477992752856, + "learning_rate": 4.810011529269734e-06, + "loss": 0.5678, + "step": 310 + }, + { + "epoch": 0.26, + "grad_norm": 2.023430757276848, + "learning_rate": 4.808759255805498e-06, + "loss": 0.614, + "step": 311 + }, + { + "epoch": 0.26, + "grad_norm": 1.8334738409404936, + "learning_rate": 4.807503032930306e-06, + "loss": 0.5742, + "step": 312 + }, + { + "epoch": 0.26, + "grad_norm": 1.937332706274502, + "learning_rate": 4.806242862793075e-06, + "loss": 0.6257, + "step": 313 + }, + { + "epoch": 0.26, + "grad_norm": 2.0265383045700363, + "learning_rate": 4.8049787475494786e-06, + "loss": 0.5733, + "step": 314 + }, + { + "epoch": 0.26, + "grad_norm": 2.056444039073761, + "learning_rate": 4.803710689361939e-06, + "loss": 0.578, + "step": 315 + }, + { + "epoch": 0.26, + "grad_norm": 2.411132719183335, + "learning_rate": 4.802438690399622e-06, + "loss": 0.5778, + "step": 316 + }, + { + "epoch": 0.26, + "grad_norm": 2.0233969242222853, + "learning_rate": 4.801162752838436e-06, + "loss": 0.5649, + "step": 317 + }, + { + "epoch": 0.26, + "grad_norm": 2.2809121915132815, + "learning_rate": 4.799882878861025e-06, + "loss": 0.5589, + "step": 318 + }, + { + "epoch": 0.26, + "grad_norm": 1.9806834041020271, + "learning_rate": 4.798599070656768e-06, + "loss": 0.5753, + "step": 319 + }, + { + "epoch": 0.27, + "grad_norm": 2.095099671577702, + "learning_rate": 4.797311330421773e-06, + "loss": 0.5644, + "step": 320 + }, + { + "epoch": 0.27, + "grad_norm": 2.1697606190375764, + "learning_rate": 4.796019660358877e-06, + "loss": 0.6009, + "step": 321 + }, + { + "epoch": 0.27, + "grad_norm": 1.9549416103216173, + "learning_rate": 4.794724062677635e-06, + "loss": 0.5429, + "step": 322 + }, + { + "epoch": 0.27, + "grad_norm": 1.9986949357292838, + "learning_rate": 4.793424539594323e-06, + "loss": 0.5456, + "step": 323 + }, + { + "epoch": 0.27, + "grad_norm": 1.9414831957796765, + "learning_rate": 4.792121093331935e-06, + "loss": 0.5468, + "step": 324 + }, + { + "epoch": 0.27, + "grad_norm": 2.100702188933012, + "learning_rate": 4.7908137261201685e-06, + "loss": 0.5763, + "step": 325 + }, + { + "epoch": 0.27, + "grad_norm": 2.2747471285831025, + "learning_rate": 4.789502440195436e-06, + "loss": 0.5637, + "step": 326 + }, + { + "epoch": 0.27, + "grad_norm": 1.8996382919319124, + "learning_rate": 4.788187237800849e-06, + "loss": 0.5285, + "step": 327 + }, + { + "epoch": 0.27, + "grad_norm": 2.3451495174978847, + "learning_rate": 4.786868121186218e-06, + "loss": 0.5638, + "step": 328 + }, + { + "epoch": 0.27, + "grad_norm": 2.0437536068229565, + "learning_rate": 4.7855450926080535e-06, + "loss": 0.5282, + "step": 329 + }, + { + "epoch": 0.27, + "grad_norm": 2.1185488514745554, + "learning_rate": 4.784218154329555e-06, + "loss": 0.5689, + "step": 330 + }, + { + "epoch": 0.27, + "grad_norm": 2.08745956731504, + "learning_rate": 4.78288730862061e-06, + "loss": 0.5772, + "step": 331 + }, + { + "epoch": 0.28, + "grad_norm": 1.9479507156354359, + "learning_rate": 4.781552557757789e-06, + "loss": 0.5419, + "step": 332 + }, + { + "epoch": 0.28, + "grad_norm": 2.0211480847937255, + "learning_rate": 4.780213904024346e-06, + "loss": 0.5757, + "step": 333 + }, + { + "epoch": 0.28, + "grad_norm": 1.9075335749936069, + "learning_rate": 4.7788713497102094e-06, + "loss": 0.5693, + "step": 334 + }, + { + "epoch": 0.28, + "grad_norm": 1.9590727137410602, + "learning_rate": 4.777524897111979e-06, + "loss": 0.5501, + "step": 335 + }, + { + "epoch": 0.28, + "grad_norm": 2.0328480247612752, + "learning_rate": 4.776174548532926e-06, + "loss": 0.587, + "step": 336 + }, + { + "epoch": 0.28, + "grad_norm": 2.062540517496736, + "learning_rate": 4.774820306282982e-06, + "loss": 0.5819, + "step": 337 + }, + { + "epoch": 0.28, + "grad_norm": 2.0054452800156195, + "learning_rate": 4.773462172678744e-06, + "loss": 0.5529, + "step": 338 + }, + { + "epoch": 0.28, + "grad_norm": 1.9641125644599562, + "learning_rate": 4.772100150043462e-06, + "loss": 0.5895, + "step": 339 + }, + { + "epoch": 0.28, + "grad_norm": 1.9196744569285298, + "learning_rate": 4.77073424070704e-06, + "loss": 0.5504, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 2.0002752186146484, + "learning_rate": 4.76936444700603e-06, + "loss": 0.5307, + "step": 341 + }, + { + "epoch": 0.28, + "grad_norm": 2.1068919823054344, + "learning_rate": 4.76799077128363e-06, + "loss": 0.5908, + "step": 342 + }, + { + "epoch": 0.28, + "grad_norm": 1.919597745459612, + "learning_rate": 4.766613215889678e-06, + "loss": 0.5423, + "step": 343 + }, + { + "epoch": 0.29, + "grad_norm": 2.0670928578728716, + "learning_rate": 4.765231783180648e-06, + "loss": 0.5901, + "step": 344 + }, + { + "epoch": 0.29, + "grad_norm": 1.906116148793229, + "learning_rate": 4.763846475519648e-06, + "loss": 0.5919, + "step": 345 + }, + { + "epoch": 0.29, + "grad_norm": 1.9133575268702454, + "learning_rate": 4.762457295276413e-06, + "loss": 0.585, + "step": 346 + }, + { + "epoch": 0.29, + "grad_norm": 2.133902651855379, + "learning_rate": 4.7610642448273025e-06, + "loss": 0.5444, + "step": 347 + }, + { + "epoch": 0.29, + "grad_norm": 1.95222194640397, + "learning_rate": 4.7596673265552985e-06, + "loss": 0.5941, + "step": 348 + }, + { + "epoch": 0.29, + "grad_norm": 2.095010268380277, + "learning_rate": 4.758266542849997e-06, + "loss": 0.6045, + "step": 349 + }, + { + "epoch": 0.29, + "grad_norm": 2.0493864712059655, + "learning_rate": 4.756861896107609e-06, + "loss": 0.6011, + "step": 350 + }, + { + "epoch": 0.29, + "grad_norm": 1.9222198823064967, + "learning_rate": 4.755453388730949e-06, + "loss": 0.5521, + "step": 351 + }, + { + "epoch": 0.29, + "grad_norm": 2.368147154955994, + "learning_rate": 4.754041023129442e-06, + "loss": 0.6117, + "step": 352 + }, + { + "epoch": 0.29, + "grad_norm": 1.9734596786106697, + "learning_rate": 4.752624801719108e-06, + "loss": 0.5727, + "step": 353 + }, + { + "epoch": 0.29, + "grad_norm": 2.151510566977991, + "learning_rate": 4.751204726922564e-06, + "loss": 0.6085, + "step": 354 + }, + { + "epoch": 0.29, + "grad_norm": 1.9291219072892685, + "learning_rate": 4.74978080116902e-06, + "loss": 0.5655, + "step": 355 + }, + { + "epoch": 0.3, + "grad_norm": 1.838592559018919, + "learning_rate": 4.748353026894273e-06, + "loss": 0.5508, + "step": 356 + }, + { + "epoch": 0.3, + "grad_norm": 2.069156589116884, + "learning_rate": 4.7469214065407e-06, + "loss": 0.5942, + "step": 357 + }, + { + "epoch": 0.3, + "grad_norm": 1.8960817746615841, + "learning_rate": 4.745485942557264e-06, + "loss": 0.5902, + "step": 358 + }, + { + "epoch": 0.3, + "grad_norm": 2.0606557307859634, + "learning_rate": 4.744046637399497e-06, + "loss": 0.556, + "step": 359 + }, + { + "epoch": 0.3, + "grad_norm": 1.9660065879130573, + "learning_rate": 4.742603493529505e-06, + "loss": 0.5364, + "step": 360 + }, + { + "epoch": 0.3, + "grad_norm": 1.9647921383638112, + "learning_rate": 4.741156513415958e-06, + "loss": 0.5601, + "step": 361 + }, + { + "epoch": 0.3, + "grad_norm": 2.049074688423064, + "learning_rate": 4.739705699534092e-06, + "loss": 0.556, + "step": 362 + }, + { + "epoch": 0.3, + "grad_norm": 1.962593945802751, + "learning_rate": 4.738251054365697e-06, + "loss": 0.5609, + "step": 363 + }, + { + "epoch": 0.3, + "grad_norm": 2.059675349950347, + "learning_rate": 4.736792580399119e-06, + "loss": 0.5499, + "step": 364 + }, + { + "epoch": 0.3, + "grad_norm": 1.8479566025134508, + "learning_rate": 4.7353302801292555e-06, + "loss": 0.5621, + "step": 365 + }, + { + "epoch": 0.3, + "grad_norm": 1.9405450724813613, + "learning_rate": 4.733864156057545e-06, + "loss": 0.5437, + "step": 366 + }, + { + "epoch": 0.3, + "grad_norm": 2.122487864033456, + "learning_rate": 4.7323942106919715e-06, + "loss": 0.5984, + "step": 367 + }, + { + "epoch": 0.31, + "grad_norm": 2.6822841144123046, + "learning_rate": 4.730920446547052e-06, + "loss": 0.5951, + "step": 368 + }, + { + "epoch": 0.31, + "grad_norm": 2.001405394086718, + "learning_rate": 4.729442866143838e-06, + "loss": 0.5552, + "step": 369 + }, + { + "epoch": 0.31, + "grad_norm": 2.081154186949651, + "learning_rate": 4.72796147200991e-06, + "loss": 0.587, + "step": 370 + }, + { + "epoch": 0.31, + "grad_norm": 2.1196544292473236, + "learning_rate": 4.72647626667937e-06, + "loss": 0.5882, + "step": 371 + }, + { + "epoch": 0.31, + "grad_norm": 2.107445583509131, + "learning_rate": 4.724987252692841e-06, + "loss": 0.5389, + "step": 372 + }, + { + "epoch": 0.31, + "grad_norm": 1.9529785007256542, + "learning_rate": 4.723494432597462e-06, + "loss": 0.6439, + "step": 373 + }, + { + "epoch": 0.31, + "grad_norm": 2.11513441515607, + "learning_rate": 4.72199780894688e-06, + "loss": 0.6089, + "step": 374 + }, + { + "epoch": 0.31, + "grad_norm": 1.9769899713721226, + "learning_rate": 4.7204973843012504e-06, + "loss": 0.5393, + "step": 375 + }, + { + "epoch": 0.31, + "grad_norm": 2.063749623036316, + "learning_rate": 4.718993161227231e-06, + "loss": 0.5987, + "step": 376 + }, + { + "epoch": 0.31, + "grad_norm": 2.0515862288253883, + "learning_rate": 4.717485142297977e-06, + "loss": 0.5772, + "step": 377 + }, + { + "epoch": 0.31, + "grad_norm": 1.8962297741946081, + "learning_rate": 4.715973330093135e-06, + "loss": 0.5424, + "step": 378 + }, + { + "epoch": 0.31, + "grad_norm": 2.2210958340400087, + "learning_rate": 4.7144577271988435e-06, + "loss": 0.6072, + "step": 379 + }, + { + "epoch": 0.32, + "grad_norm": 2.067113337475314, + "learning_rate": 4.712938336207724e-06, + "loss": 0.5482, + "step": 380 + }, + { + "epoch": 0.32, + "grad_norm": 1.8985489253954526, + "learning_rate": 4.711415159718876e-06, + "loss": 0.5593, + "step": 381 + }, + { + "epoch": 0.32, + "grad_norm": 2.085236381118245, + "learning_rate": 4.709888200337879e-06, + "loss": 0.5704, + "step": 382 + }, + { + "epoch": 0.32, + "grad_norm": 2.0967664183909784, + "learning_rate": 4.708357460676779e-06, + "loss": 0.5997, + "step": 383 + }, + { + "epoch": 0.32, + "grad_norm": 2.0454278026009645, + "learning_rate": 4.706822943354092e-06, + "loss": 0.5669, + "step": 384 + }, + { + "epoch": 0.32, + "grad_norm": 1.9171673309342674, + "learning_rate": 4.705284650994793e-06, + "loss": 0.517, + "step": 385 + }, + { + "epoch": 0.32, + "grad_norm": 2.2003223432761287, + "learning_rate": 4.70374258623032e-06, + "loss": 0.5957, + "step": 386 + }, + { + "epoch": 0.32, + "grad_norm": 1.936392519491186, + "learning_rate": 4.702196751698557e-06, + "loss": 0.5767, + "step": 387 + }, + { + "epoch": 0.32, + "grad_norm": 2.354272003403086, + "learning_rate": 4.700647150043841e-06, + "loss": 0.6515, + "step": 388 + }, + { + "epoch": 0.32, + "grad_norm": 1.9115059027323418, + "learning_rate": 4.699093783916955e-06, + "loss": 0.5579, + "step": 389 + }, + { + "epoch": 0.32, + "grad_norm": 1.9878827587010002, + "learning_rate": 4.697536655975115e-06, + "loss": 0.572, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 1.9729552535473858, + "learning_rate": 4.69597576888198e-06, + "loss": 0.5665, + "step": 391 + }, + { + "epoch": 0.32, + "grad_norm": 2.177634366499155, + "learning_rate": 4.694411125307632e-06, + "loss": 0.6363, + "step": 392 + }, + { + "epoch": 0.33, + "grad_norm": 1.8955146664976508, + "learning_rate": 4.692842727928584e-06, + "loss": 0.5682, + "step": 393 + }, + { + "epoch": 0.33, + "grad_norm": 2.175305874476245, + "learning_rate": 4.691270579427769e-06, + "loss": 0.5943, + "step": 394 + }, + { + "epoch": 0.33, + "grad_norm": 2.068140527232831, + "learning_rate": 4.689694682494537e-06, + "loss": 0.5659, + "step": 395 + }, + { + "epoch": 0.33, + "grad_norm": 1.9112960694448755, + "learning_rate": 4.688115039824648e-06, + "loss": 0.6048, + "step": 396 + }, + { + "epoch": 0.33, + "grad_norm": 1.9778305624626604, + "learning_rate": 4.686531654120272e-06, + "loss": 0.5695, + "step": 397 + }, + { + "epoch": 0.33, + "grad_norm": 2.096904163204813, + "learning_rate": 4.684944528089981e-06, + "loss": 0.6113, + "step": 398 + }, + { + "epoch": 0.33, + "grad_norm": 2.0011934144948516, + "learning_rate": 4.683353664448745e-06, + "loss": 0.5568, + "step": 399 + }, + { + "epoch": 0.33, + "grad_norm": 1.8562851971757464, + "learning_rate": 4.681759065917929e-06, + "loss": 0.5474, + "step": 400 + }, + { + "epoch": 0.33, + "grad_norm": 1.8190547574166316, + "learning_rate": 4.680160735225285e-06, + "loss": 0.5315, + "step": 401 + }, + { + "epoch": 0.33, + "grad_norm": 1.9247862956929132, + "learning_rate": 4.6785586751049505e-06, + "loss": 0.5568, + "step": 402 + }, + { + "epoch": 0.33, + "grad_norm": 1.8469793674077621, + "learning_rate": 4.676952888297442e-06, + "loss": 0.5811, + "step": 403 + }, + { + "epoch": 0.33, + "grad_norm": 1.946943145198674, + "learning_rate": 4.675343377549653e-06, + "loss": 0.5475, + "step": 404 + }, + { + "epoch": 0.34, + "grad_norm": 1.991304422730463, + "learning_rate": 4.6737301456148445e-06, + "loss": 0.5856, + "step": 405 + }, + { + "epoch": 0.34, + "grad_norm": 1.9168241989446437, + "learning_rate": 4.672113195252644e-06, + "loss": 0.6069, + "step": 406 + }, + { + "epoch": 0.34, + "grad_norm": 1.9305433665377905, + "learning_rate": 4.670492529229039e-06, + "loss": 0.5536, + "step": 407 + }, + { + "epoch": 0.34, + "grad_norm": 1.8441008898830742, + "learning_rate": 4.668868150316377e-06, + "loss": 0.5859, + "step": 408 + }, + { + "epoch": 0.34, + "grad_norm": 1.8879301596961315, + "learning_rate": 4.667240061293351e-06, + "loss": 0.5483, + "step": 409 + }, + { + "epoch": 0.34, + "grad_norm": 2.024767417636281, + "learning_rate": 4.665608264945004e-06, + "loss": 0.5414, + "step": 410 + }, + { + "epoch": 0.34, + "grad_norm": 2.1331610141797395, + "learning_rate": 4.663972764062722e-06, + "loss": 0.5811, + "step": 411 + }, + { + "epoch": 0.34, + "grad_norm": 1.8132480265817386, + "learning_rate": 4.662333561444226e-06, + "loss": 0.5573, + "step": 412 + }, + { + "epoch": 0.34, + "grad_norm": 1.9795813972027145, + "learning_rate": 4.6606906598935675e-06, + "loss": 0.5814, + "step": 413 + }, + { + "epoch": 0.34, + "grad_norm": 1.8782931074297053, + "learning_rate": 4.6590440622211295e-06, + "loss": 0.569, + "step": 414 + }, + { + "epoch": 0.34, + "grad_norm": 1.8219945335518706, + "learning_rate": 4.657393771243614e-06, + "loss": 0.5669, + "step": 415 + }, + { + "epoch": 0.34, + "grad_norm": 2.4047268604371306, + "learning_rate": 4.6557397897840454e-06, + "loss": 0.5602, + "step": 416 + }, + { + "epoch": 0.35, + "grad_norm": 2.064501780523946, + "learning_rate": 4.654082120671757e-06, + "loss": 0.5699, + "step": 417 + }, + { + "epoch": 0.35, + "grad_norm": 1.9183128854940252, + "learning_rate": 4.65242076674239e-06, + "loss": 0.6112, + "step": 418 + }, + { + "epoch": 0.35, + "grad_norm": 1.9315698971629633, + "learning_rate": 4.650755730837894e-06, + "loss": 0.5537, + "step": 419 + }, + { + "epoch": 0.35, + "grad_norm": 1.9527809333659218, + "learning_rate": 4.649087015806509e-06, + "loss": 0.5423, + "step": 420 + }, + { + "epoch": 0.35, + "grad_norm": 1.8940523915995442, + "learning_rate": 4.647414624502777e-06, + "loss": 0.5708, + "step": 421 + }, + { + "epoch": 0.35, + "grad_norm": 1.9976964785548623, + "learning_rate": 4.645738559787524e-06, + "loss": 0.6006, + "step": 422 + }, + { + "epoch": 0.35, + "grad_norm": 1.9098681403283917, + "learning_rate": 4.64405882452786e-06, + "loss": 0.5591, + "step": 423 + }, + { + "epoch": 0.35, + "grad_norm": 1.8695612182804557, + "learning_rate": 4.642375421597175e-06, + "loss": 0.5219, + "step": 424 + }, + { + "epoch": 0.35, + "grad_norm": 1.8912077704810082, + "learning_rate": 4.6406883538751315e-06, + "loss": 0.5224, + "step": 425 + }, + { + "epoch": 0.35, + "grad_norm": 1.9390714726978922, + "learning_rate": 4.638997624247664e-06, + "loss": 0.5359, + "step": 426 + }, + { + "epoch": 0.35, + "grad_norm": 2.051545992296337, + "learning_rate": 4.637303235606968e-06, + "loss": 0.544, + "step": 427 + }, + { + "epoch": 0.35, + "grad_norm": 2.0657109136265914, + "learning_rate": 4.6356051908515e-06, + "loss": 0.5429, + "step": 428 + }, + { + "epoch": 0.36, + "grad_norm": 2.0301022307984793, + "learning_rate": 4.63390349288597e-06, + "loss": 0.5787, + "step": 429 + }, + { + "epoch": 0.36, + "grad_norm": 2.052515756169346, + "learning_rate": 4.632198144621338e-06, + "loss": 0.5778, + "step": 430 + }, + { + "epoch": 0.36, + "grad_norm": 1.9741370495474897, + "learning_rate": 4.630489148974807e-06, + "loss": 0.5142, + "step": 431 + }, + { + "epoch": 0.36, + "grad_norm": 1.9713229498863698, + "learning_rate": 4.62877650886982e-06, + "loss": 0.6127, + "step": 432 + }, + { + "epoch": 0.36, + "grad_norm": 2.1609440121306007, + "learning_rate": 4.627060227236055e-06, + "loss": 0.5886, + "step": 433 + }, + { + "epoch": 0.36, + "grad_norm": 1.944966445355139, + "learning_rate": 4.625340307009418e-06, + "loss": 0.5657, + "step": 434 + }, + { + "epoch": 0.36, + "grad_norm": 2.031003925680835, + "learning_rate": 4.623616751132041e-06, + "loss": 0.5628, + "step": 435 + }, + { + "epoch": 0.36, + "grad_norm": 1.8774113373137704, + "learning_rate": 4.621889562552272e-06, + "loss": 0.6068, + "step": 436 + }, + { + "epoch": 0.36, + "grad_norm": 2.0385201543401785, + "learning_rate": 4.620158744224677e-06, + "loss": 0.5511, + "step": 437 + }, + { + "epoch": 0.36, + "grad_norm": 1.8440750841938207, + "learning_rate": 4.618424299110028e-06, + "loss": 0.5261, + "step": 438 + }, + { + "epoch": 0.36, + "grad_norm": 1.8978691755923442, + "learning_rate": 4.616686230175303e-06, + "loss": 0.5862, + "step": 439 + }, + { + "epoch": 0.36, + "grad_norm": 1.8120850246861446, + "learning_rate": 4.614944540393679e-06, + "loss": 0.5652, + "step": 440 + }, + { + "epoch": 0.37, + "grad_norm": 2.1821084695714914, + "learning_rate": 4.613199232744525e-06, + "loss": 0.5598, + "step": 441 + }, + { + "epoch": 0.37, + "grad_norm": 1.9626422737625222, + "learning_rate": 4.611450310213401e-06, + "loss": 0.5267, + "step": 442 + }, + { + "epoch": 0.37, + "grad_norm": 1.9714913234889215, + "learning_rate": 4.6096977757920505e-06, + "loss": 0.5658, + "step": 443 + }, + { + "epoch": 0.37, + "grad_norm": 2.0179324078198233, + "learning_rate": 4.607941632478393e-06, + "loss": 0.582, + "step": 444 + }, + { + "epoch": 0.37, + "grad_norm": 1.8565193856331161, + "learning_rate": 4.6061818832765246e-06, + "loss": 0.5715, + "step": 445 + }, + { + "epoch": 0.37, + "grad_norm": 1.9798501479599246, + "learning_rate": 4.604418531196708e-06, + "loss": 0.6007, + "step": 446 + }, + { + "epoch": 0.37, + "grad_norm": 2.0095846956468257, + "learning_rate": 4.602651579255369e-06, + "loss": 0.5947, + "step": 447 + }, + { + "epoch": 0.37, + "grad_norm": 1.9316541079988245, + "learning_rate": 4.600881030475093e-06, + "loss": 0.5501, + "step": 448 + }, + { + "epoch": 0.37, + "grad_norm": 2.080069353365406, + "learning_rate": 4.599106887884616e-06, + "loss": 0.5631, + "step": 449 + }, + { + "epoch": 0.37, + "grad_norm": 1.965973137652201, + "learning_rate": 4.5973291545188235e-06, + "loss": 0.5267, + "step": 450 + }, + { + "epoch": 0.37, + "grad_norm": 2.1082225966704087, + "learning_rate": 4.595547833418741e-06, + "loss": 0.6418, + "step": 451 + }, + { + "epoch": 0.37, + "grad_norm": 2.0359312594194083, + "learning_rate": 4.593762927631536e-06, + "loss": 0.5644, + "step": 452 + }, + { + "epoch": 0.38, + "grad_norm": 2.1254892914109433, + "learning_rate": 4.591974440210502e-06, + "loss": 0.5693, + "step": 453 + }, + { + "epoch": 0.38, + "grad_norm": 1.9121188587334927, + "learning_rate": 4.590182374215064e-06, + "loss": 0.5572, + "step": 454 + }, + { + "epoch": 0.38, + "grad_norm": 1.9348642624953207, + "learning_rate": 4.588386732710765e-06, + "loss": 0.5446, + "step": 455 + }, + { + "epoch": 0.38, + "grad_norm": 1.8667846547370581, + "learning_rate": 4.5865875187692695e-06, + "loss": 0.5681, + "step": 456 + }, + { + "epoch": 0.38, + "grad_norm": 1.9219061327454674, + "learning_rate": 4.5847847354683465e-06, + "loss": 0.5508, + "step": 457 + }, + { + "epoch": 0.38, + "grad_norm": 1.8106132369123122, + "learning_rate": 4.5829783858918756e-06, + "loss": 0.5626, + "step": 458 + }, + { + "epoch": 0.38, + "grad_norm": 1.7827483964442634, + "learning_rate": 4.5811684731298355e-06, + "loss": 0.5575, + "step": 459 + }, + { + "epoch": 0.38, + "grad_norm": 1.9284196979863513, + "learning_rate": 4.5793550002783e-06, + "loss": 0.5363, + "step": 460 + }, + { + "epoch": 0.38, + "grad_norm": 2.029647468705457, + "learning_rate": 4.577537970439433e-06, + "loss": 0.5415, + "step": 461 + }, + { + "epoch": 0.38, + "grad_norm": 2.0997127029950087, + "learning_rate": 4.575717386721482e-06, + "loss": 0.5814, + "step": 462 + }, + { + "epoch": 0.38, + "grad_norm": 1.9589290300656341, + "learning_rate": 4.573893252238777e-06, + "loss": 0.5156, + "step": 463 + }, + { + "epoch": 0.38, + "grad_norm": 1.905237143908251, + "learning_rate": 4.572065570111717e-06, + "loss": 0.5536, + "step": 464 + }, + { + "epoch": 0.39, + "grad_norm": 1.929519794935609, + "learning_rate": 4.570234343466775e-06, + "loss": 0.5879, + "step": 465 + }, + { + "epoch": 0.39, + "grad_norm": 2.096095808886982, + "learning_rate": 4.568399575436484e-06, + "loss": 0.6241, + "step": 466 + }, + { + "epoch": 0.39, + "grad_norm": 1.9486118894048778, + "learning_rate": 4.566561269159437e-06, + "loss": 0.6307, + "step": 467 + }, + { + "epoch": 0.39, + "grad_norm": 2.0839490306744586, + "learning_rate": 4.564719427780276e-06, + "loss": 0.5655, + "step": 468 + }, + { + "epoch": 0.39, + "grad_norm": 1.9439525665822102, + "learning_rate": 4.562874054449694e-06, + "loss": 0.5437, + "step": 469 + }, + { + "epoch": 0.39, + "grad_norm": 1.9409142791465297, + "learning_rate": 4.5610251523244244e-06, + "loss": 0.6429, + "step": 470 + }, + { + "epoch": 0.39, + "grad_norm": 1.8664574493795525, + "learning_rate": 4.559172724567238e-06, + "loss": 0.5826, + "step": 471 + }, + { + "epoch": 0.39, + "grad_norm": 1.80819349503324, + "learning_rate": 4.557316774346934e-06, + "loss": 0.5372, + "step": 472 + }, + { + "epoch": 0.39, + "grad_norm": 1.8680097526865296, + "learning_rate": 4.555457304838341e-06, + "loss": 0.5503, + "step": 473 + }, + { + "epoch": 0.39, + "grad_norm": 1.7466938790815696, + "learning_rate": 4.553594319222303e-06, + "loss": 0.5425, + "step": 474 + }, + { + "epoch": 0.39, + "grad_norm": 1.9610557658505607, + "learning_rate": 4.551727820685684e-06, + "loss": 0.5755, + "step": 475 + }, + { + "epoch": 0.39, + "grad_norm": 1.9414839604282412, + "learning_rate": 4.549857812421353e-06, + "loss": 0.5915, + "step": 476 + }, + { + "epoch": 0.4, + "grad_norm": 1.8484957644576423, + "learning_rate": 4.547984297628186e-06, + "loss": 0.5676, + "step": 477 + }, + { + "epoch": 0.4, + "grad_norm": 2.074524028551078, + "learning_rate": 4.546107279511055e-06, + "loss": 0.6084, + "step": 478 + }, + { + "epoch": 0.4, + "grad_norm": 2.069692704122282, + "learning_rate": 4.544226761280826e-06, + "loss": 0.5676, + "step": 479 + }, + { + "epoch": 0.4, + "grad_norm": 1.8975472248317244, + "learning_rate": 4.54234274615435e-06, + "loss": 0.5904, + "step": 480 + }, + { + "epoch": 0.4, + "grad_norm": 2.0118868982719897, + "learning_rate": 4.540455237354466e-06, + "loss": 0.5722, + "step": 481 + }, + { + "epoch": 0.4, + "grad_norm": 1.9733105429381828, + "learning_rate": 4.5385642381099814e-06, + "loss": 0.6112, + "step": 482 + }, + { + "epoch": 0.4, + "grad_norm": 1.862156914026863, + "learning_rate": 4.53666975165568e-06, + "loss": 0.5951, + "step": 483 + }, + { + "epoch": 0.4, + "grad_norm": 1.9512940035297868, + "learning_rate": 4.53477178123231e-06, + "loss": 0.5223, + "step": 484 + }, + { + "epoch": 0.4, + "grad_norm": 1.9202464191558823, + "learning_rate": 4.532870330086577e-06, + "loss": 0.5638, + "step": 485 + }, + { + "epoch": 0.4, + "grad_norm": 1.9015767656854419, + "learning_rate": 4.530965401471143e-06, + "loss": 0.5911, + "step": 486 + }, + { + "epoch": 0.4, + "grad_norm": 1.95190921973106, + "learning_rate": 4.529056998644619e-06, + "loss": 0.6053, + "step": 487 + }, + { + "epoch": 0.4, + "grad_norm": 2.0058459596081644, + "learning_rate": 4.527145124871556e-06, + "loss": 0.5466, + "step": 488 + }, + { + "epoch": 0.41, + "grad_norm": 1.8902620959998047, + "learning_rate": 4.5252297834224454e-06, + "loss": 0.5526, + "step": 489 + }, + { + "epoch": 0.41, + "grad_norm": 1.985466416169018, + "learning_rate": 4.523310977573711e-06, + "loss": 0.5958, + "step": 490 + }, + { + "epoch": 0.41, + "grad_norm": 2.1140148957176415, + "learning_rate": 4.521388710607699e-06, + "loss": 0.613, + "step": 491 + }, + { + "epoch": 0.41, + "grad_norm": 1.9470601192089525, + "learning_rate": 4.51946298581268e-06, + "loss": 0.5847, + "step": 492 + }, + { + "epoch": 0.41, + "grad_norm": 2.0227057176069603, + "learning_rate": 4.51753380648284e-06, + "loss": 0.5784, + "step": 493 + }, + { + "epoch": 0.41, + "grad_norm": 2.05501863673554, + "learning_rate": 4.515601175918269e-06, + "loss": 0.5501, + "step": 494 + }, + { + "epoch": 0.41, + "grad_norm": 2.0129325402811715, + "learning_rate": 4.513665097424967e-06, + "loss": 0.5641, + "step": 495 + }, + { + "epoch": 0.41, + "grad_norm": 2.0322333044110468, + "learning_rate": 4.51172557431483e-06, + "loss": 0.5422, + "step": 496 + }, + { + "epoch": 0.41, + "grad_norm": 1.9573055659958774, + "learning_rate": 4.509782609905644e-06, + "loss": 0.516, + "step": 497 + }, + { + "epoch": 0.41, + "grad_norm": 1.8223127451485421, + "learning_rate": 4.507836207521085e-06, + "loss": 0.5714, + "step": 498 + }, + { + "epoch": 0.41, + "grad_norm": 1.9343089861079434, + "learning_rate": 4.50588637049071e-06, + "loss": 0.5424, + "step": 499 + }, + { + "epoch": 0.41, + "grad_norm": 1.8940990649350729, + "learning_rate": 4.503933102149948e-06, + "loss": 0.5832, + "step": 500 + }, + { + "epoch": 0.42, + "grad_norm": 1.908617301933682, + "learning_rate": 4.501976405840101e-06, + "loss": 0.5399, + "step": 501 + }, + { + "epoch": 0.42, + "grad_norm": 1.8290259512093785, + "learning_rate": 4.500016284908334e-06, + "loss": 0.5561, + "step": 502 + }, + { + "epoch": 0.42, + "grad_norm": 1.9840280991844164, + "learning_rate": 4.49805274270767e-06, + "loss": 0.5645, + "step": 503 + }, + { + "epoch": 0.42, + "grad_norm": 1.9864953051636856, + "learning_rate": 4.496085782596984e-06, + "loss": 0.5369, + "step": 504 + }, + { + "epoch": 0.42, + "grad_norm": 1.979387839103732, + "learning_rate": 4.494115407940999e-06, + "loss": 0.6196, + "step": 505 + }, + { + "epoch": 0.42, + "grad_norm": 1.9266869362165981, + "learning_rate": 4.492141622110279e-06, + "loss": 0.5687, + "step": 506 + }, + { + "epoch": 0.42, + "grad_norm": 1.9887461782376619, + "learning_rate": 4.4901644284812205e-06, + "loss": 0.5264, + "step": 507 + }, + { + "epoch": 0.42, + "grad_norm": 1.8717867803152208, + "learning_rate": 4.488183830436052e-06, + "loss": 0.5612, + "step": 508 + }, + { + "epoch": 0.42, + "grad_norm": 2.0044226171493, + "learning_rate": 4.486199831362828e-06, + "loss": 0.5571, + "step": 509 + }, + { + "epoch": 0.42, + "grad_norm": 2.1075571016617958, + "learning_rate": 4.484212434655414e-06, + "loss": 0.5642, + "step": 510 + }, + { + "epoch": 0.42, + "grad_norm": 1.8031612547539957, + "learning_rate": 4.482221643713494e-06, + "loss": 0.5805, + "step": 511 + }, + { + "epoch": 0.42, + "grad_norm": 1.8782516337672304, + "learning_rate": 4.480227461942556e-06, + "loss": 0.5596, + "step": 512 + }, + { + "epoch": 0.43, + "grad_norm": 2.075073901596185, + "learning_rate": 4.478229892753886e-06, + "loss": 0.6124, + "step": 513 + }, + { + "epoch": 0.43, + "grad_norm": 2.0588983460568304, + "learning_rate": 4.47622893956457e-06, + "loss": 0.5589, + "step": 514 + }, + { + "epoch": 0.43, + "grad_norm": 1.850248236464706, + "learning_rate": 4.474224605797476e-06, + "loss": 0.5603, + "step": 515 + }, + { + "epoch": 0.43, + "grad_norm": 1.932844310652863, + "learning_rate": 4.472216894881261e-06, + "loss": 0.5571, + "step": 516 + }, + { + "epoch": 0.43, + "grad_norm": 2.09975454805468, + "learning_rate": 4.470205810250357e-06, + "loss": 0.5975, + "step": 517 + }, + { + "epoch": 0.43, + "grad_norm": 1.9694087093010304, + "learning_rate": 4.468191355344965e-06, + "loss": 0.5698, + "step": 518 + }, + { + "epoch": 0.43, + "grad_norm": 1.8794788153917539, + "learning_rate": 4.466173533611053e-06, + "loss": 0.5559, + "step": 519 + }, + { + "epoch": 0.43, + "grad_norm": 2.0650455557855434, + "learning_rate": 4.46415234850035e-06, + "loss": 0.5644, + "step": 520 + }, + { + "epoch": 0.43, + "grad_norm": 2.0062649027982022, + "learning_rate": 4.462127803470334e-06, + "loss": 0.608, + "step": 521 + }, + { + "epoch": 0.43, + "grad_norm": 2.043267877462657, + "learning_rate": 4.460099901984235e-06, + "loss": 0.573, + "step": 522 + }, + { + "epoch": 0.43, + "grad_norm": 2.056372436619027, + "learning_rate": 4.4580686475110235e-06, + "loss": 0.5748, + "step": 523 + }, + { + "epoch": 0.43, + "grad_norm": 1.8871033520138176, + "learning_rate": 4.456034043525404e-06, + "loss": 0.5339, + "step": 524 + }, + { + "epoch": 0.44, + "grad_norm": 1.889474616209236, + "learning_rate": 4.45399609350781e-06, + "loss": 0.5185, + "step": 525 + }, + { + "epoch": 0.44, + "grad_norm": 1.9767406217632912, + "learning_rate": 4.451954800944405e-06, + "loss": 0.5758, + "step": 526 + }, + { + "epoch": 0.44, + "grad_norm": 1.9588695861513832, + "learning_rate": 4.449910169327062e-06, + "loss": 0.5472, + "step": 527 + }, + { + "epoch": 0.44, + "grad_norm": 1.8852210889000718, + "learning_rate": 4.447862202153372e-06, + "loss": 0.5917, + "step": 528 + }, + { + "epoch": 0.44, + "grad_norm": 2.0103638871993077, + "learning_rate": 4.445810902926629e-06, + "loss": 0.5761, + "step": 529 + }, + { + "epoch": 0.44, + "grad_norm": 2.201836945389513, + "learning_rate": 4.443756275155827e-06, + "loss": 0.5614, + "step": 530 + }, + { + "epoch": 0.44, + "grad_norm": 1.900702305836831, + "learning_rate": 4.441698322355656e-06, + "loss": 0.5254, + "step": 531 + }, + { + "epoch": 0.44, + "grad_norm": 2.134694583439314, + "learning_rate": 4.4396370480464915e-06, + "loss": 0.5607, + "step": 532 + }, + { + "epoch": 0.44, + "grad_norm": 1.8073751630381198, + "learning_rate": 4.437572455754391e-06, + "loss": 0.536, + "step": 533 + }, + { + "epoch": 0.44, + "grad_norm": 1.9607338020142653, + "learning_rate": 4.435504549011088e-06, + "loss": 0.59, + "step": 534 + }, + { + "epoch": 0.44, + "grad_norm": 2.0756430867435274, + "learning_rate": 4.433433331353988e-06, + "loss": 0.5538, + "step": 535 + }, + { + "epoch": 0.44, + "grad_norm": 1.8280570853718465, + "learning_rate": 4.431358806326158e-06, + "loss": 0.5789, + "step": 536 + }, + { + "epoch": 0.45, + "grad_norm": 2.2005143967434977, + "learning_rate": 4.429280977476321e-06, + "loss": 0.545, + "step": 537 + }, + { + "epoch": 0.45, + "grad_norm": 1.896479397543979, + "learning_rate": 4.4271998483588565e-06, + "loss": 0.5791, + "step": 538 + }, + { + "epoch": 0.45, + "grad_norm": 2.117773381781195, + "learning_rate": 4.425115422533785e-06, + "loss": 0.5234, + "step": 539 + }, + { + "epoch": 0.45, + "grad_norm": 2.4438942429566617, + "learning_rate": 4.423027703566769e-06, + "loss": 0.5692, + "step": 540 + }, + { + "epoch": 0.45, + "grad_norm": 1.873481152225171, + "learning_rate": 4.4209366950291025e-06, + "loss": 0.5739, + "step": 541 + }, + { + "epoch": 0.45, + "grad_norm": 1.8655199147974673, + "learning_rate": 4.4188424004977085e-06, + "loss": 0.5795, + "step": 542 + }, + { + "epoch": 0.45, + "grad_norm": 1.948840412241188, + "learning_rate": 4.416744823555129e-06, + "loss": 0.5304, + "step": 543 + }, + { + "epoch": 0.45, + "grad_norm": 1.8389034133315045, + "learning_rate": 4.414643967789523e-06, + "loss": 0.5076, + "step": 544 + }, + { + "epoch": 0.45, + "grad_norm": 1.8269235720085213, + "learning_rate": 4.412539836794657e-06, + "loss": 0.5837, + "step": 545 + }, + { + "epoch": 0.45, + "grad_norm": 2.1298715969759505, + "learning_rate": 4.410432434169902e-06, + "loss": 0.5694, + "step": 546 + }, + { + "epoch": 0.45, + "grad_norm": 2.0057741366005746, + "learning_rate": 4.408321763520223e-06, + "loss": 0.557, + "step": 547 + }, + { + "epoch": 0.45, + "grad_norm": 1.7901331374893255, + "learning_rate": 4.406207828456177e-06, + "loss": 0.5746, + "step": 548 + }, + { + "epoch": 0.46, + "grad_norm": 2.1994839889416187, + "learning_rate": 4.404090632593904e-06, + "loss": 0.5407, + "step": 549 + }, + { + "epoch": 0.46, + "grad_norm": 1.9664921082690268, + "learning_rate": 4.401970179555123e-06, + "loss": 0.5322, + "step": 550 + }, + { + "epoch": 0.46, + "grad_norm": 1.9933486180243851, + "learning_rate": 4.399846472967124e-06, + "loss": 0.5798, + "step": 551 + }, + { + "epoch": 0.46, + "grad_norm": 1.986612256562151, + "learning_rate": 4.397719516462765e-06, + "loss": 0.5213, + "step": 552 + }, + { + "epoch": 0.46, + "grad_norm": 2.046550123292336, + "learning_rate": 4.395589313680459e-06, + "loss": 0.5857, + "step": 553 + }, + { + "epoch": 0.46, + "grad_norm": 1.7902327250340486, + "learning_rate": 4.393455868264176e-06, + "loss": 0.555, + "step": 554 + }, + { + "epoch": 0.46, + "grad_norm": 2.0203627138517146, + "learning_rate": 4.391319183863432e-06, + "loss": 0.6329, + "step": 555 + }, + { + "epoch": 0.46, + "grad_norm": 1.9373549045181289, + "learning_rate": 4.389179264133281e-06, + "loss": 0.566, + "step": 556 + }, + { + "epoch": 0.46, + "grad_norm": 1.8936753353678124, + "learning_rate": 4.387036112734316e-06, + "loss": 0.5555, + "step": 557 + }, + { + "epoch": 0.46, + "grad_norm": 1.8493817575820743, + "learning_rate": 4.3848897333326545e-06, + "loss": 0.5427, + "step": 558 + }, + { + "epoch": 0.46, + "grad_norm": 1.9119588677783816, + "learning_rate": 4.382740129599937e-06, + "loss": 0.5157, + "step": 559 + }, + { + "epoch": 0.46, + "grad_norm": 1.8190137094200924, + "learning_rate": 4.380587305213321e-06, + "loss": 0.503, + "step": 560 + }, + { + "epoch": 0.47, + "grad_norm": 1.9891332712764953, + "learning_rate": 4.37843126385547e-06, + "loss": 0.5761, + "step": 561 + }, + { + "epoch": 0.47, + "grad_norm": 1.8620896547461154, + "learning_rate": 4.376272009214555e-06, + "loss": 0.5259, + "step": 562 + }, + { + "epoch": 0.47, + "grad_norm": 1.8896721756477406, + "learning_rate": 4.37410954498424e-06, + "loss": 0.5632, + "step": 563 + }, + { + "epoch": 0.47, + "grad_norm": 1.8302281976781984, + "learning_rate": 4.37194387486368e-06, + "loss": 0.5612, + "step": 564 + }, + { + "epoch": 0.47, + "grad_norm": 2.0721820586440165, + "learning_rate": 4.369775002557516e-06, + "loss": 0.533, + "step": 565 + }, + { + "epoch": 0.47, + "grad_norm": 1.8259926551813157, + "learning_rate": 4.367602931775865e-06, + "loss": 0.526, + "step": 566 + }, + { + "epoch": 0.47, + "grad_norm": 1.8096334574000785, + "learning_rate": 4.3654276662343155e-06, + "loss": 0.5306, + "step": 567 + }, + { + "epoch": 0.47, + "grad_norm": 1.9675637591445598, + "learning_rate": 4.363249209653922e-06, + "loss": 0.5577, + "step": 568 + }, + { + "epoch": 0.47, + "grad_norm": 1.8800389115841605, + "learning_rate": 4.361067565761197e-06, + "loss": 0.5553, + "step": 569 + }, + { + "epoch": 0.47, + "grad_norm": 1.827485496395265, + "learning_rate": 4.358882738288105e-06, + "loss": 0.5587, + "step": 570 + }, + { + "epoch": 0.47, + "grad_norm": 1.820954908943235, + "learning_rate": 4.356694730972056e-06, + "loss": 0.6186, + "step": 571 + }, + { + "epoch": 0.47, + "grad_norm": 1.952072431699686, + "learning_rate": 4.3545035475559025e-06, + "loss": 0.5488, + "step": 572 + }, + { + "epoch": 0.48, + "grad_norm": 1.8292648968688423, + "learning_rate": 4.352309191787924e-06, + "loss": 0.5534, + "step": 573 + }, + { + "epoch": 0.48, + "grad_norm": 1.826293122529813, + "learning_rate": 4.350111667421835e-06, + "loss": 0.5872, + "step": 574 + }, + { + "epoch": 0.48, + "grad_norm": 1.9251425791166785, + "learning_rate": 4.347910978216763e-06, + "loss": 0.5298, + "step": 575 + }, + { + "epoch": 0.48, + "grad_norm": 1.8330818196811385, + "learning_rate": 4.345707127937253e-06, + "loss": 0.5871, + "step": 576 + }, + { + "epoch": 0.48, + "grad_norm": 1.7842986545873851, + "learning_rate": 4.3435001203532555e-06, + "loss": 0.4898, + "step": 577 + }, + { + "epoch": 0.48, + "grad_norm": 1.8778666245156521, + "learning_rate": 4.341289959240124e-06, + "loss": 0.5385, + "step": 578 + }, + { + "epoch": 0.48, + "grad_norm": 1.9300679499181266, + "learning_rate": 4.339076648378605e-06, + "loss": 0.5698, + "step": 579 + }, + { + "epoch": 0.48, + "grad_norm": 1.9440861965960357, + "learning_rate": 4.336860191554833e-06, + "loss": 0.5984, + "step": 580 + }, + { + "epoch": 0.48, + "grad_norm": 1.929951096053947, + "learning_rate": 4.3346405925603265e-06, + "loss": 0.6222, + "step": 581 + }, + { + "epoch": 0.48, + "grad_norm": 1.9138258400335695, + "learning_rate": 4.332417855191974e-06, + "loss": 0.5498, + "step": 582 + }, + { + "epoch": 0.48, + "grad_norm": 2.058548455869675, + "learning_rate": 4.330191983252039e-06, + "loss": 0.5218, + "step": 583 + }, + { + "epoch": 0.48, + "grad_norm": 2.243429045583125, + "learning_rate": 4.327962980548142e-06, + "loss": 0.5768, + "step": 584 + }, + { + "epoch": 0.48, + "grad_norm": 1.9213537104634244, + "learning_rate": 4.32573085089326e-06, + "loss": 0.5784, + "step": 585 + }, + { + "epoch": 0.49, + "grad_norm": 1.9165291289119128, + "learning_rate": 4.32349559810572e-06, + "loss": 0.5697, + "step": 586 + }, + { + "epoch": 0.49, + "grad_norm": 1.9674279518735756, + "learning_rate": 4.321257226009193e-06, + "loss": 0.5104, + "step": 587 + }, + { + "epoch": 0.49, + "grad_norm": 1.9051339015323923, + "learning_rate": 4.319015738432683e-06, + "loss": 0.5711, + "step": 588 + }, + { + "epoch": 0.49, + "grad_norm": 1.957357618850765, + "learning_rate": 4.3167711392105245e-06, + "loss": 0.5854, + "step": 589 + }, + { + "epoch": 0.49, + "grad_norm": 1.9859311708308915, + "learning_rate": 4.314523432182376e-06, + "loss": 0.547, + "step": 590 + }, + { + "epoch": 0.49, + "grad_norm": 1.773704456523191, + "learning_rate": 4.312272621193209e-06, + "loss": 0.5259, + "step": 591 + }, + { + "epoch": 0.49, + "grad_norm": 1.82988033655793, + "learning_rate": 4.31001871009331e-06, + "loss": 0.5209, + "step": 592 + }, + { + "epoch": 0.49, + "grad_norm": 1.8925134832060522, + "learning_rate": 4.307761702738264e-06, + "loss": 0.59, + "step": 593 + }, + { + "epoch": 0.49, + "grad_norm": 1.8477075780641046, + "learning_rate": 4.305501602988953e-06, + "loss": 0.5714, + "step": 594 + }, + { + "epoch": 0.49, + "grad_norm": 1.8568432886623798, + "learning_rate": 4.303238414711552e-06, + "loss": 0.5877, + "step": 595 + }, + { + "epoch": 0.49, + "grad_norm": 1.8179798660158206, + "learning_rate": 4.3009721417775166e-06, + "loss": 0.6029, + "step": 596 + }, + { + "epoch": 0.49, + "grad_norm": 1.8494963193854803, + "learning_rate": 4.29870278806358e-06, + "loss": 0.5236, + "step": 597 + }, + { + "epoch": 0.5, + "grad_norm": 1.9586017397154731, + "learning_rate": 4.296430357451744e-06, + "loss": 0.5998, + "step": 598 + }, + { + "epoch": 0.5, + "grad_norm": 1.926616057974202, + "learning_rate": 4.2941548538292765e-06, + "loss": 0.5914, + "step": 599 + }, + { + "epoch": 0.5, + "grad_norm": 1.9321738359144827, + "learning_rate": 4.291876281088701e-06, + "loss": 0.5358, + "step": 600 + }, + { + "epoch": 0.5, + "grad_norm": 1.8229177571361932, + "learning_rate": 4.289594643127788e-06, + "loss": 0.5284, + "step": 601 + }, + { + "epoch": 0.5, + "grad_norm": 1.849252449531427, + "learning_rate": 4.287309943849558e-06, + "loss": 0.5689, + "step": 602 + }, + { + "epoch": 0.5, + "grad_norm": 1.985343175388319, + "learning_rate": 4.285022187162261e-06, + "loss": 0.6101, + "step": 603 + } + ], + "logging_steps": 1, + "max_steps": 2412, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 603, + "total_flos": 283958071787520.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-603/training_args.bin b/checkpoint-603/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e --- /dev/null +++ b/checkpoint-603/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26 +size 7352 diff --git a/checkpoint-603/zero_to_fp32.py b/checkpoint-603/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..49b846633d6eb1e836e34681e44033581f4edb7b --- /dev/null +++ b/checkpoint-603/zero_to_fp32.py @@ -0,0 +1,592 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9e19729ddff0edff2916b7fba6d2fec722621e76 --- /dev/null +++ b/config.json @@ -0,0 +1,26 @@ +{ + "_name_or_path": "alpindale/Mistral-7B-v0.2-hf", + "architectures": [ + "MistralForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 32768, + "model_type": "mistral", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.38.2", + "use_cache": false, + "vocab_size": 32002 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..282b497efd8f276cf9270e576fb79be429aebcdc --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "transformers_version": "4.38.2" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f1c522c741bc956a541d5544734d12ff3a71b33 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89fd0fface188ca3f7988aa53f25e087292d72ca99cd52ef8cb52cf180ad2ff +size 4943178720 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a1c7f2c1a284a17e9b7a9124040ee4bb6680b67 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49dd97160e0a8ff75303f02969df38307407c8800ce94aaa86611ceb6727bca0 +size 4999819336 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3f8cc928e41a10674f627e9a238420111f974bb7 --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03098a839ef612f1efe325b376aa90bc8311a01c1236120d9ca7934eb9b12fed +size 4540532728 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..71e207631efb42944bc779548c9b6d4b5818ffe2 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 14483496960 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..40b1c6dadc2aed5b9e61dc7f9c7299e0aee16069 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8b443ef19c2a19acc3ac64fb9c3db4a72921dff6 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055 +size 493443 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..392e982500a327e3b6f821a513fcae6cc7f4f453 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,60 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c7d637f7d8ebf811202f98fbce7e1b0a49f637e --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f4a52e7a4e455192e0e321f42138a81946c62773f6570625fe5cb773689e26 +size 7352