diff --git a/README.md b/README.md deleted file mode 100644 index 6b1ad030d8dc51c98535bc6be58bc0d0d780757a..0000000000000000000000000000000000000000 --- a/README.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -base_model: Qwen/Qwen3-4B-Instruct-2507 -library_name: transformers -model_name: Qwen3-8B_n3000_math -tags: -- generated_from_trainer -- sft -- trl -licence: license ---- - -# Model Card for Qwen3-8B_n3000_math - -This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507). -It has been trained using [TRL](https://github.com/huggingface/trl). - -## Quick start - -```python -from transformers import pipeline - -question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" -generator = pipeline("text-generation", model="None", device="cuda") -output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] -print(output["generated_text"]) -``` - -## Training procedure - - - - - -This model was trained with SFT. - -### Framework versions - -- TRL: 0.29.0 -- Transformers: 5.5.3 -- Pytorch: 2.8.0 -- Datasets: 4.5.0 -- Tokenizers: 0.22.2 - -## Citations - - - -Cite TRL as: - -```bibtex -@software{vonwerra2020trl, - title = {{TRL: Transformers Reinforcement Learning}}, - author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, - license = {Apache-2.0}, - url = {https://github.com/huggingface/trl}, - year = {2020} -} -``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja deleted file mode 100644 index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000 --- a/chat_template.jinja +++ /dev/null @@ -1,61 +0,0 @@ -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} - {%- endif %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} \ No newline at end of file diff --git a/checkpoint-1000/chat_template.jinja b/checkpoint-1000/chat_template.jinja deleted file mode 100644 index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000 --- a/checkpoint-1000/chat_template.jinja +++ /dev/null @@ -1,61 +0,0 @@ -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} - {%- endif %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} \ No newline at end of file diff --git a/checkpoint-1000/config.json b/checkpoint-1000/config.json deleted file mode 100644 index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000 --- a/checkpoint-1000/config.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": null, - "dtype": "float32", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 262144, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "pad_token_id": 151662, - "rms_norm_eps": 1e-06, - "rope_parameters": { - "rope_theta": 5000000, - "rope_type": "default" - }, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "5.5.3", - "use_cache": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/checkpoint-1000/generation_config.json b/checkpoint-1000/generation_config.json deleted file mode 100644 index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000 --- a/checkpoint-1000/generation_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "do_sample": true, - "eos_token_id": [ - 151645, - 151643 - ], - "pad_token_id": 151662, - "temperature": 0.7, - "top_k": 20, - "top_p": 0.8, - "transformers_version": "5.5.3" -} diff --git a/checkpoint-1000/model.safetensors b/checkpoint-1000/model.safetensors deleted file mode 100644 index bfb8d255b000e1103e63ca8cab7616c5309ec35c..0000000000000000000000000000000000000000 --- a/checkpoint-1000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16cf530a69292d5ebcdc898ff6e27f40e9fa97d07ec9a6fff92606a1cbec50f4 -size 17645743048 diff --git a/checkpoint-1000/optimizer.bin b/checkpoint-1000/optimizer.bin deleted file mode 100644 index aa4e8c63ff91cc412c4b55b37e5e1b5cd2c26f25..0000000000000000000000000000000000000000 --- a/checkpoint-1000/optimizer.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad09a9b1f9d56fb5e24fccb31bc61995bcb8aa26d3d4e5771bcd332a90d2d66e -size 32180124005 diff --git a/checkpoint-1000/pytorch_model_fsdp.bin b/checkpoint-1000/pytorch_model_fsdp.bin deleted file mode 100644 index d7e23f6b1f13fa3e7366b86beba7bead4ecc98c3..0000000000000000000000000000000000000000 --- a/checkpoint-1000/pytorch_model_fsdp.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cde7e1f8a53dcc9407e8636dd3c4261b755f26602abf7c70e6eb4291c93496bd -size 17645897996 diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth deleted file mode 100644 index 3fc68e18ddaf65dfbdec55893d9a925ff5e43a18..0000000000000000000000000000000000000000 --- a/checkpoint-1000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4dd7671ce88d469c49c0530724ac76b2306574002d1ecd1ca9294e41621fd96a -size 14917 diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth deleted file mode 100644 index 79d9de29ae34b3b0c10ea8ba0348aeafb0c12226..0000000000000000000000000000000000000000 --- a/checkpoint-1000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3246ef1170ccca541a03b89ad6f20e01c51eb6834a2c2211c78c71c70f896879 -size 14917 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt deleted file mode 100644 index 4c433f8131571742884bf6317ca17c54e07b544d..0000000000000000000000000000000000000000 --- a/checkpoint-1000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e3184dc815b4354af3c63c9b5b618608d5206305b4414657ef8e0195f7ad089 -size 1465 diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json deleted file mode 100644 index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000 --- a/checkpoint-1000/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 -size 11422650 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json deleted file mode 100644 index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000 --- a/checkpoint-1000/tokenizer_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "add_prefix_space": false, - "backend": "tokenizers", - "bos_token": null, - "clean_up_tokenization_spaces": false, - "eos_token": "<|im_end|>", - "errors": "replace", - "extra_special_tokens": [ - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>" - ], - "is_local": false, - "model_max_length": 1010000, - "pad_token": "<|fim_pad|>", - "split_special_tokens": false, - "tokenizer_class": "Qwen2Tokenizer", - "unk_token": null -} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json deleted file mode 100644 index 87532bf7321a64560106312a2f81138a0e52ebd6..0000000000000000000000000000000000000000 --- a/checkpoint-1000/trainer_state.json +++ /dev/null @@ -1,9034 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.7598784194528876, - "eval_steps": 500, - "global_step": 1000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0007598784194528875, - "grad_norm": 11.767926216125488, - "learning_rate": 0.0, - "loss": 0.7937269806861877, - "mean_token_accuracy": 0.7822731137275696, - "num_tokens": 10507.0, - "step": 1 - }, - { - "epoch": 0.001519756838905775, - "grad_norm": 14.9199800491333, - "learning_rate": 2.5252525252525256e-08, - "loss": 0.7665389776229858, - "mean_token_accuracy": 0.8342233300209045, - "num_tokens": 14806.0, - "step": 2 - }, - { - "epoch": 0.0022796352583586625, - "grad_norm": 11.991217613220215, - "learning_rate": 5.050505050505051e-08, - "loss": 0.9597002267837524, - "mean_token_accuracy": 0.7054992318153381, - "num_tokens": 27170.0, - "step": 3 - }, - { - "epoch": 0.00303951367781155, - "grad_norm": 12.958333015441895, - "learning_rate": 7.575757575757576e-08, - "loss": 0.9971482753753662, - "mean_token_accuracy": 0.7261134386062622, - "num_tokens": 33729.0, - "step": 4 - }, - { - "epoch": 0.003799392097264438, - "grad_norm": 13.5665283203125, - "learning_rate": 1.0101010101010103e-07, - "loss": 0.9504883885383606, - "mean_token_accuracy": 0.745307445526123, - "num_tokens": 41174.0, - "step": 5 - }, - { - "epoch": 0.004559270516717325, - "grad_norm": 10.09444808959961, - "learning_rate": 1.2626262626262626e-07, - "loss": 0.759548008441925, - "mean_token_accuracy": 0.7842121124267578, - "num_tokens": 47943.0, - "step": 6 - }, - { - "epoch": 0.005319148936170213, - "grad_norm": 10.741650581359863, - "learning_rate": 1.5151515151515152e-07, - "loss": 0.8231598138809204, - "mean_token_accuracy": 0.7550969123840332, - "num_tokens": 56665.0, - "step": 7 - }, - { - "epoch": 0.0060790273556231, - "grad_norm": 12.250170707702637, - "learning_rate": 1.767676767676768e-07, - "loss": 0.8576581478118896, - "mean_token_accuracy": 0.7568671703338623, - "num_tokens": 67606.0, - "step": 8 - }, - { - "epoch": 0.006838905775075988, - "grad_norm": 12.828629493713379, - "learning_rate": 2.0202020202020205e-07, - "loss": 0.9886435866355896, - "mean_token_accuracy": 0.733400285243988, - "num_tokens": 74272.0, - "step": 9 - }, - { - "epoch": 0.007598784194528876, - "grad_norm": 15.966923713684082, - "learning_rate": 2.2727272727272729e-07, - "loss": 1.064985990524292, - "mean_token_accuracy": 0.7101132869720459, - "num_tokens": 80524.0, - "step": 10 - }, - { - "epoch": 0.008358662613981762, - "grad_norm": 10.864850044250488, - "learning_rate": 2.525252525252525e-07, - "loss": 0.8311550617218018, - "mean_token_accuracy": 0.7431639432907104, - "num_tokens": 96292.0, - "step": 11 - }, - { - "epoch": 0.00911854103343465, - "grad_norm": 16.438785552978516, - "learning_rate": 2.7777777777777776e-07, - "loss": 1.0579866170883179, - "mean_token_accuracy": 0.7222976684570312, - "num_tokens": 102992.0, - "step": 12 - }, - { - "epoch": 0.009878419452887538, - "grad_norm": 11.179214477539062, - "learning_rate": 3.0303030303030305e-07, - "loss": 0.9816144704818726, - "mean_token_accuracy": 0.7206371426582336, - "num_tokens": 113571.0, - "step": 13 - }, - { - "epoch": 0.010638297872340425, - "grad_norm": 12.780299186706543, - "learning_rate": 3.2828282828282834e-07, - "loss": 0.847449004650116, - "mean_token_accuracy": 0.7826199531555176, - "num_tokens": 119568.0, - "step": 14 - }, - { - "epoch": 0.011398176291793313, - "grad_norm": 14.800421714782715, - "learning_rate": 3.535353535353536e-07, - "loss": 0.9275516271591187, - "mean_token_accuracy": 0.7655045986175537, - "num_tokens": 126258.0, - "step": 15 - }, - { - "epoch": 0.0121580547112462, - "grad_norm": 11.267602920532227, - "learning_rate": 3.787878787878788e-07, - "loss": 0.8464037179946899, - "mean_token_accuracy": 0.7606508731842041, - "num_tokens": 136831.0, - "step": 16 - }, - { - "epoch": 0.012917933130699088, - "grad_norm": 12.891013145446777, - "learning_rate": 4.040404040404041e-07, - "loss": 0.9903074502944946, - "mean_token_accuracy": 0.7247487306594849, - "num_tokens": 150434.0, - "step": 17 - }, - { - "epoch": 0.013677811550151976, - "grad_norm": 11.13957691192627, - "learning_rate": 4.2929292929292934e-07, - "loss": 0.8287211656570435, - "mean_token_accuracy": 0.7621913552284241, - "num_tokens": 158516.0, - "step": 18 - }, - { - "epoch": 0.014437689969604863, - "grad_norm": 18.39569664001465, - "learning_rate": 4.5454545454545457e-07, - "loss": 1.150015115737915, - "mean_token_accuracy": 0.7349498271942139, - "num_tokens": 162214.0, - "step": 19 - }, - { - "epoch": 0.015197568389057751, - "grad_norm": 9.353750228881836, - "learning_rate": 4.797979797979798e-07, - "loss": 0.7228299379348755, - "mean_token_accuracy": 0.7969573736190796, - "num_tokens": 173035.0, - "step": 20 - }, - { - "epoch": 0.015957446808510637, - "grad_norm": 8.267163276672363, - "learning_rate": 5.05050505050505e-07, - "loss": 0.7358136177062988, - "mean_token_accuracy": 0.7903937101364136, - "num_tokens": 183568.0, - "step": 21 - }, - { - "epoch": 0.016717325227963525, - "grad_norm": 11.137128829956055, - "learning_rate": 5.303030303030304e-07, - "loss": 1.0075397491455078, - "mean_token_accuracy": 0.702807605266571, - "num_tokens": 192759.0, - "step": 22 - }, - { - "epoch": 0.017477203647416412, - "grad_norm": 10.734103202819824, - "learning_rate": 5.555555555555555e-07, - "loss": 0.8925919532775879, - "mean_token_accuracy": 0.7475671768188477, - "num_tokens": 201280.0, - "step": 23 - }, - { - "epoch": 0.0182370820668693, - "grad_norm": 11.945566177368164, - "learning_rate": 5.808080808080809e-07, - "loss": 0.7260514497756958, - "mean_token_accuracy": 0.7859152555465698, - "num_tokens": 218053.0, - "step": 24 - }, - { - "epoch": 0.018996960486322188, - "grad_norm": 18.610652923583984, - "learning_rate": 6.060606060606061e-07, - "loss": 0.8995465636253357, - "mean_token_accuracy": 0.7931990623474121, - "num_tokens": 220953.0, - "step": 25 - }, - { - "epoch": 0.019756838905775075, - "grad_norm": 10.51898193359375, - "learning_rate": 6.313131313131314e-07, - "loss": 0.9532671570777893, - "mean_token_accuracy": 0.7257645726203918, - "num_tokens": 231200.0, - "step": 26 - }, - { - "epoch": 0.020516717325227963, - "grad_norm": 9.581812858581543, - "learning_rate": 6.565656565656567e-07, - "loss": 0.9038010239601135, - "mean_token_accuracy": 0.7390379905700684, - "num_tokens": 237711.0, - "step": 27 - }, - { - "epoch": 0.02127659574468085, - "grad_norm": 12.297484397888184, - "learning_rate": 6.818181818181818e-07, - "loss": 1.048936367034912, - "mean_token_accuracy": 0.7175670862197876, - "num_tokens": 242503.0, - "step": 28 - }, - { - "epoch": 0.022036474164133738, - "grad_norm": 7.437953472137451, - "learning_rate": 7.070707070707071e-07, - "loss": 0.8308826684951782, - "mean_token_accuracy": 0.7415335774421692, - "num_tokens": 250842.0, - "step": 29 - }, - { - "epoch": 0.022796352583586626, - "grad_norm": 6.134475231170654, - "learning_rate": 7.323232323232324e-07, - "loss": 0.647913932800293, - "mean_token_accuracy": 0.8124054670333862, - "num_tokens": 267453.0, - "step": 30 - }, - { - "epoch": 0.023556231003039513, - "grad_norm": 6.678966045379639, - "learning_rate": 7.575757575757576e-07, - "loss": 0.7052810192108154, - "mean_token_accuracy": 0.7908754348754883, - "num_tokens": 284416.0, - "step": 31 - }, - { - "epoch": 0.0243161094224924, - "grad_norm": 7.42232084274292, - "learning_rate": 7.82828282828283e-07, - "loss": 1.022383213043213, - "mean_token_accuracy": 0.7053230404853821, - "num_tokens": 292073.0, - "step": 32 - }, - { - "epoch": 0.02507598784194529, - "grad_norm": 6.463219165802002, - "learning_rate": 8.080808080808082e-07, - "loss": 0.7603012323379517, - "mean_token_accuracy": 0.7728140354156494, - "num_tokens": 298550.0, - "step": 33 - }, - { - "epoch": 0.025835866261398176, - "grad_norm": 5.668411731719971, - "learning_rate": 8.333333333333333e-07, - "loss": 0.7707852721214294, - "mean_token_accuracy": 0.7827773094177246, - "num_tokens": 306683.0, - "step": 34 - }, - { - "epoch": 0.026595744680851064, - "grad_norm": 4.984964847564697, - "learning_rate": 8.585858585858587e-07, - "loss": 0.6317349672317505, - "mean_token_accuracy": 0.8106861114501953, - "num_tokens": 318842.0, - "step": 35 - }, - { - "epoch": 0.02735562310030395, - "grad_norm": 4.421732425689697, - "learning_rate": 8.838383838383839e-07, - "loss": 0.6228617429733276, - "mean_token_accuracy": 0.8023355603218079, - "num_tokens": 329850.0, - "step": 36 - }, - { - "epoch": 0.02811550151975684, - "grad_norm": 5.970808029174805, - "learning_rate": 9.090909090909091e-07, - "loss": 0.8443238139152527, - "mean_token_accuracy": 0.7462409734725952, - "num_tokens": 335844.0, - "step": 37 - }, - { - "epoch": 0.028875379939209727, - "grad_norm": 4.5389084815979, - "learning_rate": 9.343434343434345e-07, - "loss": 0.6976436376571655, - "mean_token_accuracy": 0.790410041809082, - "num_tokens": 348768.0, - "step": 38 - }, - { - "epoch": 0.029635258358662615, - "grad_norm": 4.116631507873535, - "learning_rate": 9.595959595959596e-07, - "loss": 0.6698519587516785, - "mean_token_accuracy": 0.7818127870559692, - "num_tokens": 355460.0, - "step": 39 - }, - { - "epoch": 0.030395136778115502, - "grad_norm": 3.3714773654937744, - "learning_rate": 9.84848484848485e-07, - "loss": 0.5723201036453247, - "mean_token_accuracy": 0.8100086450576782, - "num_tokens": 368507.0, - "step": 40 - }, - { - "epoch": 0.03115501519756839, - "grad_norm": 4.4438347816467285, - "learning_rate": 1.01010101010101e-06, - "loss": 0.7508786916732788, - "mean_token_accuracy": 0.7711942791938782, - "num_tokens": 376467.0, - "step": 41 - }, - { - "epoch": 0.031914893617021274, - "grad_norm": 5.609974384307861, - "learning_rate": 1.0353535353535354e-06, - "loss": 0.566256046295166, - "mean_token_accuracy": 0.8319284319877625, - "num_tokens": 381399.0, - "step": 42 - }, - { - "epoch": 0.03267477203647416, - "grad_norm": 5.124386787414551, - "learning_rate": 1.0606060606060608e-06, - "loss": 0.8151067495346069, - "mean_token_accuracy": 0.7537785768508911, - "num_tokens": 387389.0, - "step": 43 - }, - { - "epoch": 0.03343465045592705, - "grad_norm": 3.6318116188049316, - "learning_rate": 1.085858585858586e-06, - "loss": 0.5989949107170105, - "mean_token_accuracy": 0.8129256963729858, - "num_tokens": 395302.0, - "step": 44 - }, - { - "epoch": 0.03419452887537994, - "grad_norm": 2.694424629211426, - "learning_rate": 1.111111111111111e-06, - "loss": 0.5831396579742432, - "mean_token_accuracy": 0.8056820631027222, - "num_tokens": 409920.0, - "step": 45 - }, - { - "epoch": 0.034954407294832825, - "grad_norm": 2.2949178218841553, - "learning_rate": 1.1363636363636364e-06, - "loss": 0.472550630569458, - "mean_token_accuracy": 0.8343006372451782, - "num_tokens": 428323.0, - "step": 46 - }, - { - "epoch": 0.03571428571428571, - "grad_norm": 3.3930575847625732, - "learning_rate": 1.1616161616161617e-06, - "loss": 0.6246505379676819, - "mean_token_accuracy": 0.783149003982544, - "num_tokens": 435889.0, - "step": 47 - }, - { - "epoch": 0.0364741641337386, - "grad_norm": 3.692598819732666, - "learning_rate": 1.186868686868687e-06, - "loss": 0.46132946014404297, - "mean_token_accuracy": 0.8583089113235474, - "num_tokens": 441192.0, - "step": 48 - }, - { - "epoch": 0.03723404255319149, - "grad_norm": 6.571533203125, - "learning_rate": 1.2121212121212122e-06, - "loss": 0.9351121783256531, - "mean_token_accuracy": 0.7580878734588623, - "num_tokens": 444277.0, - "step": 49 - }, - { - "epoch": 0.037993920972644375, - "grad_norm": 5.029570579528809, - "learning_rate": 1.2373737373737375e-06, - "loss": 0.6921554803848267, - "mean_token_accuracy": 0.8131166100502014, - "num_tokens": 447646.0, - "step": 50 - }, - { - "epoch": 0.03875379939209726, - "grad_norm": 2.9174208641052246, - "learning_rate": 1.2626262626262629e-06, - "loss": 0.591706395149231, - "mean_token_accuracy": 0.8108617067337036, - "num_tokens": 461397.0, - "step": 51 - }, - { - "epoch": 0.03951367781155015, - "grad_norm": 4.315536022186279, - "learning_rate": 1.287878787878788e-06, - "loss": 0.6986310482025146, - "mean_token_accuracy": 0.7710754871368408, - "num_tokens": 472047.0, - "step": 52 - }, - { - "epoch": 0.04027355623100304, - "grad_norm": 2.6216275691986084, - "learning_rate": 1.3131313131313134e-06, - "loss": 0.5553690791130066, - "mean_token_accuracy": 0.8167896866798401, - "num_tokens": 482795.0, - "step": 53 - }, - { - "epoch": 0.041033434650455926, - "grad_norm": 3.0562477111816406, - "learning_rate": 1.3383838383838385e-06, - "loss": 0.6909202337265015, - "mean_token_accuracy": 0.7859863638877869, - "num_tokens": 494818.0, - "step": 54 - }, - { - "epoch": 0.04179331306990881, - "grad_norm": 2.1420412063598633, - "learning_rate": 1.3636363636363636e-06, - "loss": 0.5415265560150146, - "mean_token_accuracy": 0.818886399269104, - "num_tokens": 513695.0, - "step": 55 - }, - { - "epoch": 0.0425531914893617, - "grad_norm": 2.9610488414764404, - "learning_rate": 1.3888888888888892e-06, - "loss": 0.6602212190628052, - "mean_token_accuracy": 0.7830734252929688, - "num_tokens": 523784.0, - "step": 56 - }, - { - "epoch": 0.04331306990881459, - "grad_norm": 2.511972665786743, - "learning_rate": 1.4141414141414143e-06, - "loss": 0.5717809796333313, - "mean_token_accuracy": 0.8053616285324097, - "num_tokens": 546308.0, - "step": 57 - }, - { - "epoch": 0.044072948328267476, - "grad_norm": 3.52642822265625, - "learning_rate": 1.4393939393939396e-06, - "loss": 0.6242594718933105, - "mean_token_accuracy": 0.8162082433700562, - "num_tokens": 552019.0, - "step": 58 - }, - { - "epoch": 0.044832826747720364, - "grad_norm": 3.02362322807312, - "learning_rate": 1.4646464646464648e-06, - "loss": 0.6634255647659302, - "mean_token_accuracy": 0.7682032585144043, - "num_tokens": 560009.0, - "step": 59 - }, - { - "epoch": 0.04559270516717325, - "grad_norm": 2.3910107612609863, - "learning_rate": 1.48989898989899e-06, - "loss": 0.5519146919250488, - "mean_token_accuracy": 0.8270269632339478, - "num_tokens": 571005.0, - "step": 60 - }, - { - "epoch": 0.04635258358662614, - "grad_norm": 4.28154993057251, - "learning_rate": 1.5151515151515152e-06, - "loss": 0.7437789440155029, - "mean_token_accuracy": 0.7782418131828308, - "num_tokens": 574950.0, - "step": 61 - }, - { - "epoch": 0.04711246200607903, - "grad_norm": 3.4078686237335205, - "learning_rate": 1.5404040404040404e-06, - "loss": 0.6345915198326111, - "mean_token_accuracy": 0.7903392314910889, - "num_tokens": 581657.0, - "step": 62 - }, - { - "epoch": 0.047872340425531915, - "grad_norm": 2.6834158897399902, - "learning_rate": 1.565656565656566e-06, - "loss": 0.5981127023696899, - "mean_token_accuracy": 0.7911489605903625, - "num_tokens": 591267.0, - "step": 63 - }, - { - "epoch": 0.0486322188449848, - "grad_norm": 2.1054461002349854, - "learning_rate": 1.590909090909091e-06, - "loss": 0.5523523688316345, - "mean_token_accuracy": 0.8194501399993896, - "num_tokens": 606787.0, - "step": 64 - }, - { - "epoch": 0.04939209726443769, - "grad_norm": 3.322596788406372, - "learning_rate": 1.6161616161616164e-06, - "loss": 0.48417025804519653, - "mean_token_accuracy": 0.8293706178665161, - "num_tokens": 611068.0, - "step": 65 - }, - { - "epoch": 0.05015197568389058, - "grad_norm": 2.302450180053711, - "learning_rate": 1.6414141414141415e-06, - "loss": 0.6498389840126038, - "mean_token_accuracy": 0.7728497385978699, - "num_tokens": 624452.0, - "step": 66 - }, - { - "epoch": 0.050911854103343465, - "grad_norm": 2.680191993713379, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.6347037553787231, - "mean_token_accuracy": 0.8108306527137756, - "num_tokens": 638049.0, - "step": 67 - }, - { - "epoch": 0.05167173252279635, - "grad_norm": 3.0297021865844727, - "learning_rate": 1.6919191919191922e-06, - "loss": 0.5344363451004028, - "mean_token_accuracy": 0.8113535046577454, - "num_tokens": 643892.0, - "step": 68 - }, - { - "epoch": 0.05243161094224924, - "grad_norm": 2.9283676147460938, - "learning_rate": 1.7171717171717173e-06, - "loss": 0.6999260187149048, - "mean_token_accuracy": 0.7782022356987, - "num_tokens": 654418.0, - "step": 69 - }, - { - "epoch": 0.05319148936170213, - "grad_norm": 3.4098572731018066, - "learning_rate": 1.7424242424242427e-06, - "loss": 0.6508946418762207, - "mean_token_accuracy": 0.7942900657653809, - "num_tokens": 659837.0, - "step": 70 - }, - { - "epoch": 0.053951367781155016, - "grad_norm": 2.6756019592285156, - "learning_rate": 1.7676767676767678e-06, - "loss": 0.603486180305481, - "mean_token_accuracy": 0.8015457391738892, - "num_tokens": 668361.0, - "step": 71 - }, - { - "epoch": 0.0547112462006079, - "grad_norm": 2.2630293369293213, - "learning_rate": 1.792929292929293e-06, - "loss": 0.6608274579048157, - "mean_token_accuracy": 0.7753809690475464, - "num_tokens": 679025.0, - "step": 72 - }, - { - "epoch": 0.05547112462006079, - "grad_norm": 2.123962879180908, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.4525482654571533, - "mean_token_accuracy": 0.8425612449645996, - "num_tokens": 688574.0, - "step": 73 - }, - { - "epoch": 0.05623100303951368, - "grad_norm": 7.90519905090332, - "learning_rate": 1.8434343434343434e-06, - "loss": 0.6507195830345154, - "mean_token_accuracy": 0.7714964151382446, - "num_tokens": 694534.0, - "step": 74 - }, - { - "epoch": 0.056990881458966566, - "grad_norm": 2.372203826904297, - "learning_rate": 1.868686868686869e-06, - "loss": 0.4458143413066864, - "mean_token_accuracy": 0.7991449236869812, - "num_tokens": 703114.0, - "step": 75 - }, - { - "epoch": 0.057750759878419454, - "grad_norm": 2.918677568435669, - "learning_rate": 1.8939393939393941e-06, - "loss": 0.5614339113235474, - "mean_token_accuracy": 0.8211464881896973, - "num_tokens": 709038.0, - "step": 76 - }, - { - "epoch": 0.05851063829787234, - "grad_norm": 1.6106709241867065, - "learning_rate": 1.9191919191919192e-06, - "loss": 0.5802098512649536, - "mean_token_accuracy": 0.8055065870285034, - "num_tokens": 730482.0, - "step": 77 - }, - { - "epoch": 0.05927051671732523, - "grad_norm": 2.8069989681243896, - "learning_rate": 1.944444444444445e-06, - "loss": 0.5709059238433838, - "mean_token_accuracy": 0.8024872541427612, - "num_tokens": 751817.0, - "step": 78 - }, - { - "epoch": 0.06003039513677812, - "grad_norm": 2.641667127609253, - "learning_rate": 1.96969696969697e-06, - "loss": 0.6480152606964111, - "mean_token_accuracy": 0.7912271618843079, - "num_tokens": 759236.0, - "step": 79 - }, - { - "epoch": 0.060790273556231005, - "grad_norm": 2.6034350395202637, - "learning_rate": 1.994949494949495e-06, - "loss": 0.5535176396369934, - "mean_token_accuracy": 0.7980542778968811, - "num_tokens": 766496.0, - "step": 80 - }, - { - "epoch": 0.06155015197568389, - "grad_norm": 1.7095069885253906, - "learning_rate": 2.02020202020202e-06, - "loss": 0.4545496106147766, - "mean_token_accuracy": 0.8229660391807556, - "num_tokens": 780124.0, - "step": 81 - }, - { - "epoch": 0.06231003039513678, - "grad_norm": 3.788830518722534, - "learning_rate": 2.0454545454545457e-06, - "loss": 0.6679391264915466, - "mean_token_accuracy": 0.7942397594451904, - "num_tokens": 784555.0, - "step": 82 - }, - { - "epoch": 0.06306990881458967, - "grad_norm": 2.009831666946411, - "learning_rate": 2.070707070707071e-06, - "loss": 0.5067101120948792, - "mean_token_accuracy": 0.8276634216308594, - "num_tokens": 797459.0, - "step": 83 - }, - { - "epoch": 0.06382978723404255, - "grad_norm": 2.201627731323242, - "learning_rate": 2.095959595959596e-06, - "loss": 0.5012127161026001, - "mean_token_accuracy": 0.8432504534721375, - "num_tokens": 810817.0, - "step": 84 - }, - { - "epoch": 0.06458966565349544, - "grad_norm": 2.492568016052246, - "learning_rate": 2.1212121212121216e-06, - "loss": 0.6142797470092773, - "mean_token_accuracy": 0.8338661193847656, - "num_tokens": 818191.0, - "step": 85 - }, - { - "epoch": 0.06534954407294832, - "grad_norm": 2.8360862731933594, - "learning_rate": 2.1464646464646467e-06, - "loss": 0.5569300651550293, - "mean_token_accuracy": 0.8121030330657959, - "num_tokens": 825325.0, - "step": 86 - }, - { - "epoch": 0.06610942249240122, - "grad_norm": 2.407548427581787, - "learning_rate": 2.171717171717172e-06, - "loss": 0.6442930102348328, - "mean_token_accuracy": 0.792514443397522, - "num_tokens": 834439.0, - "step": 87 - }, - { - "epoch": 0.0668693009118541, - "grad_norm": 2.340728759765625, - "learning_rate": 2.196969696969697e-06, - "loss": 0.6494365930557251, - "mean_token_accuracy": 0.7746615409851074, - "num_tokens": 843078.0, - "step": 88 - }, - { - "epoch": 0.067629179331307, - "grad_norm": 1.7703697681427002, - "learning_rate": 2.222222222222222e-06, - "loss": 0.598991870880127, - "mean_token_accuracy": 0.7992157340049744, - "num_tokens": 860171.0, - "step": 89 - }, - { - "epoch": 0.06838905775075987, - "grad_norm": 2.5779271125793457, - "learning_rate": 2.2474747474747476e-06, - "loss": 0.5693082809448242, - "mean_token_accuracy": 0.8093700408935547, - "num_tokens": 866669.0, - "step": 90 - }, - { - "epoch": 0.06914893617021277, - "grad_norm": 2.014092206954956, - "learning_rate": 2.2727272727272728e-06, - "loss": 0.5346695780754089, - "mean_token_accuracy": 0.8165590763092041, - "num_tokens": 876698.0, - "step": 91 - }, - { - "epoch": 0.06990881458966565, - "grad_norm": 1.7555919885635376, - "learning_rate": 2.2979797979797983e-06, - "loss": 0.5321458578109741, - "mean_token_accuracy": 0.8166656494140625, - "num_tokens": 889488.0, - "step": 92 - }, - { - "epoch": 0.07066869300911854, - "grad_norm": 1.8631824254989624, - "learning_rate": 2.3232323232323234e-06, - "loss": 0.5246532559394836, - "mean_token_accuracy": 0.8088107705116272, - "num_tokens": 901322.0, - "step": 93 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 3.2332139015197754, - "learning_rate": 2.348484848484849e-06, - "loss": 0.5141711235046387, - "mean_token_accuracy": 0.8382217884063721, - "num_tokens": 905792.0, - "step": 94 - }, - { - "epoch": 0.07218844984802432, - "grad_norm": 1.7806555032730103, - "learning_rate": 2.373737373737374e-06, - "loss": 0.5233149528503418, - "mean_token_accuracy": 0.8101529479026794, - "num_tokens": 917320.0, - "step": 95 - }, - { - "epoch": 0.0729483282674772, - "grad_norm": 1.8169859647750854, - "learning_rate": 2.3989898989898993e-06, - "loss": 0.578881561756134, - "mean_token_accuracy": 0.8044873476028442, - "num_tokens": 931062.0, - "step": 96 - }, - { - "epoch": 0.0737082066869301, - "grad_norm": 4.677402496337891, - "learning_rate": 2.4242424242424244e-06, - "loss": 0.7842556238174438, - "mean_token_accuracy": 0.7579764127731323, - "num_tokens": 934712.0, - "step": 97 - }, - { - "epoch": 0.07446808510638298, - "grad_norm": 2.6987264156341553, - "learning_rate": 2.4494949494949495e-06, - "loss": 0.5669287443161011, - "mean_token_accuracy": 0.8186933994293213, - "num_tokens": 941058.0, - "step": 98 - }, - { - "epoch": 0.07522796352583587, - "grad_norm": 1.6906023025512695, - "learning_rate": 2.474747474747475e-06, - "loss": 0.4976363778114319, - "mean_token_accuracy": 0.8198553323745728, - "num_tokens": 956509.0, - "step": 99 - }, - { - "epoch": 0.07598784194528875, - "grad_norm": 2.7256152629852295, - "learning_rate": 2.5e-06, - "loss": 0.7138420343399048, - "mean_token_accuracy": 0.7752805948257446, - "num_tokens": 963920.0, - "step": 100 - }, - { - "epoch": 0.07674772036474165, - "grad_norm": 2.174870491027832, - "learning_rate": 2.5252525252525258e-06, - "loss": 0.6733541488647461, - "mean_token_accuracy": 0.7745175361633301, - "num_tokens": 975268.0, - "step": 101 - }, - { - "epoch": 0.07750759878419453, - "grad_norm": 1.5587213039398193, - "learning_rate": 2.5505050505050505e-06, - "loss": 0.44223445653915405, - "mean_token_accuracy": 0.8278359174728394, - "num_tokens": 991837.0, - "step": 102 - }, - { - "epoch": 0.07826747720364742, - "grad_norm": 2.181840658187866, - "learning_rate": 2.575757575757576e-06, - "loss": 0.625128448009491, - "mean_token_accuracy": 0.7941786050796509, - "num_tokens": 1004325.0, - "step": 103 - }, - { - "epoch": 0.0790273556231003, - "grad_norm": 1.4986687898635864, - "learning_rate": 2.601010101010101e-06, - "loss": 0.39262527227401733, - "mean_token_accuracy": 0.8412648439407349, - "num_tokens": 1018331.0, - "step": 104 - }, - { - "epoch": 0.0797872340425532, - "grad_norm": 2.3416061401367188, - "learning_rate": 2.6262626262626267e-06, - "loss": 0.5495132803916931, - "mean_token_accuracy": 0.8193322420120239, - "num_tokens": 1026090.0, - "step": 105 - }, - { - "epoch": 0.08054711246200608, - "grad_norm": 3.8168859481811523, - "learning_rate": 2.6515151515151514e-06, - "loss": 0.4898706376552582, - "mean_token_accuracy": 0.8467956185340881, - "num_tokens": 1029955.0, - "step": 106 - }, - { - "epoch": 0.08130699088145897, - "grad_norm": 4.113908767700195, - "learning_rate": 2.676767676767677e-06, - "loss": 0.6189584732055664, - "mean_token_accuracy": 0.8019394278526306, - "num_tokens": 1033598.0, - "step": 107 - }, - { - "epoch": 0.08206686930091185, - "grad_norm": 2.50003981590271, - "learning_rate": 2.7020202020202025e-06, - "loss": 0.6479471921920776, - "mean_token_accuracy": 0.7790026664733887, - "num_tokens": 1042533.0, - "step": 108 - }, - { - "epoch": 0.08282674772036475, - "grad_norm": 1.408934473991394, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.3909248113632202, - "mean_token_accuracy": 0.8477586507797241, - "num_tokens": 1061755.0, - "step": 109 - }, - { - "epoch": 0.08358662613981763, - "grad_norm": 3.360633611679077, - "learning_rate": 2.7525252525252528e-06, - "loss": 0.6952459812164307, - "mean_token_accuracy": 0.777535080909729, - "num_tokens": 1067316.0, - "step": 110 - }, - { - "epoch": 0.08434650455927052, - "grad_norm": 1.8631696701049805, - "learning_rate": 2.7777777777777783e-06, - "loss": 0.5420593023300171, - "mean_token_accuracy": 0.8157662749290466, - "num_tokens": 1079930.0, - "step": 111 - }, - { - "epoch": 0.0851063829787234, - "grad_norm": 2.4308314323425293, - "learning_rate": 2.803030303030303e-06, - "loss": 0.5863882303237915, - "mean_token_accuracy": 0.8206346035003662, - "num_tokens": 1088069.0, - "step": 112 - }, - { - "epoch": 0.0858662613981763, - "grad_norm": 2.922808885574341, - "learning_rate": 2.8282828282828286e-06, - "loss": 0.5217319130897522, - "mean_token_accuracy": 0.8253234028816223, - "num_tokens": 1093607.0, - "step": 113 - }, - { - "epoch": 0.08662613981762918, - "grad_norm": 2.3596107959747314, - "learning_rate": 2.8535353535353537e-06, - "loss": 0.5070714950561523, - "mean_token_accuracy": 0.8258323669433594, - "num_tokens": 1100405.0, - "step": 114 - }, - { - "epoch": 0.08738601823708207, - "grad_norm": 3.0853066444396973, - "learning_rate": 2.8787878787878793e-06, - "loss": 0.591964840888977, - "mean_token_accuracy": 0.8047322630882263, - "num_tokens": 1107535.0, - "step": 115 - }, - { - "epoch": 0.08814589665653495, - "grad_norm": 1.9251092672348022, - "learning_rate": 2.904040404040404e-06, - "loss": 0.5226191878318787, - "mean_token_accuracy": 0.8022720217704773, - "num_tokens": 1118716.0, - "step": 116 - }, - { - "epoch": 0.08890577507598785, - "grad_norm": 1.9692988395690918, - "learning_rate": 2.9292929292929295e-06, - "loss": 0.5462069511413574, - "mean_token_accuracy": 0.8157015442848206, - "num_tokens": 1131917.0, - "step": 117 - }, - { - "epoch": 0.08966565349544073, - "grad_norm": 1.4738909006118774, - "learning_rate": 2.954545454545455e-06, - "loss": 0.4564219117164612, - "mean_token_accuracy": 0.849632978439331, - "num_tokens": 1148534.0, - "step": 118 - }, - { - "epoch": 0.09042553191489362, - "grad_norm": 2.72646164894104, - "learning_rate": 2.97979797979798e-06, - "loss": 0.6654808521270752, - "mean_token_accuracy": 0.7752684354782104, - "num_tokens": 1155438.0, - "step": 119 - }, - { - "epoch": 0.0911854103343465, - "grad_norm": 2.7843852043151855, - "learning_rate": 3.0050505050505054e-06, - "loss": 0.5354680418968201, - "mean_token_accuracy": 0.8196378946304321, - "num_tokens": 1161815.0, - "step": 120 - }, - { - "epoch": 0.0919452887537994, - "grad_norm": 2.8052573204040527, - "learning_rate": 3.0303030303030305e-06, - "loss": 0.6366757154464722, - "mean_token_accuracy": 0.7967483997344971, - "num_tokens": 1168295.0, - "step": 121 - }, - { - "epoch": 0.09270516717325228, - "grad_norm": 2.7462735176086426, - "learning_rate": 3.055555555555556e-06, - "loss": 0.59470534324646, - "mean_token_accuracy": 0.8023771047592163, - "num_tokens": 1174502.0, - "step": 122 - }, - { - "epoch": 0.09346504559270517, - "grad_norm": 2.2743821144104004, - "learning_rate": 3.0808080808080807e-06, - "loss": 0.5720560550689697, - "mean_token_accuracy": 0.8162771463394165, - "num_tokens": 1183615.0, - "step": 123 - }, - { - "epoch": 0.09422492401215805, - "grad_norm": 1.8669533729553223, - "learning_rate": 3.1060606060606063e-06, - "loss": 0.4655378758907318, - "mean_token_accuracy": 0.8360732793807983, - "num_tokens": 1193761.0, - "step": 124 - }, - { - "epoch": 0.09498480243161095, - "grad_norm": 1.7666901350021362, - "learning_rate": 3.131313131313132e-06, - "loss": 0.5524153709411621, - "mean_token_accuracy": 0.8252713680267334, - "num_tokens": 1207870.0, - "step": 125 - }, - { - "epoch": 0.09574468085106383, - "grad_norm": 2.4720070362091064, - "learning_rate": 3.1565656565656566e-06, - "loss": 0.5003011226654053, - "mean_token_accuracy": 0.8491042852401733, - "num_tokens": 1214603.0, - "step": 126 - }, - { - "epoch": 0.09650455927051672, - "grad_norm": 1.6500422954559326, - "learning_rate": 3.181818181818182e-06, - "loss": 0.5137069225311279, - "mean_token_accuracy": 0.8273531198501587, - "num_tokens": 1228717.0, - "step": 127 - }, - { - "epoch": 0.0972644376899696, - "grad_norm": 3.402543067932129, - "learning_rate": 3.2070707070707072e-06, - "loss": 0.708167552947998, - "mean_token_accuracy": 0.7705385684967041, - "num_tokens": 1234361.0, - "step": 128 - }, - { - "epoch": 0.0980243161094225, - "grad_norm": 2.547285795211792, - "learning_rate": 3.232323232323233e-06, - "loss": 0.6020137071609497, - "mean_token_accuracy": 0.7981340289115906, - "num_tokens": 1244169.0, - "step": 129 - }, - { - "epoch": 0.09878419452887538, - "grad_norm": 2.0578792095184326, - "learning_rate": 3.257575757575758e-06, - "loss": 0.4425000250339508, - "mean_token_accuracy": 0.8567807674407959, - "num_tokens": 1252709.0, - "step": 130 - }, - { - "epoch": 0.09954407294832827, - "grad_norm": 1.672614336013794, - "learning_rate": 3.282828282828283e-06, - "loss": 0.4860966205596924, - "mean_token_accuracy": 0.8393139243125916, - "num_tokens": 1265766.0, - "step": 131 - }, - { - "epoch": 0.10030395136778116, - "grad_norm": 3.2560198307037354, - "learning_rate": 3.3080808080808086e-06, - "loss": 0.624736487865448, - "mean_token_accuracy": 0.7875322699546814, - "num_tokens": 1270779.0, - "step": 132 - }, - { - "epoch": 0.10106382978723404, - "grad_norm": 2.4468185901641846, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.5062227249145508, - "mean_token_accuracy": 0.8217229843139648, - "num_tokens": 1277113.0, - "step": 133 - }, - { - "epoch": 0.10182370820668693, - "grad_norm": 2.6371328830718994, - "learning_rate": 3.358585858585859e-06, - "loss": 0.477113276720047, - "mean_token_accuracy": 0.8605583906173706, - "num_tokens": 1282514.0, - "step": 134 - }, - { - "epoch": 0.10258358662613981, - "grad_norm": 2.48421311378479, - "learning_rate": 3.3838383838383844e-06, - "loss": 0.40855684876441956, - "mean_token_accuracy": 0.864548921585083, - "num_tokens": 1287859.0, - "step": 135 - }, - { - "epoch": 0.1033434650455927, - "grad_norm": 1.993099331855774, - "learning_rate": 3.409090909090909e-06, - "loss": 0.5913145542144775, - "mean_token_accuracy": 0.8248485922813416, - "num_tokens": 1301074.0, - "step": 136 - }, - { - "epoch": 0.10410334346504559, - "grad_norm": 3.5947680473327637, - "learning_rate": 3.4343434343434347e-06, - "loss": 0.5028599500656128, - "mean_token_accuracy": 0.8367215394973755, - "num_tokens": 1305219.0, - "step": 137 - }, - { - "epoch": 0.10486322188449848, - "grad_norm": 2.5778582096099854, - "learning_rate": 3.45959595959596e-06, - "loss": 0.5297672748565674, - "mean_token_accuracy": 0.8232187032699585, - "num_tokens": 1312482.0, - "step": 138 - }, - { - "epoch": 0.10562310030395136, - "grad_norm": 1.8961588144302368, - "learning_rate": 3.4848484848484854e-06, - "loss": 0.39954107999801636, - "mean_token_accuracy": 0.8605833053588867, - "num_tokens": 1323404.0, - "step": 139 - }, - { - "epoch": 0.10638297872340426, - "grad_norm": 1.9687960147857666, - "learning_rate": 3.51010101010101e-06, - "loss": 0.48791587352752686, - "mean_token_accuracy": 0.8200347423553467, - "num_tokens": 1333027.0, - "step": 140 - }, - { - "epoch": 0.10714285714285714, - "grad_norm": 2.520242691040039, - "learning_rate": 3.5353535353535356e-06, - "loss": 0.6106002330780029, - "mean_token_accuracy": 0.790692150592804, - "num_tokens": 1340999.0, - "step": 141 - }, - { - "epoch": 0.10790273556231003, - "grad_norm": 3.751617431640625, - "learning_rate": 3.560606060606061e-06, - "loss": 0.48141729831695557, - "mean_token_accuracy": 0.8421382904052734, - "num_tokens": 1344687.0, - "step": 142 - }, - { - "epoch": 0.10866261398176291, - "grad_norm": 2.7101709842681885, - "learning_rate": 3.585858585858586e-06, - "loss": 0.5375241637229919, - "mean_token_accuracy": 0.8061438202857971, - "num_tokens": 1350192.0, - "step": 143 - }, - { - "epoch": 0.1094224924012158, - "grad_norm": 2.583484411239624, - "learning_rate": 3.6111111111111115e-06, - "loss": 0.6492470502853394, - "mean_token_accuracy": 0.7863001823425293, - "num_tokens": 1358148.0, - "step": 144 - }, - { - "epoch": 0.11018237082066869, - "grad_norm": 1.792561650276184, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.48480600118637085, - "mean_token_accuracy": 0.8358709812164307, - "num_tokens": 1369519.0, - "step": 145 - }, - { - "epoch": 0.11094224924012158, - "grad_norm": 2.6480472087860107, - "learning_rate": 3.661616161616162e-06, - "loss": 0.5268933176994324, - "mean_token_accuracy": 0.8214013576507568, - "num_tokens": 1375862.0, - "step": 146 - }, - { - "epoch": 0.11170212765957446, - "grad_norm": 2.3174469470977783, - "learning_rate": 3.686868686868687e-06, - "loss": 0.42517897486686707, - "mean_token_accuracy": 0.8523461222648621, - "num_tokens": 1381546.0, - "step": 147 - }, - { - "epoch": 0.11246200607902736, - "grad_norm": 3.0090949535369873, - "learning_rate": 3.7121212121212124e-06, - "loss": 0.4042336940765381, - "mean_token_accuracy": 0.8670448064804077, - "num_tokens": 1385896.0, - "step": 148 - }, - { - "epoch": 0.11322188449848024, - "grad_norm": 2.4928104877471924, - "learning_rate": 3.737373737373738e-06, - "loss": 0.6498878598213196, - "mean_token_accuracy": 0.7967068552970886, - "num_tokens": 1394169.0, - "step": 149 - }, - { - "epoch": 0.11398176291793313, - "grad_norm": 1.5984913110733032, - "learning_rate": 3.7626262626262627e-06, - "loss": 0.546096920967102, - "mean_token_accuracy": 0.8035850524902344, - "num_tokens": 1408785.0, - "step": 150 - }, - { - "epoch": 0.11474164133738601, - "grad_norm": 2.3663532733917236, - "learning_rate": 3.7878787878787882e-06, - "loss": 0.6111721992492676, - "mean_token_accuracy": 0.8015355467796326, - "num_tokens": 1417510.0, - "step": 151 - }, - { - "epoch": 0.11550151975683891, - "grad_norm": 2.518932819366455, - "learning_rate": 3.8131313131313138e-06, - "loss": 0.5274964570999146, - "mean_token_accuracy": 0.8155480623245239, - "num_tokens": 1424186.0, - "step": 152 - }, - { - "epoch": 0.11626139817629179, - "grad_norm": 2.14353609085083, - "learning_rate": 3.8383838383838385e-06, - "loss": 0.5283297896385193, - "mean_token_accuracy": 0.8275758028030396, - "num_tokens": 1432630.0, - "step": 153 - }, - { - "epoch": 0.11702127659574468, - "grad_norm": 1.8243604898452759, - "learning_rate": 3.863636363636364e-06, - "loss": 0.41854870319366455, - "mean_token_accuracy": 0.8222295045852661, - "num_tokens": 1442691.0, - "step": 154 - }, - { - "epoch": 0.11778115501519756, - "grad_norm": 2.088212251663208, - "learning_rate": 3.88888888888889e-06, - "loss": 0.6062943339347839, - "mean_token_accuracy": 0.8009427785873413, - "num_tokens": 1456890.0, - "step": 155 - }, - { - "epoch": 0.11854103343465046, - "grad_norm": 1.3469511270523071, - "learning_rate": 3.914141414141415e-06, - "loss": 0.4390433728694916, - "mean_token_accuracy": 0.8436295986175537, - "num_tokens": 1475349.0, - "step": 156 - }, - { - "epoch": 0.11930091185410334, - "grad_norm": 3.247023105621338, - "learning_rate": 3.93939393939394e-06, - "loss": 0.6490433216094971, - "mean_token_accuracy": 0.8037861585617065, - "num_tokens": 1479952.0, - "step": 157 - }, - { - "epoch": 0.12006079027355623, - "grad_norm": 2.6610445976257324, - "learning_rate": 3.964646464646465e-06, - "loss": 0.6221826076507568, - "mean_token_accuracy": 0.7848749160766602, - "num_tokens": 1487306.0, - "step": 158 - }, - { - "epoch": 0.12082066869300911, - "grad_norm": 2.3060810565948486, - "learning_rate": 3.98989898989899e-06, - "loss": 0.5052388310432434, - "mean_token_accuracy": 0.8281195759773254, - "num_tokens": 1495367.0, - "step": 159 - }, - { - "epoch": 0.12158054711246201, - "grad_norm": 2.504448652267456, - "learning_rate": 4.015151515151515e-06, - "loss": 0.5005477666854858, - "mean_token_accuracy": 0.8408058881759644, - "num_tokens": 1502069.0, - "step": 160 - }, - { - "epoch": 0.12234042553191489, - "grad_norm": 3.993938446044922, - "learning_rate": 4.04040404040404e-06, - "loss": 0.5569638013839722, - "mean_token_accuracy": 0.8095242977142334, - "num_tokens": 1510224.0, - "step": 161 - }, - { - "epoch": 0.12310030395136778, - "grad_norm": 2.2287683486938477, - "learning_rate": 4.065656565656566e-06, - "loss": 0.524042546749115, - "mean_token_accuracy": 0.8102203607559204, - "num_tokens": 1518364.0, - "step": 162 - }, - { - "epoch": 0.12386018237082067, - "grad_norm": 1.9531738758087158, - "learning_rate": 4.0909090909090915e-06, - "loss": 0.45794573426246643, - "mean_token_accuracy": 0.8560376167297363, - "num_tokens": 1528097.0, - "step": 163 - }, - { - "epoch": 0.12462006079027356, - "grad_norm": 1.5841206312179565, - "learning_rate": 4.116161616161617e-06, - "loss": 0.5420972108840942, - "mean_token_accuracy": 0.8092726469039917, - "num_tokens": 1544119.0, - "step": 164 - }, - { - "epoch": 0.12537993920972645, - "grad_norm": 1.7536218166351318, - "learning_rate": 4.141414141414142e-06, - "loss": 0.554668664932251, - "mean_token_accuracy": 0.8193825483322144, - "num_tokens": 1559140.0, - "step": 165 - }, - { - "epoch": 0.12613981762917933, - "grad_norm": 3.545454740524292, - "learning_rate": 4.166666666666667e-06, - "loss": 0.580947995185852, - "mean_token_accuracy": 0.8286383152008057, - "num_tokens": 1563625.0, - "step": 166 - }, - { - "epoch": 0.12689969604863222, - "grad_norm": 1.6608915328979492, - "learning_rate": 4.191919191919192e-06, - "loss": 0.5523324012756348, - "mean_token_accuracy": 0.8155215978622437, - "num_tokens": 1574945.0, - "step": 167 - }, - { - "epoch": 0.1276595744680851, - "grad_norm": 1.4832708835601807, - "learning_rate": 4.217171717171717e-06, - "loss": 0.5133191347122192, - "mean_token_accuracy": 0.8367571830749512, - "num_tokens": 1595865.0, - "step": 168 - }, - { - "epoch": 0.128419452887538, - "grad_norm": 1.7807520627975464, - "learning_rate": 4.242424242424243e-06, - "loss": 0.5131410360336304, - "mean_token_accuracy": 0.8129367232322693, - "num_tokens": 1608723.0, - "step": 169 - }, - { - "epoch": 0.12917933130699089, - "grad_norm": 2.707569122314453, - "learning_rate": 4.267676767676767e-06, - "loss": 0.6129013299942017, - "mean_token_accuracy": 0.7926048040390015, - "num_tokens": 1616136.0, - "step": 170 - }, - { - "epoch": 0.12993920972644377, - "grad_norm": 2.5831644535064697, - "learning_rate": 4.292929292929293e-06, - "loss": 0.6264227628707886, - "mean_token_accuracy": 0.8074911236763, - "num_tokens": 1624228.0, - "step": 171 - }, - { - "epoch": 0.13069908814589665, - "grad_norm": 3.1124250888824463, - "learning_rate": 4.3181818181818185e-06, - "loss": 0.41763827204704285, - "mean_token_accuracy": 0.8565453290939331, - "num_tokens": 1628098.0, - "step": 172 - }, - { - "epoch": 0.13145896656534956, - "grad_norm": 2.3214211463928223, - "learning_rate": 4.343434343434344e-06, - "loss": 0.421974778175354, - "mean_token_accuracy": 0.8391546010971069, - "num_tokens": 1634950.0, - "step": 173 - }, - { - "epoch": 0.13221884498480244, - "grad_norm": 2.1010327339172363, - "learning_rate": 4.368686868686869e-06, - "loss": 0.5307331681251526, - "mean_token_accuracy": 0.8139588236808777, - "num_tokens": 1644132.0, - "step": 174 - }, - { - "epoch": 0.13297872340425532, - "grad_norm": 2.533612012863159, - "learning_rate": 4.393939393939394e-06, - "loss": 0.5626664161682129, - "mean_token_accuracy": 0.8029808402061462, - "num_tokens": 1651637.0, - "step": 175 - }, - { - "epoch": 0.1337386018237082, - "grad_norm": 1.669508457183838, - "learning_rate": 4.41919191919192e-06, - "loss": 0.5351508259773254, - "mean_token_accuracy": 0.8281655311584473, - "num_tokens": 1666776.0, - "step": 176 - }, - { - "epoch": 0.1344984802431611, - "grad_norm": 1.7579659223556519, - "learning_rate": 4.444444444444444e-06, - "loss": 0.5235031247138977, - "mean_token_accuracy": 0.8143284320831299, - "num_tokens": 1679241.0, - "step": 177 - }, - { - "epoch": 0.135258358662614, - "grad_norm": 3.123563528060913, - "learning_rate": 4.46969696969697e-06, - "loss": 0.43051332235336304, - "mean_token_accuracy": 0.8518186211585999, - "num_tokens": 1683317.0, - "step": 178 - }, - { - "epoch": 0.13601823708206687, - "grad_norm": 2.2411575317382812, - "learning_rate": 4.494949494949495e-06, - "loss": 0.5471380949020386, - "mean_token_accuracy": 0.8267596960067749, - "num_tokens": 1691366.0, - "step": 179 - }, - { - "epoch": 0.13677811550151975, - "grad_norm": 2.621973991394043, - "learning_rate": 4.520202020202021e-06, - "loss": 0.5685839653015137, - "mean_token_accuracy": 0.8260642290115356, - "num_tokens": 1698148.0, - "step": 180 - }, - { - "epoch": 0.13753799392097266, - "grad_norm": 2.1553852558135986, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.5703883171081543, - "mean_token_accuracy": 0.8219090700149536, - "num_tokens": 1707225.0, - "step": 181 - }, - { - "epoch": 0.13829787234042554, - "grad_norm": 5.1767897605896, - "learning_rate": 4.5707070707070715e-06, - "loss": 0.32704639434814453, - "mean_token_accuracy": 0.8754568099975586, - "num_tokens": 1712748.0, - "step": 182 - }, - { - "epoch": 0.13905775075987842, - "grad_norm": 2.609168291091919, - "learning_rate": 4.595959595959597e-06, - "loss": 0.5939987301826477, - "mean_token_accuracy": 0.8034975528717041, - "num_tokens": 1719932.0, - "step": 183 - }, - { - "epoch": 0.1398176291793313, - "grad_norm": 2.2059099674224854, - "learning_rate": 4.621212121212122e-06, - "loss": 0.5310720205307007, - "mean_token_accuracy": 0.8177368640899658, - "num_tokens": 1727640.0, - "step": 184 - }, - { - "epoch": 0.1405775075987842, - "grad_norm": 2.6367759704589844, - "learning_rate": 4.646464646464647e-06, - "loss": 0.522086501121521, - "mean_token_accuracy": 0.826233983039856, - "num_tokens": 1733609.0, - "step": 185 - }, - { - "epoch": 0.1413373860182371, - "grad_norm": 3.326732873916626, - "learning_rate": 4.671717171717172e-06, - "loss": 0.4127829074859619, - "mean_token_accuracy": 0.8551101684570312, - "num_tokens": 1737256.0, - "step": 186 - }, - { - "epoch": 0.14209726443768997, - "grad_norm": 1.828412413597107, - "learning_rate": 4.696969696969698e-06, - "loss": 0.5444269180297852, - "mean_token_accuracy": 0.8350818157196045, - "num_tokens": 1750196.0, - "step": 187 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 3.209203004837036, - "learning_rate": 4.722222222222222e-06, - "loss": 0.5087994933128357, - "mean_token_accuracy": 0.8349015712738037, - "num_tokens": 1754836.0, - "step": 188 - }, - { - "epoch": 0.14361702127659576, - "grad_norm": 1.7339166402816772, - "learning_rate": 4.747474747474748e-06, - "loss": 0.5151352286338806, - "mean_token_accuracy": 0.8321266174316406, - "num_tokens": 1766015.0, - "step": 189 - }, - { - "epoch": 0.14437689969604864, - "grad_norm": 2.699068069458008, - "learning_rate": 4.772727272727273e-06, - "loss": 0.4406203031539917, - "mean_token_accuracy": 0.8425000905990601, - "num_tokens": 1771684.0, - "step": 190 - }, - { - "epoch": 0.14513677811550152, - "grad_norm": 2.8117282390594482, - "learning_rate": 4.7979797979797985e-06, - "loss": 0.40428489446640015, - "mean_token_accuracy": 0.8654326796531677, - "num_tokens": 1776301.0, - "step": 191 - }, - { - "epoch": 0.1458966565349544, - "grad_norm": 2.9204647541046143, - "learning_rate": 4.823232323232324e-06, - "loss": 0.4191770553588867, - "mean_token_accuracy": 0.8574687242507935, - "num_tokens": 1781678.0, - "step": 192 - }, - { - "epoch": 0.1466565349544073, - "grad_norm": 2.1648988723754883, - "learning_rate": 4.848484848484849e-06, - "loss": 0.5839012861251831, - "mean_token_accuracy": 0.8053664565086365, - "num_tokens": 1792516.0, - "step": 193 - }, - { - "epoch": 0.1474164133738602, - "grad_norm": 2.3221631050109863, - "learning_rate": 4.873737373737374e-06, - "loss": 0.5037894248962402, - "mean_token_accuracy": 0.8427227139472961, - "num_tokens": 1800192.0, - "step": 194 - }, - { - "epoch": 0.14817629179331307, - "grad_norm": 2.4536430835723877, - "learning_rate": 4.898989898989899e-06, - "loss": 0.42326074838638306, - "mean_token_accuracy": 0.8510633111000061, - "num_tokens": 1806159.0, - "step": 195 - }, - { - "epoch": 0.14893617021276595, - "grad_norm": 2.4875805377960205, - "learning_rate": 4.924242424242425e-06, - "loss": 0.539531409740448, - "mean_token_accuracy": 0.8060250282287598, - "num_tokens": 1813392.0, - "step": 196 - }, - { - "epoch": 0.14969604863221886, - "grad_norm": 2.1664798259735107, - "learning_rate": 4.94949494949495e-06, - "loss": 0.42502015829086304, - "mean_token_accuracy": 0.8503251075744629, - "num_tokens": 1821424.0, - "step": 197 - }, - { - "epoch": 0.15045592705167174, - "grad_norm": 2.568808078765869, - "learning_rate": 4.974747474747475e-06, - "loss": 0.5025098323822021, - "mean_token_accuracy": 0.8182311058044434, - "num_tokens": 1827225.0, - "step": 198 - }, - { - "epoch": 0.15121580547112462, - "grad_norm": 1.9116802215576172, - "learning_rate": 5e-06, - "loss": 0.4907258450984955, - "mean_token_accuracy": 0.8310189843177795, - "num_tokens": 1836297.0, - "step": 199 - }, - { - "epoch": 0.1519756838905775, - "grad_norm": 3.150765895843506, - "learning_rate": 4.999999122701883e-06, - "loss": 0.390616774559021, - "mean_token_accuracy": 0.8626647591590881, - "num_tokens": 1839984.0, - "step": 200 - }, - { - "epoch": 0.15273556231003038, - "grad_norm": 3.2229044437408447, - "learning_rate": 4.999996490808146e-06, - "loss": 0.48009657859802246, - "mean_token_accuracy": 0.825214147567749, - "num_tokens": 1844610.0, - "step": 201 - }, - { - "epoch": 0.1534954407294833, - "grad_norm": 1.4473289251327515, - "learning_rate": 4.9999921043206356e-06, - "loss": 0.40135183930397034, - "mean_token_accuracy": 0.8537827730178833, - "num_tokens": 1859573.0, - "step": 202 - }, - { - "epoch": 0.15425531914893617, - "grad_norm": 4.072319507598877, - "learning_rate": 4.999985963242432e-06, - "loss": 0.6158689260482788, - "mean_token_accuracy": 0.8075432777404785, - "num_tokens": 1863147.0, - "step": 203 - }, - { - "epoch": 0.15501519756838905, - "grad_norm": 3.15741229057312, - "learning_rate": 4.999978067577844e-06, - "loss": 0.4603108763694763, - "mean_token_accuracy": 0.8418779373168945, - "num_tokens": 1867201.0, - "step": 204 - }, - { - "epoch": 0.15577507598784193, - "grad_norm": 2.1925418376922607, - "learning_rate": 4.999968417332415e-06, - "loss": 0.5552488565444946, - "mean_token_accuracy": 0.8216016292572021, - "num_tokens": 1874837.0, - "step": 205 - }, - { - "epoch": 0.15653495440729484, - "grad_norm": 2.2518117427825928, - "learning_rate": 4.999957012512916e-06, - "loss": 0.4912569522857666, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 1881842.0, - "step": 206 - }, - { - "epoch": 0.15729483282674772, - "grad_norm": 1.8223762512207031, - "learning_rate": 4.999943853127351e-06, - "loss": 0.47709137201309204, - "mean_token_accuracy": 0.8311659097671509, - "num_tokens": 1890805.0, - "step": 207 - }, - { - "epoch": 0.1580547112462006, - "grad_norm": 2.066499948501587, - "learning_rate": 4.999928939184958e-06, - "loss": 0.44794657826423645, - "mean_token_accuracy": 0.8513424396514893, - "num_tokens": 1898264.0, - "step": 208 - }, - { - "epoch": 0.15881458966565348, - "grad_norm": 3.53865909576416, - "learning_rate": 4.999912270696202e-06, - "loss": 0.5978270769119263, - "mean_token_accuracy": 0.8080137968063354, - "num_tokens": 1902435.0, - "step": 209 - }, - { - "epoch": 0.1595744680851064, - "grad_norm": 2.0760679244995117, - "learning_rate": 4.999893847672783e-06, - "loss": 0.5930601358413696, - "mean_token_accuracy": 0.8028650283813477, - "num_tokens": 1912252.0, - "step": 210 - }, - { - "epoch": 0.16033434650455927, - "grad_norm": 2.21551513671875, - "learning_rate": 4.99987367012763e-06, - "loss": 0.6336753964424133, - "mean_token_accuracy": 0.7902286648750305, - "num_tokens": 1922095.0, - "step": 211 - }, - { - "epoch": 0.16109422492401215, - "grad_norm": 1.7654480934143066, - "learning_rate": 4.999851738074904e-06, - "loss": 0.6373403668403625, - "mean_token_accuracy": 0.7802424430847168, - "num_tokens": 1938962.0, - "step": 212 - }, - { - "epoch": 0.16185410334346503, - "grad_norm": 2.852834701538086, - "learning_rate": 4.9998280515300006e-06, - "loss": 0.6418683528900146, - "mean_token_accuracy": 0.7895716428756714, - "num_tokens": 1944668.0, - "step": 213 - }, - { - "epoch": 0.16261398176291794, - "grad_norm": 3.4737212657928467, - "learning_rate": 4.999802610509541e-06, - "loss": 0.6323273181915283, - "mean_token_accuracy": 0.7982614636421204, - "num_tokens": 1949142.0, - "step": 214 - }, - { - "epoch": 0.16337386018237082, - "grad_norm": 3.0802664756774902, - "learning_rate": 4.999775415031381e-06, - "loss": 0.5929068326950073, - "mean_token_accuracy": 0.8112219572067261, - "num_tokens": 1954141.0, - "step": 215 - }, - { - "epoch": 0.1641337386018237, - "grad_norm": 2.9808855056762695, - "learning_rate": 4.999746465114609e-06, - "loss": 0.5556406378746033, - "mean_token_accuracy": 0.8117628693580627, - "num_tokens": 1959406.0, - "step": 216 - }, - { - "epoch": 0.16489361702127658, - "grad_norm": 1.7346166372299194, - "learning_rate": 4.999715760779541e-06, - "loss": 0.5122925043106079, - "mean_token_accuracy": 0.8040724992752075, - "num_tokens": 1971921.0, - "step": 217 - }, - { - "epoch": 0.1656534954407295, - "grad_norm": 1.4183907508850098, - "learning_rate": 4.999683302047729e-06, - "loss": 0.46471893787384033, - "mean_token_accuracy": 0.8381330966949463, - "num_tokens": 1988863.0, - "step": 218 - }, - { - "epoch": 0.16641337386018237, - "grad_norm": 1.6797802448272705, - "learning_rate": 4.999649088941951e-06, - "loss": 0.38348832726478577, - "mean_token_accuracy": 0.8344278931617737, - "num_tokens": 2000003.0, - "step": 219 - }, - { - "epoch": 0.16717325227963525, - "grad_norm": 3.036963939666748, - "learning_rate": 4.999613121486222e-06, - "loss": 0.6062780618667603, - "mean_token_accuracy": 0.8217900991439819, - "num_tokens": 2004813.0, - "step": 220 - }, - { - "epoch": 0.16793313069908813, - "grad_norm": 2.0343217849731445, - "learning_rate": 4.999575399705782e-06, - "loss": 0.5052450895309448, - "mean_token_accuracy": 0.8368623852729797, - "num_tokens": 2013565.0, - "step": 221 - }, - { - "epoch": 0.16869300911854104, - "grad_norm": 2.1162009239196777, - "learning_rate": 4.9995359236271094e-06, - "loss": 0.5169756412506104, - "mean_token_accuracy": 0.8339958190917969, - "num_tokens": 2025763.0, - "step": 222 - }, - { - "epoch": 0.16945288753799392, - "grad_norm": 2.055333375930786, - "learning_rate": 4.9994946932779076e-06, - "loss": 0.6327048540115356, - "mean_token_accuracy": 0.8078711032867432, - "num_tokens": 2037005.0, - "step": 223 - }, - { - "epoch": 0.1702127659574468, - "grad_norm": 3.334620475769043, - "learning_rate": 4.999451708687114e-06, - "loss": 0.5688358545303345, - "mean_token_accuracy": 0.8015589714050293, - "num_tokens": 2041473.0, - "step": 224 - }, - { - "epoch": 0.17097264437689969, - "grad_norm": 2.3734676837921143, - "learning_rate": 4.999406969884897e-06, - "loss": 0.5673821568489075, - "mean_token_accuracy": 0.8054057359695435, - "num_tokens": 2049397.0, - "step": 225 - }, - { - "epoch": 0.1717325227963526, - "grad_norm": 1.807358980178833, - "learning_rate": 4.999360476902656e-06, - "loss": 0.4376158118247986, - "mean_token_accuracy": 0.8456039428710938, - "num_tokens": 2058721.0, - "step": 226 - }, - { - "epoch": 0.17249240121580547, - "grad_norm": 3.231638193130493, - "learning_rate": 4.999312229773022e-06, - "loss": 0.5592809915542603, - "mean_token_accuracy": 0.8170154094696045, - "num_tokens": 2063455.0, - "step": 227 - }, - { - "epoch": 0.17325227963525835, - "grad_norm": 2.2717151641845703, - "learning_rate": 4.999262228529855e-06, - "loss": 0.6144396066665649, - "mean_token_accuracy": 0.7948470115661621, - "num_tokens": 2071686.0, - "step": 228 - }, - { - "epoch": 0.17401215805471124, - "grad_norm": 1.4171342849731445, - "learning_rate": 4.99921047320825e-06, - "loss": 0.43680912256240845, - "mean_token_accuracy": 0.84850013256073, - "num_tokens": 2086999.0, - "step": 229 - }, - { - "epoch": 0.17477203647416414, - "grad_norm": 3.162736654281616, - "learning_rate": 4.99915696384453e-06, - "loss": 0.6025407910346985, - "mean_token_accuracy": 0.8042335510253906, - "num_tokens": 2092001.0, - "step": 230 - }, - { - "epoch": 0.17553191489361702, - "grad_norm": 1.8672804832458496, - "learning_rate": 4.99910170047625e-06, - "loss": 0.5843087434768677, - "mean_token_accuracy": 0.8016980886459351, - "num_tokens": 2103372.0, - "step": 231 - }, - { - "epoch": 0.1762917933130699, - "grad_norm": 2.967587471008301, - "learning_rate": 4.999044683142196e-06, - "loss": 0.5123642086982727, - "mean_token_accuracy": 0.8216149806976318, - "num_tokens": 2108008.0, - "step": 232 - }, - { - "epoch": 0.1770516717325228, - "grad_norm": 1.9651981592178345, - "learning_rate": 4.998985911882383e-06, - "loss": 0.5868178606033325, - "mean_token_accuracy": 0.7904198169708252, - "num_tokens": 2119009.0, - "step": 233 - }, - { - "epoch": 0.1778115501519757, - "grad_norm": 2.7785449028015137, - "learning_rate": 4.998925386738063e-06, - "loss": 0.5075510144233704, - "mean_token_accuracy": 0.8280210494995117, - "num_tokens": 2124915.0, - "step": 234 - }, - { - "epoch": 0.17857142857142858, - "grad_norm": 2.957470417022705, - "learning_rate": 4.998863107751711e-06, - "loss": 0.5351958274841309, - "mean_token_accuracy": 0.846825122833252, - "num_tokens": 2129905.0, - "step": 235 - }, - { - "epoch": 0.17933130699088146, - "grad_norm": 3.207671880722046, - "learning_rate": 4.99879907496704e-06, - "loss": 0.6209091544151306, - "mean_token_accuracy": 0.789960503578186, - "num_tokens": 2135027.0, - "step": 236 - }, - { - "epoch": 0.18009118541033434, - "grad_norm": 2.018953800201416, - "learning_rate": 4.998733288428987e-06, - "loss": 0.601510763168335, - "mean_token_accuracy": 0.8136930465698242, - "num_tokens": 2147016.0, - "step": 237 - }, - { - "epoch": 0.18085106382978725, - "grad_norm": 2.437281847000122, - "learning_rate": 4.998665748183727e-06, - "loss": 0.5813639163970947, - "mean_token_accuracy": 0.8116716146469116, - "num_tokens": 2155386.0, - "step": 238 - }, - { - "epoch": 0.18161094224924013, - "grad_norm": 1.5708180665969849, - "learning_rate": 4.998596454278661e-06, - "loss": 0.5252395272254944, - "mean_token_accuracy": 0.8193864822387695, - "num_tokens": 2170295.0, - "step": 239 - }, - { - "epoch": 0.182370820668693, - "grad_norm": 1.9921495914459229, - "learning_rate": 4.998525406762422e-06, - "loss": 0.5335029363632202, - "mean_token_accuracy": 0.8120872974395752, - "num_tokens": 2180012.0, - "step": 240 - }, - { - "epoch": 0.1831306990881459, - "grad_norm": 2.6562681198120117, - "learning_rate": 4.998452605684874e-06, - "loss": 0.48021435737609863, - "mean_token_accuracy": 0.8388714790344238, - "num_tokens": 2185607.0, - "step": 241 - }, - { - "epoch": 0.1838905775075988, - "grad_norm": 2.2535853385925293, - "learning_rate": 4.998378051097111e-06, - "loss": 0.5747300386428833, - "mean_token_accuracy": 0.8004639148712158, - "num_tokens": 2194105.0, - "step": 242 - }, - { - "epoch": 0.18465045592705168, - "grad_norm": 1.6151788234710693, - "learning_rate": 4.998301743051459e-06, - "loss": 0.6190565824508667, - "mean_token_accuracy": 0.7816627621650696, - "num_tokens": 2210629.0, - "step": 243 - }, - { - "epoch": 0.18541033434650456, - "grad_norm": 2.1088173389434814, - "learning_rate": 4.9982236816014735e-06, - "loss": 0.4715560972690582, - "mean_token_accuracy": 0.8485721349716187, - "num_tokens": 2218958.0, - "step": 244 - }, - { - "epoch": 0.18617021276595744, - "grad_norm": 2.6168735027313232, - "learning_rate": 4.998143866801941e-06, - "loss": 0.6077103018760681, - "mean_token_accuracy": 0.8057924509048462, - "num_tokens": 2226368.0, - "step": 245 - }, - { - "epoch": 0.18693009118541035, - "grad_norm": 2.5988616943359375, - "learning_rate": 4.99806229870888e-06, - "loss": 0.5021637678146362, - "mean_token_accuracy": 0.8361666202545166, - "num_tokens": 2232485.0, - "step": 246 - }, - { - "epoch": 0.18768996960486323, - "grad_norm": 2.015887498855591, - "learning_rate": 4.9979789773795365e-06, - "loss": 0.4309737980365753, - "mean_token_accuracy": 0.8508044481277466, - "num_tokens": 2240819.0, - "step": 247 - }, - { - "epoch": 0.1884498480243161, - "grad_norm": 2.3115265369415283, - "learning_rate": 4.997893902872389e-06, - "loss": 0.5776500701904297, - "mean_token_accuracy": 0.8079549074172974, - "num_tokens": 2249460.0, - "step": 248 - }, - { - "epoch": 0.189209726443769, - "grad_norm": 1.7387021780014038, - "learning_rate": 4.997807075247147e-06, - "loss": 0.430944561958313, - "mean_token_accuracy": 0.8483544588088989, - "num_tokens": 2259124.0, - "step": 249 - }, - { - "epoch": 0.1899696048632219, - "grad_norm": 1.6378381252288818, - "learning_rate": 4.997718494564747e-06, - "loss": 0.4123363792896271, - "mean_token_accuracy": 0.8557409644126892, - "num_tokens": 2269899.0, - "step": 250 - }, - { - "epoch": 0.19072948328267478, - "grad_norm": 1.336282730102539, - "learning_rate": 4.997628160887361e-06, - "loss": 0.502329409122467, - "mean_token_accuracy": 0.8186938166618347, - "num_tokens": 2292821.0, - "step": 251 - }, - { - "epoch": 0.19148936170212766, - "grad_norm": 3.3335583209991455, - "learning_rate": 4.997536074278388e-06, - "loss": 0.584446907043457, - "mean_token_accuracy": 0.8062717318534851, - "num_tokens": 2297175.0, - "step": 252 - }, - { - "epoch": 0.19224924012158054, - "grad_norm": 2.246727228164673, - "learning_rate": 4.9974422348024565e-06, - "loss": 0.5683060884475708, - "mean_token_accuracy": 0.8193703293800354, - "num_tokens": 2305456.0, - "step": 253 - }, - { - "epoch": 0.19300911854103345, - "grad_norm": 2.3520865440368652, - "learning_rate": 4.997346642525429e-06, - "loss": 0.4724946618080139, - "mean_token_accuracy": 0.8426719307899475, - "num_tokens": 2312241.0, - "step": 254 - }, - { - "epoch": 0.19376899696048633, - "grad_norm": 2.7115702629089355, - "learning_rate": 4.9972492975143936e-06, - "loss": 0.5019032955169678, - "mean_token_accuracy": 0.8253573179244995, - "num_tokens": 2318094.0, - "step": 255 - }, - { - "epoch": 0.1945288753799392, - "grad_norm": 1.705528974533081, - "learning_rate": 4.997150199837671e-06, - "loss": 0.45588475465774536, - "mean_token_accuracy": 0.836666464805603, - "num_tokens": 2329025.0, - "step": 256 - }, - { - "epoch": 0.1952887537993921, - "grad_norm": 2.161400318145752, - "learning_rate": 4.997049349564814e-06, - "loss": 0.5170183777809143, - "mean_token_accuracy": 0.8287534117698669, - "num_tokens": 2337448.0, - "step": 257 - }, - { - "epoch": 0.196048632218845, - "grad_norm": 2.629669189453125, - "learning_rate": 4.996946746766602e-06, - "loss": 0.44650501012802124, - "mean_token_accuracy": 0.850114107131958, - "num_tokens": 2343207.0, - "step": 258 - }, - { - "epoch": 0.19680851063829788, - "grad_norm": 1.6735503673553467, - "learning_rate": 4.996842391515045e-06, - "loss": 0.5247820019721985, - "mean_token_accuracy": 0.8285071849822998, - "num_tokens": 2356801.0, - "step": 259 - }, - { - "epoch": 0.19756838905775076, - "grad_norm": 1.2753115892410278, - "learning_rate": 4.996736283883382e-06, - "loss": 0.41870927810668945, - "mean_token_accuracy": 0.8448047637939453, - "num_tokens": 2377306.0, - "step": 260 - }, - { - "epoch": 0.19832826747720364, - "grad_norm": 2.6947314739227295, - "learning_rate": 4.9966284239460875e-06, - "loss": 0.5059205889701843, - "mean_token_accuracy": 0.8430814743041992, - "num_tokens": 2383352.0, - "step": 261 - }, - { - "epoch": 0.19908814589665655, - "grad_norm": 2.0509963035583496, - "learning_rate": 4.996518811778858e-06, - "loss": 0.4565388560295105, - "mean_token_accuracy": 0.8453130722045898, - "num_tokens": 2391149.0, - "step": 262 - }, - { - "epoch": 0.19984802431610943, - "grad_norm": 2.1856348514556885, - "learning_rate": 4.996407447458626e-06, - "loss": 0.531380832195282, - "mean_token_accuracy": 0.8387004137039185, - "num_tokens": 2399875.0, - "step": 263 - }, - { - "epoch": 0.2006079027355623, - "grad_norm": 2.7348573207855225, - "learning_rate": 4.99629433106355e-06, - "loss": 0.5242817401885986, - "mean_token_accuracy": 0.8177423477172852, - "num_tokens": 2406586.0, - "step": 264 - }, - { - "epoch": 0.2013677811550152, - "grad_norm": 1.76587975025177, - "learning_rate": 4.99617946267302e-06, - "loss": 0.49298471212387085, - "mean_token_accuracy": 0.8271149396896362, - "num_tokens": 2418683.0, - "step": 265 - }, - { - "epoch": 0.20212765957446807, - "grad_norm": 2.8129730224609375, - "learning_rate": 4.996062842367655e-06, - "loss": 0.46420302987098694, - "mean_token_accuracy": 0.8453244566917419, - "num_tokens": 2422929.0, - "step": 266 - }, - { - "epoch": 0.20288753799392098, - "grad_norm": 2.575744152069092, - "learning_rate": 4.9959444702293025e-06, - "loss": 0.43208545446395874, - "mean_token_accuracy": 0.8494843244552612, - "num_tokens": 2429567.0, - "step": 267 - }, - { - "epoch": 0.20364741641337386, - "grad_norm": 2.7586750984191895, - "learning_rate": 4.995824346341041e-06, - "loss": 0.4390473961830139, - "mean_token_accuracy": 0.8348895311355591, - "num_tokens": 2434700.0, - "step": 268 - }, - { - "epoch": 0.20440729483282674, - "grad_norm": 1.972145438194275, - "learning_rate": 4.99570247078718e-06, - "loss": 0.6219544410705566, - "mean_token_accuracy": 0.7939999103546143, - "num_tokens": 2447007.0, - "step": 269 - }, - { - "epoch": 0.20516717325227962, - "grad_norm": 2.2963485717773438, - "learning_rate": 4.995578843653255e-06, - "loss": 0.5008970499038696, - "mean_token_accuracy": 0.8255308866500854, - "num_tokens": 2453936.0, - "step": 270 - }, - { - "epoch": 0.20592705167173253, - "grad_norm": 1.8897721767425537, - "learning_rate": 4.995453465026033e-06, - "loss": 0.5436089038848877, - "mean_token_accuracy": 0.819086492061615, - "num_tokens": 2464494.0, - "step": 271 - }, - { - "epoch": 0.2066869300911854, - "grad_norm": 2.319728374481201, - "learning_rate": 4.995326334993508e-06, - "loss": 0.5136368870735168, - "mean_token_accuracy": 0.820817232131958, - "num_tokens": 2470938.0, - "step": 272 - }, - { - "epoch": 0.2074468085106383, - "grad_norm": 2.230414390563965, - "learning_rate": 4.9951974536449055e-06, - "loss": 0.5272846817970276, - "mean_token_accuracy": 0.8203279972076416, - "num_tokens": 2478629.0, - "step": 273 - }, - { - "epoch": 0.20820668693009117, - "grad_norm": 3.401937484741211, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.4389592111110687, - "mean_token_accuracy": 0.8647899031639099, - "num_tokens": 2482193.0, - "step": 274 - }, - { - "epoch": 0.20896656534954408, - "grad_norm": 2.1278507709503174, - "learning_rate": 4.994934437362513e-06, - "loss": 0.598863422870636, - "mean_token_accuracy": 0.7945119738578796, - "num_tokens": 2492465.0, - "step": 275 - }, - { - "epoch": 0.20972644376899696, - "grad_norm": 1.9259960651397705, - "learning_rate": 4.994800302613318e-06, - "loss": 0.49520939588546753, - "mean_token_accuracy": 0.8371536135673523, - "num_tokens": 2500825.0, - "step": 276 - }, - { - "epoch": 0.21048632218844984, - "grad_norm": 2.346418857574463, - "learning_rate": 4.994664416917236e-06, - "loss": 0.5412614345550537, - "mean_token_accuracy": 0.810661792755127, - "num_tokens": 2509513.0, - "step": 277 - }, - { - "epoch": 0.21124620060790272, - "grad_norm": 1.3092039823532104, - "learning_rate": 4.994526780369636e-06, - "loss": 0.46305379271507263, - "mean_token_accuracy": 0.8358527421951294, - "num_tokens": 2531405.0, - "step": 278 - }, - { - "epoch": 0.21200607902735563, - "grad_norm": 2.924611806869507, - "learning_rate": 4.9943873930671175e-06, - "loss": 0.6134544610977173, - "mean_token_accuracy": 0.7947378754615784, - "num_tokens": 2536744.0, - "step": 279 - }, - { - "epoch": 0.2127659574468085, - "grad_norm": 2.8290598392486572, - "learning_rate": 4.994246255107506e-06, - "loss": 0.465520441532135, - "mean_token_accuracy": 0.8440108299255371, - "num_tokens": 2541184.0, - "step": 280 - }, - { - "epoch": 0.2135258358662614, - "grad_norm": 3.8081259727478027, - "learning_rate": 4.994103366589859e-06, - "loss": 0.43394139409065247, - "mean_token_accuracy": 0.8579148054122925, - "num_tokens": 2545395.0, - "step": 281 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 1.7994529008865356, - "learning_rate": 4.993958727614462e-06, - "loss": 0.5076484680175781, - "mean_token_accuracy": 0.8270803093910217, - "num_tokens": 2556541.0, - "step": 282 - }, - { - "epoch": 0.21504559270516718, - "grad_norm": 2.5582659244537354, - "learning_rate": 4.993812338282826e-06, - "loss": 0.4453684389591217, - "mean_token_accuracy": 0.8488293886184692, - "num_tokens": 2562949.0, - "step": 283 - }, - { - "epoch": 0.21580547112462006, - "grad_norm": 1.6448938846588135, - "learning_rate": 4.993664198697694e-06, - "loss": 0.461971640586853, - "mean_token_accuracy": 0.824763298034668, - "num_tokens": 2576407.0, - "step": 284 - }, - { - "epoch": 0.21656534954407294, - "grad_norm": 2.1264469623565674, - "learning_rate": 4.993514308963037e-06, - "loss": 0.6241602897644043, - "mean_token_accuracy": 0.7916014790534973, - "num_tokens": 2585695.0, - "step": 285 - }, - { - "epoch": 0.21732522796352582, - "grad_norm": 3.629991292953491, - "learning_rate": 4.993362669184051e-06, - "loss": 0.610355019569397, - "mean_token_accuracy": 0.7847568988800049, - "num_tokens": 2589778.0, - "step": 286 - }, - { - "epoch": 0.21808510638297873, - "grad_norm": 1.9070756435394287, - "learning_rate": 4.993209279467164e-06, - "loss": 0.5513623952865601, - "mean_token_accuracy": 0.7911607027053833, - "num_tokens": 2600920.0, - "step": 287 - }, - { - "epoch": 0.2188449848024316, - "grad_norm": 1.761062741279602, - "learning_rate": 4.993054139920031e-06, - "loss": 0.4579957127571106, - "mean_token_accuracy": 0.8189530372619629, - "num_tokens": 2611856.0, - "step": 288 - }, - { - "epoch": 0.2196048632218845, - "grad_norm": 1.7264713048934937, - "learning_rate": 4.992897250651535e-06, - "loss": 0.5871305465698242, - "mean_token_accuracy": 0.7918527126312256, - "num_tokens": 2624730.0, - "step": 289 - }, - { - "epoch": 0.22036474164133737, - "grad_norm": 1.7455977201461792, - "learning_rate": 4.992738611771787e-06, - "loss": 0.5475119948387146, - "mean_token_accuracy": 0.8226917386054993, - "num_tokens": 2635705.0, - "step": 290 - }, - { - "epoch": 0.22112462006079028, - "grad_norm": 2.095095157623291, - "learning_rate": 4.992578223392124e-06, - "loss": 0.5952225923538208, - "mean_token_accuracy": 0.8078469038009644, - "num_tokens": 2643954.0, - "step": 291 - }, - { - "epoch": 0.22188449848024316, - "grad_norm": 2.994664192199707, - "learning_rate": 4.992416085625115e-06, - "loss": 0.5432442426681519, - "mean_token_accuracy": 0.8329008221626282, - "num_tokens": 2648800.0, - "step": 292 - }, - { - "epoch": 0.22264437689969604, - "grad_norm": 2.796790361404419, - "learning_rate": 4.992252198584554e-06, - "loss": 0.5168961882591248, - "mean_token_accuracy": 0.8393474817276001, - "num_tokens": 2653546.0, - "step": 293 - }, - { - "epoch": 0.22340425531914893, - "grad_norm": 1.8610522747039795, - "learning_rate": 4.992086562385462e-06, - "loss": 0.5728024244308472, - "mean_token_accuracy": 0.797406792640686, - "num_tokens": 2667483.0, - "step": 294 - }, - { - "epoch": 0.22416413373860183, - "grad_norm": 1.695472002029419, - "learning_rate": 4.9919191771440905e-06, - "loss": 0.5460028648376465, - "mean_token_accuracy": 0.8123016357421875, - "num_tokens": 2683574.0, - "step": 295 - }, - { - "epoch": 0.22492401215805471, - "grad_norm": 2.8627376556396484, - "learning_rate": 4.9917500429779165e-06, - "loss": 0.5566985011100769, - "mean_token_accuracy": 0.815531313419342, - "num_tokens": 2688985.0, - "step": 296 - }, - { - "epoch": 0.2256838905775076, - "grad_norm": 2.73323655128479, - "learning_rate": 4.991579160005644e-06, - "loss": 0.48197102546691895, - "mean_token_accuracy": 0.8471829295158386, - "num_tokens": 2694799.0, - "step": 297 - }, - { - "epoch": 0.22644376899696048, - "grad_norm": 1.8436161279678345, - "learning_rate": 4.991406528347206e-06, - "loss": 0.4528339207172394, - "mean_token_accuracy": 0.8603188395500183, - "num_tokens": 2707321.0, - "step": 298 - }, - { - "epoch": 0.22720364741641338, - "grad_norm": 2.6231515407562256, - "learning_rate": 4.9912321481237616e-06, - "loss": 0.5916541814804077, - "mean_token_accuracy": 0.8050242066383362, - "num_tokens": 2714233.0, - "step": 299 - }, - { - "epoch": 0.22796352583586627, - "grad_norm": 3.08776593208313, - "learning_rate": 4.991056019457697e-06, - "loss": 0.4860580563545227, - "mean_token_accuracy": 0.8464088439941406, - "num_tokens": 2718443.0, - "step": 300 - }, - { - "epoch": 0.22872340425531915, - "grad_norm": 2.2537803649902344, - "learning_rate": 4.990878142472628e-06, - "loss": 0.5158311128616333, - "mean_token_accuracy": 0.824694812297821, - "num_tokens": 2726158.0, - "step": 301 - }, - { - "epoch": 0.22948328267477203, - "grad_norm": 2.1122705936431885, - "learning_rate": 4.990698517293394e-06, - "loss": 0.495265394449234, - "mean_token_accuracy": 0.8343238830566406, - "num_tokens": 2735022.0, - "step": 302 - }, - { - "epoch": 0.23024316109422494, - "grad_norm": 3.5503528118133545, - "learning_rate": 4.9905171440460645e-06, - "loss": 0.46063232421875, - "mean_token_accuracy": 0.8420047760009766, - "num_tokens": 2738550.0, - "step": 303 - }, - { - "epoch": 0.23100303951367782, - "grad_norm": 3.9858486652374268, - "learning_rate": 4.990334022857932e-06, - "loss": 0.5832710266113281, - "mean_token_accuracy": 0.8144199848175049, - "num_tokens": 2741720.0, - "step": 304 - }, - { - "epoch": 0.2317629179331307, - "grad_norm": 2.407231330871582, - "learning_rate": 4.990149153857519e-06, - "loss": 0.4692630171775818, - "mean_token_accuracy": 0.8429223299026489, - "num_tokens": 2748693.0, - "step": 305 - }, - { - "epoch": 0.23252279635258358, - "grad_norm": 1.6996397972106934, - "learning_rate": 4.989962537174573e-06, - "loss": 0.49143946170806885, - "mean_token_accuracy": 0.8340128064155579, - "num_tokens": 2761254.0, - "step": 306 - }, - { - "epoch": 0.23328267477203649, - "grad_norm": 3.746432065963745, - "learning_rate": 4.989774172940071e-06, - "loss": 0.6282026767730713, - "mean_token_accuracy": 0.775698184967041, - "num_tokens": 2765115.0, - "step": 307 - }, - { - "epoch": 0.23404255319148937, - "grad_norm": 2.212872266769409, - "learning_rate": 4.989584061286211e-06, - "loss": 0.5193763971328735, - "mean_token_accuracy": 0.8168246746063232, - "num_tokens": 2772345.0, - "step": 308 - }, - { - "epoch": 0.23480243161094225, - "grad_norm": 1.752297282218933, - "learning_rate": 4.989392202346423e-06, - "loss": 0.4437984824180603, - "mean_token_accuracy": 0.8451256155967712, - "num_tokens": 2783072.0, - "step": 309 - }, - { - "epoch": 0.23556231003039513, - "grad_norm": 2.386019706726074, - "learning_rate": 4.989198596255361e-06, - "loss": 0.4090752899646759, - "mean_token_accuracy": 0.8480085134506226, - "num_tokens": 2788757.0, - "step": 310 - }, - { - "epoch": 0.23632218844984804, - "grad_norm": 3.9981489181518555, - "learning_rate": 4.989003243148904e-06, - "loss": 0.5149132013320923, - "mean_token_accuracy": 0.8179056644439697, - "num_tokens": 2792096.0, - "step": 311 - }, - { - "epoch": 0.23708206686930092, - "grad_norm": 1.8723100423812866, - "learning_rate": 4.988806143164159e-06, - "loss": 0.4531487822532654, - "mean_token_accuracy": 0.8400167226791382, - "num_tokens": 2802210.0, - "step": 312 - }, - { - "epoch": 0.2378419452887538, - "grad_norm": 2.3415136337280273, - "learning_rate": 4.988607296439459e-06, - "loss": 0.5974439978599548, - "mean_token_accuracy": 0.8035976886749268, - "num_tokens": 2810088.0, - "step": 313 - }, - { - "epoch": 0.23860182370820668, - "grad_norm": 1.5317577123641968, - "learning_rate": 4.98840670311436e-06, - "loss": 0.49247145652770996, - "mean_token_accuracy": 0.8292540311813354, - "num_tokens": 2824005.0, - "step": 314 - }, - { - "epoch": 0.2393617021276596, - "grad_norm": 2.170772075653076, - "learning_rate": 4.988204363329648e-06, - "loss": 0.6359974145889282, - "mean_token_accuracy": 0.7785564661026001, - "num_tokens": 2834680.0, - "step": 315 - }, - { - "epoch": 0.24012158054711247, - "grad_norm": 3.2655932903289795, - "learning_rate": 4.988000277227334e-06, - "loss": 0.5080196857452393, - "mean_token_accuracy": 0.8295877575874329, - "num_tokens": 2838735.0, - "step": 316 - }, - { - "epoch": 0.24088145896656535, - "grad_norm": 3.406589984893799, - "learning_rate": 4.987794444950651e-06, - "loss": 0.3939085006713867, - "mean_token_accuracy": 0.8700719475746155, - "num_tokens": 2842127.0, - "step": 317 - }, - { - "epoch": 0.24164133738601823, - "grad_norm": 1.8211106061935425, - "learning_rate": 4.987586866644061e-06, - "loss": 0.5270540118217468, - "mean_token_accuracy": 0.826683521270752, - "num_tokens": 2853656.0, - "step": 318 - }, - { - "epoch": 0.24240121580547114, - "grad_norm": 1.8429969549179077, - "learning_rate": 4.9873775424532515e-06, - "loss": 0.4705049991607666, - "mean_token_accuracy": 0.8355701565742493, - "num_tokens": 2863513.0, - "step": 319 - }, - { - "epoch": 0.24316109422492402, - "grad_norm": 2.2425320148468018, - "learning_rate": 4.9871664725251314e-06, - "loss": 0.485736608505249, - "mean_token_accuracy": 0.835182785987854, - "num_tokens": 2871556.0, - "step": 320 - }, - { - "epoch": 0.2439209726443769, - "grad_norm": 1.6202056407928467, - "learning_rate": 4.986953657007841e-06, - "loss": 0.4437887370586395, - "mean_token_accuracy": 0.8282591700553894, - "num_tokens": 2884335.0, - "step": 321 - }, - { - "epoch": 0.24468085106382978, - "grad_norm": 1.1027268171310425, - "learning_rate": 4.98673909605074e-06, - "loss": 0.3770800828933716, - "mean_token_accuracy": 0.8325437307357788, - "num_tokens": 2904286.0, - "step": 322 - }, - { - "epoch": 0.2454407294832827, - "grad_norm": 2.3239076137542725, - "learning_rate": 4.986522789804417e-06, - "loss": 0.5387254953384399, - "mean_token_accuracy": 0.806242823600769, - "num_tokens": 2910975.0, - "step": 323 - }, - { - "epoch": 0.24620060790273557, - "grad_norm": 2.243482828140259, - "learning_rate": 4.986304738420684e-06, - "loss": 0.4396553039550781, - "mean_token_accuracy": 0.8561904430389404, - "num_tokens": 2917087.0, - "step": 324 - }, - { - "epoch": 0.24696048632218845, - "grad_norm": 2.537264347076416, - "learning_rate": 4.986084942052577e-06, - "loss": 0.395110160112381, - "mean_token_accuracy": 0.8636915683746338, - "num_tokens": 2921887.0, - "step": 325 - }, - { - "epoch": 0.24772036474164133, - "grad_norm": 2.319399118423462, - "learning_rate": 4.9858634008543574e-06, - "loss": 0.581517219543457, - "mean_token_accuracy": 0.8157487511634827, - "num_tokens": 2928996.0, - "step": 326 - }, - { - "epoch": 0.24848024316109424, - "grad_norm": 1.9787474870681763, - "learning_rate": 4.985640114981513e-06, - "loss": 0.5084106922149658, - "mean_token_accuracy": 0.835221529006958, - "num_tokens": 2940302.0, - "step": 327 - }, - { - "epoch": 0.24924012158054712, - "grad_norm": 2.4783265590667725, - "learning_rate": 4.985415084590752e-06, - "loss": 0.6062222719192505, - "mean_token_accuracy": 0.7885516285896301, - "num_tokens": 2946386.0, - "step": 328 - }, - { - "epoch": 0.25, - "grad_norm": 2.4081411361694336, - "learning_rate": 4.985188309840012e-06, - "loss": 0.5079880356788635, - "mean_token_accuracy": 0.8313904404640198, - "num_tokens": 2952323.0, - "step": 329 - }, - { - "epoch": 0.2507598784194529, - "grad_norm": 2.64993953704834, - "learning_rate": 4.984959790888451e-06, - "loss": 0.5461447834968567, - "mean_token_accuracy": 0.8125468492507935, - "num_tokens": 2958119.0, - "step": 330 - }, - { - "epoch": 0.25151975683890576, - "grad_norm": 2.549734115600586, - "learning_rate": 4.984729527896451e-06, - "loss": 0.5998573303222656, - "mean_token_accuracy": 0.8076666593551636, - "num_tokens": 2964947.0, - "step": 331 - }, - { - "epoch": 0.25227963525835867, - "grad_norm": 3.2185161113739014, - "learning_rate": 4.984497521025622e-06, - "loss": 0.4232945442199707, - "mean_token_accuracy": 0.8543803095817566, - "num_tokens": 2968598.0, - "step": 332 - }, - { - "epoch": 0.2530395136778115, - "grad_norm": 2.588994264602661, - "learning_rate": 4.984263770438793e-06, - "loss": 0.460967481136322, - "mean_token_accuracy": 0.8416207432746887, - "num_tokens": 2974510.0, - "step": 333 - }, - { - "epoch": 0.25379939209726443, - "grad_norm": 2.1373162269592285, - "learning_rate": 4.984028276300021e-06, - "loss": 0.49382102489471436, - "mean_token_accuracy": 0.8388048410415649, - "num_tokens": 2981632.0, - "step": 334 - }, - { - "epoch": 0.25455927051671734, - "grad_norm": 2.2524826526641846, - "learning_rate": 4.983791038774585e-06, - "loss": 0.4947671890258789, - "mean_token_accuracy": 0.8066365122795105, - "num_tokens": 2988736.0, - "step": 335 - }, - { - "epoch": 0.2553191489361702, - "grad_norm": 1.7244199514389038, - "learning_rate": 4.983552058028985e-06, - "loss": 0.48096776008605957, - "mean_token_accuracy": 0.830735445022583, - "num_tokens": 3003576.0, - "step": 336 - }, - { - "epoch": 0.2560790273556231, - "grad_norm": 3.0628933906555176, - "learning_rate": 4.9833113342309495e-06, - "loss": 0.6027032136917114, - "mean_token_accuracy": 0.8008694648742676, - "num_tokens": 3009549.0, - "step": 337 - }, - { - "epoch": 0.256838905775076, - "grad_norm": 2.438674211502075, - "learning_rate": 4.983068867549427e-06, - "loss": 0.517090916633606, - "mean_token_accuracy": 0.827893853187561, - "num_tokens": 3015236.0, - "step": 338 - }, - { - "epoch": 0.25759878419452886, - "grad_norm": 2.131535053253174, - "learning_rate": 4.982824658154589e-06, - "loss": 0.6656812429428101, - "mean_token_accuracy": 0.7772425413131714, - "num_tokens": 3028142.0, - "step": 339 - }, - { - "epoch": 0.25835866261398177, - "grad_norm": 2.3206584453582764, - "learning_rate": 4.9825787062178315e-06, - "loss": 0.5757625699043274, - "mean_token_accuracy": 0.8073873519897461, - "num_tokens": 3040996.0, - "step": 340 - }, - { - "epoch": 0.2591185410334346, - "grad_norm": 1.3905521631240845, - "learning_rate": 4.982331011911774e-06, - "loss": 0.4193805456161499, - "mean_token_accuracy": 0.8399466872215271, - "num_tokens": 3061931.0, - "step": 341 - }, - { - "epoch": 0.25987841945288753, - "grad_norm": 2.184173345565796, - "learning_rate": 4.982081575410256e-06, - "loss": 0.4751223921775818, - "mean_token_accuracy": 0.8409271240234375, - "num_tokens": 3069081.0, - "step": 342 - }, - { - "epoch": 0.26063829787234044, - "grad_norm": 3.538764238357544, - "learning_rate": 4.9818303968883445e-06, - "loss": 0.8119601011276245, - "mean_token_accuracy": 0.7442739009857178, - "num_tokens": 3073628.0, - "step": 343 - }, - { - "epoch": 0.2613981762917933, - "grad_norm": 1.8063762187957764, - "learning_rate": 4.981577476522323e-06, - "loss": 0.5615730881690979, - "mean_token_accuracy": 0.8207751512527466, - "num_tokens": 3086596.0, - "step": 344 - }, - { - "epoch": 0.2621580547112462, - "grad_norm": 2.4346961975097656, - "learning_rate": 4.981322814489703e-06, - "loss": 0.5266709327697754, - "mean_token_accuracy": 0.8211277723312378, - "num_tokens": 3092631.0, - "step": 345 - }, - { - "epoch": 0.2629179331306991, - "grad_norm": 1.91289484500885, - "learning_rate": 4.981066410969215e-06, - "loss": 0.5047177672386169, - "mean_token_accuracy": 0.8356877565383911, - "num_tokens": 3101102.0, - "step": 346 - }, - { - "epoch": 0.26367781155015196, - "grad_norm": 2.1495707035064697, - "learning_rate": 4.980808266140813e-06, - "loss": 0.47876280546188354, - "mean_token_accuracy": 0.8364313244819641, - "num_tokens": 3107998.0, - "step": 347 - }, - { - "epoch": 0.26443768996960487, - "grad_norm": 2.5961992740631104, - "learning_rate": 4.9805483801856744e-06, - "loss": 0.5512958765029907, - "mean_token_accuracy": 0.8181467652320862, - "num_tokens": 3113848.0, - "step": 348 - }, - { - "epoch": 0.2651975683890577, - "grad_norm": 3.2828900814056396, - "learning_rate": 4.980286753286196e-06, - "loss": 0.4217945635318756, - "mean_token_accuracy": 0.8617103099822998, - "num_tokens": 3117652.0, - "step": 349 - }, - { - "epoch": 0.26595744680851063, - "grad_norm": 1.425554871559143, - "learning_rate": 4.980023385625996e-06, - "loss": 0.4042487144470215, - "mean_token_accuracy": 0.8492785692214966, - "num_tokens": 3132336.0, - "step": 350 - }, - { - "epoch": 0.26671732522796354, - "grad_norm": 2.933504104614258, - "learning_rate": 4.979758277389919e-06, - "loss": 0.5406704545021057, - "mean_token_accuracy": 0.8035423755645752, - "num_tokens": 3137544.0, - "step": 351 - }, - { - "epoch": 0.2674772036474164, - "grad_norm": 1.9958966970443726, - "learning_rate": 4.9794914287640264e-06, - "loss": 0.5857555270195007, - "mean_token_accuracy": 0.7965140342712402, - "num_tokens": 3149705.0, - "step": 352 - }, - { - "epoch": 0.2682370820668693, - "grad_norm": 2.467694044113159, - "learning_rate": 4.979222839935602e-06, - "loss": 0.6404043436050415, - "mean_token_accuracy": 0.7823755741119385, - "num_tokens": 3158353.0, - "step": 353 - }, - { - "epoch": 0.2689969604863222, - "grad_norm": 2.0102720260620117, - "learning_rate": 4.9789525110931545e-06, - "loss": 0.5681496858596802, - "mean_token_accuracy": 0.8108169436454773, - "num_tokens": 3167121.0, - "step": 354 - }, - { - "epoch": 0.26975683890577506, - "grad_norm": 2.6017866134643555, - "learning_rate": 4.978680442426409e-06, - "loss": 0.6309828162193298, - "mean_token_accuracy": 0.7742617130279541, - "num_tokens": 3175012.0, - "step": 355 - }, - { - "epoch": 0.270516717325228, - "grad_norm": 1.8799268007278442, - "learning_rate": 4.978406634126315e-06, - "loss": 0.524029016494751, - "mean_token_accuracy": 0.8317689895629883, - "num_tokens": 3185331.0, - "step": 356 - }, - { - "epoch": 0.2712765957446808, - "grad_norm": 1.508332371711731, - "learning_rate": 4.978131086385041e-06, - "loss": 0.46656402945518494, - "mean_token_accuracy": 0.8339117765426636, - "num_tokens": 3198813.0, - "step": 357 - }, - { - "epoch": 0.27203647416413373, - "grad_norm": 3.595707654953003, - "learning_rate": 4.977853799395976e-06, - "loss": 0.5101234912872314, - "mean_token_accuracy": 0.8251723051071167, - "num_tokens": 3206557.0, - "step": 358 - }, - { - "epoch": 0.27279635258358664, - "grad_norm": 3.5317916870117188, - "learning_rate": 4.977574773353732e-06, - "loss": 0.5684665441513062, - "mean_token_accuracy": 0.8124493360519409, - "num_tokens": 3210912.0, - "step": 359 - }, - { - "epoch": 0.2735562310030395, - "grad_norm": 2.8606204986572266, - "learning_rate": 4.97729400845414e-06, - "loss": 0.4746384620666504, - "mean_token_accuracy": 0.8195606470108032, - "num_tokens": 3215365.0, - "step": 360 - }, - { - "epoch": 0.2743161094224924, - "grad_norm": 1.8214033842086792, - "learning_rate": 4.977011504894253e-06, - "loss": 0.4842769503593445, - "mean_token_accuracy": 0.82928866147995, - "num_tokens": 3224037.0, - "step": 361 - }, - { - "epoch": 0.2750759878419453, - "grad_norm": 1.628746509552002, - "learning_rate": 4.97672726287234e-06, - "loss": 0.4397493302822113, - "mean_token_accuracy": 0.8606528043746948, - "num_tokens": 3235589.0, - "step": 362 - }, - { - "epoch": 0.27583586626139817, - "grad_norm": 3.557973861694336, - "learning_rate": 4.976441282587894e-06, - "loss": 0.5732032060623169, - "mean_token_accuracy": 0.8041545748710632, - "num_tokens": 3239958.0, - "step": 363 - }, - { - "epoch": 0.2765957446808511, - "grad_norm": 1.3467901945114136, - "learning_rate": 4.9761535642416284e-06, - "loss": 0.4525323510169983, - "mean_token_accuracy": 0.8281061053276062, - "num_tokens": 3257703.0, - "step": 364 - }, - { - "epoch": 0.2773556231003039, - "grad_norm": 2.2649986743927, - "learning_rate": 4.9758641080354745e-06, - "loss": 0.5074734687805176, - "mean_token_accuracy": 0.8447474241256714, - "num_tokens": 3264334.0, - "step": 365 - }, - { - "epoch": 0.27811550151975684, - "grad_norm": 2.8667566776275635, - "learning_rate": 4.975572914172581e-06, - "loss": 0.5759559869766235, - "mean_token_accuracy": 0.7976793050765991, - "num_tokens": 3269314.0, - "step": 366 - }, - { - "epoch": 0.27887537993920974, - "grad_norm": 2.2514986991882324, - "learning_rate": 4.975279982857324e-06, - "loss": 0.5786465406417847, - "mean_token_accuracy": 0.8058781623840332, - "num_tokens": 3277324.0, - "step": 367 - }, - { - "epoch": 0.2796352583586626, - "grad_norm": 1.3826723098754883, - "learning_rate": 4.97498531429529e-06, - "loss": 0.40801727771759033, - "mean_token_accuracy": 0.8601310849189758, - "num_tokens": 3290530.0, - "step": 368 - }, - { - "epoch": 0.2803951367781155, - "grad_norm": 2.084092617034912, - "learning_rate": 4.97468890869329e-06, - "loss": 0.47076648473739624, - "mean_token_accuracy": 0.8310186862945557, - "num_tokens": 3298325.0, - "step": 369 - }, - { - "epoch": 0.2811550151975684, - "grad_norm": 1.3467998504638672, - "learning_rate": 4.974390766259353e-06, - "loss": 0.44668465852737427, - "mean_token_accuracy": 0.8275353908538818, - "num_tokens": 3314302.0, - "step": 370 - }, - { - "epoch": 0.28191489361702127, - "grad_norm": 2.5921075344085693, - "learning_rate": 4.974090887202726e-06, - "loss": 0.5343953967094421, - "mean_token_accuracy": 0.8110706806182861, - "num_tokens": 3320963.0, - "step": 371 - }, - { - "epoch": 0.2826747720364742, - "grad_norm": 2.042781352996826, - "learning_rate": 4.973789271733877e-06, - "loss": 0.6293343305587769, - "mean_token_accuracy": 0.7800243496894836, - "num_tokens": 3332742.0, - "step": 372 - }, - { - "epoch": 0.28343465045592703, - "grad_norm": 4.822193145751953, - "learning_rate": 4.973485920064491e-06, - "loss": 0.6256728768348694, - "mean_token_accuracy": 0.7962433099746704, - "num_tokens": 3335872.0, - "step": 373 - }, - { - "epoch": 0.28419452887537994, - "grad_norm": 1.260988473892212, - "learning_rate": 4.973180832407471e-06, - "loss": 0.38731223344802856, - "mean_token_accuracy": 0.8385066986083984, - "num_tokens": 3351884.0, - "step": 374 - }, - { - "epoch": 0.28495440729483285, - "grad_norm": 2.669966697692871, - "learning_rate": 4.97287400897694e-06, - "loss": 0.5594710111618042, - "mean_token_accuracy": 0.8097212314605713, - "num_tokens": 3358197.0, - "step": 375 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 3.0344486236572266, - "learning_rate": 4.972565449988238e-06, - "loss": 0.34449583292007446, - "mean_token_accuracy": 0.8813316822052002, - "num_tokens": 3362133.0, - "step": 376 - }, - { - "epoch": 0.2864741641337386, - "grad_norm": 2.562251091003418, - "learning_rate": 4.972255155657925e-06, - "loss": 0.5331522822380066, - "mean_token_accuracy": 0.8212941288948059, - "num_tokens": 3370346.0, - "step": 377 - }, - { - "epoch": 0.2872340425531915, - "grad_norm": 2.7083740234375, - "learning_rate": 4.9719431262037755e-06, - "loss": 0.5403046011924744, - "mean_token_accuracy": 0.8108335733413696, - "num_tokens": 3375588.0, - "step": 378 - }, - { - "epoch": 0.28799392097264437, - "grad_norm": 1.396430492401123, - "learning_rate": 4.971629361844785e-06, - "loss": 0.4041529893875122, - "mean_token_accuracy": 0.8588063716888428, - "num_tokens": 3390749.0, - "step": 379 - }, - { - "epoch": 0.2887537993920973, - "grad_norm": 1.9872784614562988, - "learning_rate": 4.971313862801166e-06, - "loss": 0.4336993098258972, - "mean_token_accuracy": 0.8511303663253784, - "num_tokens": 3399064.0, - "step": 380 - }, - { - "epoch": 0.28951367781155013, - "grad_norm": 1.9652575254440308, - "learning_rate": 4.9709966292943455e-06, - "loss": 0.4578358232975006, - "mean_token_accuracy": 0.8229440450668335, - "num_tokens": 3407229.0, - "step": 381 - }, - { - "epoch": 0.29027355623100304, - "grad_norm": 1.6626898050308228, - "learning_rate": 4.970677661546972e-06, - "loss": 0.5427594184875488, - "mean_token_accuracy": 0.815427303314209, - "num_tokens": 3422321.0, - "step": 382 - }, - { - "epoch": 0.29103343465045595, - "grad_norm": 3.5265562534332275, - "learning_rate": 4.970356959782909e-06, - "loss": 0.6661460995674133, - "mean_token_accuracy": 0.7856965065002441, - "num_tokens": 3427442.0, - "step": 383 - }, - { - "epoch": 0.2917933130699088, - "grad_norm": 1.667205572128296, - "learning_rate": 4.970034524227239e-06, - "loss": 0.36256325244903564, - "mean_token_accuracy": 0.8711205720901489, - "num_tokens": 3436662.0, - "step": 384 - }, - { - "epoch": 0.2925531914893617, - "grad_norm": 1.3389486074447632, - "learning_rate": 4.969710355106256e-06, - "loss": 0.4282698631286621, - "mean_token_accuracy": 0.838951587677002, - "num_tokens": 3450060.0, - "step": 385 - }, - { - "epoch": 0.2933130699088146, - "grad_norm": 2.5163397789001465, - "learning_rate": 4.969384452647477e-06, - "loss": 0.5176984071731567, - "mean_token_accuracy": 0.8235267996788025, - "num_tokens": 3456990.0, - "step": 386 - }, - { - "epoch": 0.29407294832826747, - "grad_norm": 1.7588495016098022, - "learning_rate": 4.969056817079633e-06, - "loss": 0.49710947275161743, - "mean_token_accuracy": 0.818520724773407, - "num_tokens": 3468098.0, - "step": 387 - }, - { - "epoch": 0.2948328267477204, - "grad_norm": 2.6381046772003174, - "learning_rate": 4.968727448632669e-06, - "loss": 0.4425308108329773, - "mean_token_accuracy": 0.8451643586158752, - "num_tokens": 3472899.0, - "step": 388 - }, - { - "epoch": 0.29559270516717323, - "grad_norm": 1.6345038414001465, - "learning_rate": 4.968396347537751e-06, - "loss": 0.4177059829235077, - "mean_token_accuracy": 0.8498886227607727, - "num_tokens": 3484826.0, - "step": 389 - }, - { - "epoch": 0.29635258358662614, - "grad_norm": 3.0466468334198, - "learning_rate": 4.968063514027258e-06, - "loss": 0.4274463951587677, - "mean_token_accuracy": 0.8387278318405151, - "num_tokens": 3488610.0, - "step": 390 - }, - { - "epoch": 0.29711246200607905, - "grad_norm": 2.6509406566619873, - "learning_rate": 4.967728948334784e-06, - "loss": 0.5401753783226013, - "mean_token_accuracy": 0.8252490162849426, - "num_tokens": 3493657.0, - "step": 391 - }, - { - "epoch": 0.2978723404255319, - "grad_norm": 1.6372219324111938, - "learning_rate": 4.967392650695141e-06, - "loss": 0.3862472176551819, - "mean_token_accuracy": 0.8555525541305542, - "num_tokens": 3505588.0, - "step": 392 - }, - { - "epoch": 0.2986322188449848, - "grad_norm": 2.1615452766418457, - "learning_rate": 4.967054621344356e-06, - "loss": 0.57850581407547, - "mean_token_accuracy": 0.8222678899765015, - "num_tokens": 3514396.0, - "step": 393 - }, - { - "epoch": 0.2993920972644377, - "grad_norm": 1.8610916137695312, - "learning_rate": 4.96671486051967e-06, - "loss": 0.5440595149993896, - "mean_token_accuracy": 0.8196715116500854, - "num_tokens": 3523604.0, - "step": 394 - }, - { - "epoch": 0.30015197568389057, - "grad_norm": 2.9585862159729004, - "learning_rate": 4.966373368459542e-06, - "loss": 0.6921588182449341, - "mean_token_accuracy": 0.7816659808158875, - "num_tokens": 3529849.0, - "step": 395 - }, - { - "epoch": 0.3009118541033435, - "grad_norm": 1.9374035596847534, - "learning_rate": 4.966030145403642e-06, - "loss": 0.5494055151939392, - "mean_token_accuracy": 0.8126792907714844, - "num_tokens": 3539529.0, - "step": 396 - }, - { - "epoch": 0.30167173252279633, - "grad_norm": 1.730530023574829, - "learning_rate": 4.965685191592859e-06, - "loss": 0.4271572232246399, - "mean_token_accuracy": 0.8383668661117554, - "num_tokens": 3550972.0, - "step": 397 - }, - { - "epoch": 0.30243161094224924, - "grad_norm": 3.9635560512542725, - "learning_rate": 4.9653385072692935e-06, - "loss": 0.5576210021972656, - "mean_token_accuracy": 0.799404501914978, - "num_tokens": 3554147.0, - "step": 398 - }, - { - "epoch": 0.30319148936170215, - "grad_norm": 2.5731968879699707, - "learning_rate": 4.964990092676263e-06, - "loss": 0.5478942394256592, - "mean_token_accuracy": 0.8220961093902588, - "num_tokens": 3559972.0, - "step": 399 - }, - { - "epoch": 0.303951367781155, - "grad_norm": 2.2096588611602783, - "learning_rate": 4.964639948058297e-06, - "loss": 0.35461270809173584, - "mean_token_accuracy": 0.8640927076339722, - "num_tokens": 3565770.0, - "step": 400 - }, - { - "epoch": 0.3047112462006079, - "grad_norm": 1.7874189615249634, - "learning_rate": 4.964288073661142e-06, - "loss": 0.38849619030952454, - "mean_token_accuracy": 0.8443037271499634, - "num_tokens": 3574514.0, - "step": 401 - }, - { - "epoch": 0.30547112462006076, - "grad_norm": 1.5583146810531616, - "learning_rate": 4.963934469731756e-06, - "loss": 0.48909449577331543, - "mean_token_accuracy": 0.8429768681526184, - "num_tokens": 3585877.0, - "step": 402 - }, - { - "epoch": 0.30623100303951367, - "grad_norm": 3.026599645614624, - "learning_rate": 4.963579136518312e-06, - "loss": 0.5138992071151733, - "mean_token_accuracy": 0.8283728361129761, - "num_tokens": 3590412.0, - "step": 403 - }, - { - "epoch": 0.3069908814589666, - "grad_norm": 2.777505874633789, - "learning_rate": 4.963222074270197e-06, - "loss": 0.6241534948348999, - "mean_token_accuracy": 0.8130464553833008, - "num_tokens": 3596246.0, - "step": 404 - }, - { - "epoch": 0.30775075987841943, - "grad_norm": 2.4772839546203613, - "learning_rate": 4.962863283238011e-06, - "loss": 0.5930814146995544, - "mean_token_accuracy": 0.8036394715309143, - "num_tokens": 3602878.0, - "step": 405 - }, - { - "epoch": 0.30851063829787234, - "grad_norm": 1.5049982070922852, - "learning_rate": 4.962502763673566e-06, - "loss": 0.4903082549571991, - "mean_token_accuracy": 0.8184912204742432, - "num_tokens": 3617018.0, - "step": 406 - }, - { - "epoch": 0.30927051671732525, - "grad_norm": 2.453155040740967, - "learning_rate": 4.96214051582989e-06, - "loss": 0.5138067603111267, - "mean_token_accuracy": 0.8336835503578186, - "num_tokens": 3624188.0, - "step": 407 - }, - { - "epoch": 0.3100303951367781, - "grad_norm": 2.4038336277008057, - "learning_rate": 4.961776539961222e-06, - "loss": 0.5752760171890259, - "mean_token_accuracy": 0.8054730892181396, - "num_tokens": 3634152.0, - "step": 408 - }, - { - "epoch": 0.310790273556231, - "grad_norm": 2.629068374633789, - "learning_rate": 4.961410836323014e-06, - "loss": 0.5580606460571289, - "mean_token_accuracy": 0.8121089935302734, - "num_tokens": 3639528.0, - "step": 409 - }, - { - "epoch": 0.31155015197568386, - "grad_norm": 1.4245928525924683, - "learning_rate": 4.961043405171931e-06, - "loss": 0.5399882793426514, - "mean_token_accuracy": 0.812280535697937, - "num_tokens": 3655744.0, - "step": 410 - }, - { - "epoch": 0.3123100303951368, - "grad_norm": 1.5236459970474243, - "learning_rate": 4.9606742467658505e-06, - "loss": 0.5234690308570862, - "mean_token_accuracy": 0.8188928365707397, - "num_tokens": 3675010.0, - "step": 411 - }, - { - "epoch": 0.3130699088145897, - "grad_norm": 2.27961802482605, - "learning_rate": 4.960303361363863e-06, - "loss": 0.5502505898475647, - "mean_token_accuracy": 0.8161963224411011, - "num_tokens": 3682328.0, - "step": 412 - }, - { - "epoch": 0.31382978723404253, - "grad_norm": 1.554518222808838, - "learning_rate": 4.959930749226269e-06, - "loss": 0.420867919921875, - "mean_token_accuracy": 0.8499157428741455, - "num_tokens": 3694980.0, - "step": 413 - }, - { - "epoch": 0.31458966565349544, - "grad_norm": 2.609218120574951, - "learning_rate": 4.9595564106145825e-06, - "loss": 0.4706704318523407, - "mean_token_accuracy": 0.8412490487098694, - "num_tokens": 3700033.0, - "step": 414 - }, - { - "epoch": 0.31534954407294835, - "grad_norm": 1.5303231477737427, - "learning_rate": 4.959180345791528e-06, - "loss": 0.4668654799461365, - "mean_token_accuracy": 0.8125015497207642, - "num_tokens": 3715012.0, - "step": 415 - }, - { - "epoch": 0.3161094224924012, - "grad_norm": 1.2774665355682373, - "learning_rate": 4.958802555021042e-06, - "loss": 0.4339369237422943, - "mean_token_accuracy": 0.8442851901054382, - "num_tokens": 3733928.0, - "step": 416 - }, - { - "epoch": 0.3168693009118541, - "grad_norm": 2.1240181922912598, - "learning_rate": 4.958423038568274e-06, - "loss": 0.4029104709625244, - "mean_token_accuracy": 0.8627674579620361, - "num_tokens": 3740202.0, - "step": 417 - }, - { - "epoch": 0.31762917933130697, - "grad_norm": 2.00538969039917, - "learning_rate": 4.958041796699583e-06, - "loss": 0.5229607820510864, - "mean_token_accuracy": 0.8282366394996643, - "num_tokens": 3749308.0, - "step": 418 - }, - { - "epoch": 0.3183890577507599, - "grad_norm": 2.6555092334747314, - "learning_rate": 4.957658829682539e-06, - "loss": 0.5344101190567017, - "mean_token_accuracy": 0.8183202743530273, - "num_tokens": 3754595.0, - "step": 419 - }, - { - "epoch": 0.3191489361702128, - "grad_norm": 1.7468839883804321, - "learning_rate": 4.9572741377859225e-06, - "loss": 0.5667245984077454, - "mean_token_accuracy": 0.8080123662948608, - "num_tokens": 3765761.0, - "step": 420 - }, - { - "epoch": 0.31990881458966564, - "grad_norm": 2.9612457752227783, - "learning_rate": 4.956887721279726e-06, - "loss": 0.5389559864997864, - "mean_token_accuracy": 0.8019476532936096, - "num_tokens": 3770844.0, - "step": 421 - }, - { - "epoch": 0.32066869300911854, - "grad_norm": 1.842403769493103, - "learning_rate": 4.95649958043515e-06, - "loss": 0.38279837369918823, - "mean_token_accuracy": 0.858866810798645, - "num_tokens": 3778094.0, - "step": 422 - }, - { - "epoch": 0.32142857142857145, - "grad_norm": 2.3108131885528564, - "learning_rate": 4.956109715524609e-06, - "loss": 0.5453893542289734, - "mean_token_accuracy": 0.8085013031959534, - "num_tokens": 3785015.0, - "step": 423 - }, - { - "epoch": 0.3221884498480243, - "grad_norm": 3.0326945781707764, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.5550523400306702, - "mean_token_accuracy": 0.8125876188278198, - "num_tokens": 3789830.0, - "step": 424 - }, - { - "epoch": 0.3229483282674772, - "grad_norm": 1.8851977586746216, - "learning_rate": 4.955324814601324e-06, - "loss": 0.4902324974536896, - "mean_token_accuracy": 0.8205406665802002, - "num_tokens": 3799862.0, - "step": 425 - }, - { - "epoch": 0.32370820668693007, - "grad_norm": 2.6018171310424805, - "learning_rate": 4.954929779139455e-06, - "loss": 0.5920133590698242, - "mean_token_accuracy": 0.8340690732002258, - "num_tokens": 3806617.0, - "step": 426 - }, - { - "epoch": 0.324468085106383, - "grad_norm": 2.4283878803253174, - "learning_rate": 4.954533020713367e-06, - "loss": 0.5305854082107544, - "mean_token_accuracy": 0.8137468099594116, - "num_tokens": 3813843.0, - "step": 427 - }, - { - "epoch": 0.3252279635258359, - "grad_norm": 2.667978525161743, - "learning_rate": 4.954134539601519e-06, - "loss": 0.5333638787269592, - "mean_token_accuracy": 0.8402629494667053, - "num_tokens": 3819450.0, - "step": 428 - }, - { - "epoch": 0.32598784194528874, - "grad_norm": 1.7302523851394653, - "learning_rate": 4.953734336083582e-06, - "loss": 0.422895610332489, - "mean_token_accuracy": 0.8709704875946045, - "num_tokens": 3831027.0, - "step": 429 - }, - { - "epoch": 0.32674772036474165, - "grad_norm": 2.427192211151123, - "learning_rate": 4.953332410440434e-06, - "loss": 0.6334598064422607, - "mean_token_accuracy": 0.7817479968070984, - "num_tokens": 3841776.0, - "step": 430 - }, - { - "epoch": 0.32750759878419455, - "grad_norm": 1.460949182510376, - "learning_rate": 4.952928762954161e-06, - "loss": 0.3654777705669403, - "mean_token_accuracy": 0.8780122995376587, - "num_tokens": 3852213.0, - "step": 431 - }, - { - "epoch": 0.3282674772036474, - "grad_norm": 1.9855005741119385, - "learning_rate": 4.952523393908059e-06, - "loss": 0.5117089748382568, - "mean_token_accuracy": 0.811911404132843, - "num_tokens": 3861176.0, - "step": 432 - }, - { - "epoch": 0.3290273556231003, - "grad_norm": 2.2653207778930664, - "learning_rate": 4.952116303586631e-06, - "loss": 0.42514950037002563, - "mean_token_accuracy": 0.8448518514633179, - "num_tokens": 3867164.0, - "step": 433 - }, - { - "epoch": 0.32978723404255317, - "grad_norm": 1.9780964851379395, - "learning_rate": 4.951707492275589e-06, - "loss": 0.5095293521881104, - "mean_token_accuracy": 0.8262748718261719, - "num_tokens": 3876406.0, - "step": 434 - }, - { - "epoch": 0.3305471124620061, - "grad_norm": 2.9480233192443848, - "learning_rate": 4.951296960261853e-06, - "loss": 0.3494448959827423, - "mean_token_accuracy": 0.8781307935714722, - "num_tokens": 3880298.0, - "step": 435 - }, - { - "epoch": 0.331306990881459, - "grad_norm": 2.335571527481079, - "learning_rate": 4.95088470783355e-06, - "loss": 0.5456914901733398, - "mean_token_accuracy": 0.816297173500061, - "num_tokens": 3886487.0, - "step": 436 - }, - { - "epoch": 0.33206686930091184, - "grad_norm": 2.3046419620513916, - "learning_rate": 4.950470735280013e-06, - "loss": 0.4835948944091797, - "mean_token_accuracy": 0.8539175391197205, - "num_tokens": 3892706.0, - "step": 437 - }, - { - "epoch": 0.33282674772036475, - "grad_norm": 2.44047474861145, - "learning_rate": 4.950055042891786e-06, - "loss": 0.5154092907905579, - "mean_token_accuracy": 0.8579919338226318, - "num_tokens": 3899532.0, - "step": 438 - }, - { - "epoch": 0.33358662613981765, - "grad_norm": 4.826764106750488, - "learning_rate": 4.949637630960618e-06, - "loss": 0.5270259976387024, - "mean_token_accuracy": 0.8172192573547363, - "num_tokens": 3902260.0, - "step": 439 - }, - { - "epoch": 0.3343465045592705, - "grad_norm": 2.001574754714966, - "learning_rate": 4.949218499779462e-06, - "loss": 0.5413002967834473, - "mean_token_accuracy": 0.8162837028503418, - "num_tokens": 3911706.0, - "step": 440 - }, - { - "epoch": 0.3351063829787234, - "grad_norm": 1.7998944520950317, - "learning_rate": 4.948797649642484e-06, - "loss": 0.5131614208221436, - "mean_token_accuracy": 0.8367440700531006, - "num_tokens": 3923490.0, - "step": 441 - }, - { - "epoch": 0.33586626139817627, - "grad_norm": 3.4566173553466797, - "learning_rate": 4.94837508084505e-06, - "loss": 0.7258909940719604, - "mean_token_accuracy": 0.771377444267273, - "num_tokens": 3928099.0, - "step": 442 - }, - { - "epoch": 0.3366261398176292, - "grad_norm": 2.0040442943573, - "learning_rate": 4.9479507936837364e-06, - "loss": 0.482135534286499, - "mean_token_accuracy": 0.8339327573776245, - "num_tokens": 3937328.0, - "step": 443 - }, - { - "epoch": 0.3373860182370821, - "grad_norm": 2.949502944946289, - "learning_rate": 4.947524788456325e-06, - "loss": 0.6474795341491699, - "mean_token_accuracy": 0.7951677441596985, - "num_tokens": 3942529.0, - "step": 444 - }, - { - "epoch": 0.33814589665653494, - "grad_norm": 1.5528364181518555, - "learning_rate": 4.947097065461801e-06, - "loss": 0.48791584372520447, - "mean_token_accuracy": 0.8425545692443848, - "num_tokens": 3955200.0, - "step": 445 - }, - { - "epoch": 0.33890577507598785, - "grad_norm": 1.8813284635543823, - "learning_rate": 4.946667625000358e-06, - "loss": 0.45922309160232544, - "mean_token_accuracy": 0.8206527233123779, - "num_tokens": 3962975.0, - "step": 446 - }, - { - "epoch": 0.33966565349544076, - "grad_norm": 1.7157847881317139, - "learning_rate": 4.946236467373392e-06, - "loss": 0.5454182028770447, - "mean_token_accuracy": 0.8049604892730713, - "num_tokens": 3973956.0, - "step": 447 - }, - { - "epoch": 0.3404255319148936, - "grad_norm": 2.008857250213623, - "learning_rate": 4.945803592883509e-06, - "loss": 0.5151860117912292, - "mean_token_accuracy": 0.8262045383453369, - "num_tokens": 3982853.0, - "step": 448 - }, - { - "epoch": 0.3411854103343465, - "grad_norm": 1.6632496118545532, - "learning_rate": 4.9453690018345144e-06, - "loss": 0.42710691690444946, - "mean_token_accuracy": 0.8521314859390259, - "num_tokens": 3993838.0, - "step": 449 - }, - { - "epoch": 0.34194528875379937, - "grad_norm": 1.365234375, - "learning_rate": 4.944932694531423e-06, - "loss": 0.5172526836395264, - "mean_token_accuracy": 0.8277045488357544, - "num_tokens": 4014179.0, - "step": 450 - }, - { - "epoch": 0.3427051671732523, - "grad_norm": 1.7610243558883667, - "learning_rate": 4.94449467128045e-06, - "loss": 0.42104798555374146, - "mean_token_accuracy": 0.8552065491676331, - "num_tokens": 4023663.0, - "step": 451 - }, - { - "epoch": 0.3434650455927052, - "grad_norm": 2.3732354640960693, - "learning_rate": 4.944054932389018e-06, - "loss": 0.5471175909042358, - "mean_token_accuracy": 0.8487317562103271, - "num_tokens": 4030100.0, - "step": 452 - }, - { - "epoch": 0.34422492401215804, - "grad_norm": 1.5973623991012573, - "learning_rate": 4.943613478165753e-06, - "loss": 0.419813871383667, - "mean_token_accuracy": 0.8484025001525879, - "num_tokens": 4041124.0, - "step": 453 - }, - { - "epoch": 0.34498480243161095, - "grad_norm": 2.966381549835205, - "learning_rate": 4.943170308920484e-06, - "loss": 0.5370652675628662, - "mean_token_accuracy": 0.8439491987228394, - "num_tokens": 4045675.0, - "step": 454 - }, - { - "epoch": 0.34574468085106386, - "grad_norm": 2.5097248554229736, - "learning_rate": 4.9427254249642445e-06, - "loss": 0.5776349306106567, - "mean_token_accuracy": 0.8060523867607117, - "num_tokens": 4053250.0, - "step": 455 - }, - { - "epoch": 0.3465045592705167, - "grad_norm": 1.6779125928878784, - "learning_rate": 4.942278826609272e-06, - "loss": 0.5245476961135864, - "mean_token_accuracy": 0.8168526887893677, - "num_tokens": 4064106.0, - "step": 456 - }, - { - "epoch": 0.3472644376899696, - "grad_norm": 1.5945546627044678, - "learning_rate": 4.9418305141690045e-06, - "loss": 0.4972047209739685, - "mean_token_accuracy": 0.8257735967636108, - "num_tokens": 4077687.0, - "step": 457 - }, - { - "epoch": 0.34802431610942247, - "grad_norm": 2.864778757095337, - "learning_rate": 4.9413804879580865e-06, - "loss": 0.5372499823570251, - "mean_token_accuracy": 0.8423776626586914, - "num_tokens": 4082632.0, - "step": 458 - }, - { - "epoch": 0.3487841945288754, - "grad_norm": 1.4797078371047974, - "learning_rate": 4.940928748292363e-06, - "loss": 0.5903409719467163, - "mean_token_accuracy": 0.8061295747756958, - "num_tokens": 4104218.0, - "step": 459 - }, - { - "epoch": 0.3495440729483283, - "grad_norm": 2.4376983642578125, - "learning_rate": 4.940475295488882e-06, - "loss": 0.4534894824028015, - "mean_token_accuracy": 0.8395825028419495, - "num_tokens": 4110530.0, - "step": 460 - }, - { - "epoch": 0.35030395136778114, - "grad_norm": 1.2955626249313354, - "learning_rate": 4.940020129865895e-06, - "loss": 0.47155818343162537, - "mean_token_accuracy": 0.8253582715988159, - "num_tokens": 4128398.0, - "step": 461 - }, - { - "epoch": 0.35106382978723405, - "grad_norm": 2.066575527191162, - "learning_rate": 4.9395632517428546e-06, - "loss": 0.5555641651153564, - "mean_token_accuracy": 0.814624547958374, - "num_tokens": 4137623.0, - "step": 462 - }, - { - "epoch": 0.3518237082066869, - "grad_norm": 1.6407525539398193, - "learning_rate": 4.939104661440415e-06, - "loss": 0.4361790418624878, - "mean_token_accuracy": 0.8544459342956543, - "num_tokens": 4152803.0, - "step": 463 - }, - { - "epoch": 0.3525835866261398, - "grad_norm": 2.1685116291046143, - "learning_rate": 4.938644359280433e-06, - "loss": 0.5347012877464294, - "mean_token_accuracy": 0.853853702545166, - "num_tokens": 4160778.0, - "step": 464 - }, - { - "epoch": 0.3533434650455927, - "grad_norm": 1.8824869394302368, - "learning_rate": 4.938182345585967e-06, - "loss": 0.5512481927871704, - "mean_token_accuracy": 0.7985891699790955, - "num_tokens": 4170380.0, - "step": 465 - }, - { - "epoch": 0.3541033434650456, - "grad_norm": 2.2229504585266113, - "learning_rate": 4.937718620681273e-06, - "loss": 0.516828179359436, - "mean_token_accuracy": 0.8265621066093445, - "num_tokens": 4178179.0, - "step": 466 - }, - { - "epoch": 0.3548632218844985, - "grad_norm": 1.955990195274353, - "learning_rate": 4.9372531848918145e-06, - "loss": 0.5586158037185669, - "mean_token_accuracy": 0.8367916345596313, - "num_tokens": 4188626.0, - "step": 467 - }, - { - "epoch": 0.3556231003039514, - "grad_norm": 1.9687023162841797, - "learning_rate": 4.936786038544251e-06, - "loss": 0.5517531633377075, - "mean_token_accuracy": 0.8134098052978516, - "num_tokens": 4198144.0, - "step": 468 - }, - { - "epoch": 0.35638297872340424, - "grad_norm": 1.405516505241394, - "learning_rate": 4.9363171819664434e-06, - "loss": 0.5305492877960205, - "mean_token_accuracy": 0.8014427423477173, - "num_tokens": 4222818.0, - "step": 469 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 2.6355695724487305, - "learning_rate": 4.9358466154874535e-06, - "loss": 0.5303391218185425, - "mean_token_accuracy": 0.8028861284255981, - "num_tokens": 4228318.0, - "step": 470 - }, - { - "epoch": 0.35790273556231, - "grad_norm": 1.5133824348449707, - "learning_rate": 4.935374339437543e-06, - "loss": 0.5329189300537109, - "mean_token_accuracy": 0.8479441404342651, - "num_tokens": 4244527.0, - "step": 471 - }, - { - "epoch": 0.3586626139817629, - "grad_norm": 3.4356725215911865, - "learning_rate": 4.934900354148173e-06, - "loss": 0.5431582927703857, - "mean_token_accuracy": 0.8328983783721924, - "num_tokens": 4248034.0, - "step": 472 - }, - { - "epoch": 0.3594224924012158, - "grad_norm": 2.5789499282836914, - "learning_rate": 4.934424659952006e-06, - "loss": 0.4141455292701721, - "mean_token_accuracy": 0.8658635020256042, - "num_tokens": 4252953.0, - "step": 473 - }, - { - "epoch": 0.3601823708206687, - "grad_norm": 1.145262598991394, - "learning_rate": 4.933947257182901e-06, - "loss": 0.40294092893600464, - "mean_token_accuracy": 0.8565847277641296, - "num_tokens": 4277813.0, - "step": 474 - }, - { - "epoch": 0.3609422492401216, - "grad_norm": 1.7242133617401123, - "learning_rate": 4.933468146175918e-06, - "loss": 0.6036738753318787, - "mean_token_accuracy": 0.8072597980499268, - "num_tokens": 4291088.0, - "step": 475 - }, - { - "epoch": 0.3617021276595745, - "grad_norm": 2.3490941524505615, - "learning_rate": 4.932987327267317e-06, - "loss": 0.49456146359443665, - "mean_token_accuracy": 0.8372673988342285, - "num_tokens": 4297376.0, - "step": 476 - }, - { - "epoch": 0.36246200607902734, - "grad_norm": 1.3605526685714722, - "learning_rate": 4.932504800794553e-06, - "loss": 0.43595948815345764, - "mean_token_accuracy": 0.8415953516960144, - "num_tokens": 4312054.0, - "step": 477 - }, - { - "epoch": 0.36322188449848025, - "grad_norm": 1.4525885581970215, - "learning_rate": 4.9320205670962815e-06, - "loss": 0.5390371680259705, - "mean_token_accuracy": 0.8101649284362793, - "num_tokens": 4328701.0, - "step": 478 - }, - { - "epoch": 0.3639817629179331, - "grad_norm": 1.9862419366836548, - "learning_rate": 4.931534626512359e-06, - "loss": 0.45436930656433105, - "mean_token_accuracy": 0.8352861404418945, - "num_tokens": 4338372.0, - "step": 479 - }, - { - "epoch": 0.364741641337386, - "grad_norm": 1.7804961204528809, - "learning_rate": 4.931046979383836e-06, - "loss": 0.4677754044532776, - "mean_token_accuracy": 0.840467095375061, - "num_tokens": 4347897.0, - "step": 480 - }, - { - "epoch": 0.3655015197568389, - "grad_norm": 2.066632032394409, - "learning_rate": 4.930557626052961e-06, - "loss": 0.42418140172958374, - "mean_token_accuracy": 0.8528275489807129, - "num_tokens": 4354061.0, - "step": 481 - }, - { - "epoch": 0.3662613981762918, - "grad_norm": 1.6155282258987427, - "learning_rate": 4.930066566863182e-06, - "loss": 0.5424284934997559, - "mean_token_accuracy": 0.825040876865387, - "num_tokens": 4370400.0, - "step": 482 - }, - { - "epoch": 0.3670212765957447, - "grad_norm": 2.1452953815460205, - "learning_rate": 4.929573802159143e-06, - "loss": 0.5105804204940796, - "mean_token_accuracy": 0.8284053802490234, - "num_tokens": 4377579.0, - "step": 483 - }, - { - "epoch": 0.3677811550151976, - "grad_norm": 1.8940945863723755, - "learning_rate": 4.929079332286685e-06, - "loss": 0.43478304147720337, - "mean_token_accuracy": 0.8505665063858032, - "num_tokens": 4385686.0, - "step": 484 - }, - { - "epoch": 0.36854103343465044, - "grad_norm": 1.6785860061645508, - "learning_rate": 4.928583157592846e-06, - "loss": 0.40227848291397095, - "mean_token_accuracy": 0.8623573780059814, - "num_tokens": 4396128.0, - "step": 485 - }, - { - "epoch": 0.36930091185410335, - "grad_norm": 1.6416733264923096, - "learning_rate": 4.928085278425862e-06, - "loss": 0.526267409324646, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 4407963.0, - "step": 486 - }, - { - "epoch": 0.3700607902735562, - "grad_norm": 1.8882389068603516, - "learning_rate": 4.927585695135162e-06, - "loss": 0.5555213093757629, - "mean_token_accuracy": 0.8115293979644775, - "num_tokens": 4418057.0, - "step": 487 - }, - { - "epoch": 0.3708206686930091, - "grad_norm": 2.300248384475708, - "learning_rate": 4.9270844080713735e-06, - "loss": 0.5812339186668396, - "mean_token_accuracy": 0.800270676612854, - "num_tokens": 4425358.0, - "step": 488 - }, - { - "epoch": 0.371580547112462, - "grad_norm": 1.6802922487258911, - "learning_rate": 4.926581417586319e-06, - "loss": 0.5134941935539246, - "mean_token_accuracy": 0.8247408866882324, - "num_tokens": 4437702.0, - "step": 489 - }, - { - "epoch": 0.3723404255319149, - "grad_norm": 1.7620291709899902, - "learning_rate": 4.926076724033016e-06, - "loss": 0.5233973264694214, - "mean_token_accuracy": 0.8102161884307861, - "num_tokens": 4448584.0, - "step": 490 - }, - { - "epoch": 0.3731003039513678, - "grad_norm": 1.6911998987197876, - "learning_rate": 4.925570327765678e-06, - "loss": 0.5337274074554443, - "mean_token_accuracy": 0.845306396484375, - "num_tokens": 4462651.0, - "step": 491 - }, - { - "epoch": 0.3738601823708207, - "grad_norm": 1.7991242408752441, - "learning_rate": 4.9250622291397144e-06, - "loss": 0.31018948554992676, - "mean_token_accuracy": 0.8857606053352356, - "num_tokens": 4469971.0, - "step": 492 - }, - { - "epoch": 0.37462006079027355, - "grad_norm": 4.9776835441589355, - "learning_rate": 4.924552428511727e-06, - "loss": 0.44114983081817627, - "mean_token_accuracy": 0.8429906368255615, - "num_tokens": 4478275.0, - "step": 493 - }, - { - "epoch": 0.37537993920972645, - "grad_norm": 1.8007272481918335, - "learning_rate": 4.924040926239515e-06, - "loss": 0.574328601360321, - "mean_token_accuracy": 0.7669196128845215, - "num_tokens": 4491551.0, - "step": 494 - }, - { - "epoch": 0.3761398176291793, - "grad_norm": 2.021300792694092, - "learning_rate": 4.92352772268207e-06, - "loss": 0.45636120438575745, - "mean_token_accuracy": 0.840438723564148, - "num_tokens": 4498658.0, - "step": 495 - }, - { - "epoch": 0.3768996960486322, - "grad_norm": 2.369748592376709, - "learning_rate": 4.923012818199576e-06, - "loss": 0.5206376910209656, - "mean_token_accuracy": 0.8521823287010193, - "num_tokens": 4504648.0, - "step": 496 - }, - { - "epoch": 0.3776595744680851, - "grad_norm": 2.733485221862793, - "learning_rate": 4.922496213153416e-06, - "loss": 0.5067723989486694, - "mean_token_accuracy": 0.8168281316757202, - "num_tokens": 4509990.0, - "step": 497 - }, - { - "epoch": 0.378419452887538, - "grad_norm": 2.3751676082611084, - "learning_rate": 4.921977907906161e-06, - "loss": 0.49757206439971924, - "mean_token_accuracy": 0.8325017690658569, - "num_tokens": 4518373.0, - "step": 498 - }, - { - "epoch": 0.3791793313069909, - "grad_norm": 2.1672775745391846, - "learning_rate": 4.921457902821578e-06, - "loss": 0.4237566590309143, - "mean_token_accuracy": 0.8404698371887207, - "num_tokens": 4524338.0, - "step": 499 - }, - { - "epoch": 0.3799392097264438, - "grad_norm": 1.8374360799789429, - "learning_rate": 4.9209361982646275e-06, - "loss": 0.4995468854904175, - "mean_token_accuracy": 0.8299649953842163, - "num_tokens": 4533396.0, - "step": 500 - }, - { - "epoch": 0.38069908814589665, - "grad_norm": 2.083967924118042, - "learning_rate": 4.920412794601461e-06, - "loss": 0.489935040473938, - "mean_token_accuracy": 0.8315291404724121, - "num_tokens": 4540941.0, - "step": 501 - }, - { - "epoch": 0.38145896656534956, - "grad_norm": 2.2075610160827637, - "learning_rate": 4.919887692199423e-06, - "loss": 0.5233147740364075, - "mean_token_accuracy": 0.804171085357666, - "num_tokens": 4548215.0, - "step": 502 - }, - { - "epoch": 0.3822188449848024, - "grad_norm": 2.076775312423706, - "learning_rate": 4.9193608914270515e-06, - "loss": 0.5785550475120544, - "mean_token_accuracy": 0.7993186116218567, - "num_tokens": 4558204.0, - "step": 503 - }, - { - "epoch": 0.3829787234042553, - "grad_norm": 2.238546133041382, - "learning_rate": 4.918832392654075e-06, - "loss": 0.5287384390830994, - "mean_token_accuracy": 0.8214945793151855, - "num_tokens": 4565407.0, - "step": 504 - }, - { - "epoch": 0.3837386018237082, - "grad_norm": 1.6783074140548706, - "learning_rate": 4.9183021962514145e-06, - "loss": 0.6063359379768372, - "mean_token_accuracy": 0.7914625406265259, - "num_tokens": 4580991.0, - "step": 505 - }, - { - "epoch": 0.3844984802431611, - "grad_norm": 1.6287449598312378, - "learning_rate": 4.917770302591183e-06, - "loss": 0.3598247766494751, - "mean_token_accuracy": 0.8706809878349304, - "num_tokens": 4590579.0, - "step": 506 - }, - { - "epoch": 0.385258358662614, - "grad_norm": 1.5432041883468628, - "learning_rate": 4.917236712046682e-06, - "loss": 0.5267890095710754, - "mean_token_accuracy": 0.8032117486000061, - "num_tokens": 4608380.0, - "step": 507 - }, - { - "epoch": 0.3860182370820669, - "grad_norm": 1.7664037942886353, - "learning_rate": 4.9167014249924075e-06, - "loss": 0.3552354574203491, - "mean_token_accuracy": 0.8569793701171875, - "num_tokens": 4616426.0, - "step": 508 - }, - { - "epoch": 0.38677811550151975, - "grad_norm": 2.1147472858428955, - "learning_rate": 4.916164441804044e-06, - "loss": 0.5212404727935791, - "mean_token_accuracy": 0.8196578025817871, - "num_tokens": 4623908.0, - "step": 509 - }, - { - "epoch": 0.38753799392097266, - "grad_norm": 2.1092333793640137, - "learning_rate": 4.915625762858467e-06, - "loss": 0.5197038650512695, - "mean_token_accuracy": 0.8245604634284973, - "num_tokens": 4630956.0, - "step": 510 - }, - { - "epoch": 0.3882978723404255, - "grad_norm": 1.23331880569458, - "learning_rate": 4.915085388533743e-06, - "loss": 0.4759839177131653, - "mean_token_accuracy": 0.8192248344421387, - "num_tokens": 4651269.0, - "step": 511 - }, - { - "epoch": 0.3890577507598784, - "grad_norm": 2.424199104309082, - "learning_rate": 4.914543319209126e-06, - "loss": 0.5576270818710327, - "mean_token_accuracy": 0.8203302621841431, - "num_tokens": 4657296.0, - "step": 512 - }, - { - "epoch": 0.3898176291793313, - "grad_norm": 2.725156307220459, - "learning_rate": 4.913999555265062e-06, - "loss": 0.4337949752807617, - "mean_token_accuracy": 0.8382406234741211, - "num_tokens": 4661850.0, - "step": 513 - }, - { - "epoch": 0.3905775075987842, - "grad_norm": 2.3120534420013428, - "learning_rate": 4.913454097083185e-06, - "loss": 0.4941597580909729, - "mean_token_accuracy": 0.8302834033966064, - "num_tokens": 4667769.0, - "step": 514 - }, - { - "epoch": 0.3913373860182371, - "grad_norm": 2.3111207485198975, - "learning_rate": 4.912906945046319e-06, - "loss": 0.5253715515136719, - "mean_token_accuracy": 0.84515380859375, - "num_tokens": 4674537.0, - "step": 515 - }, - { - "epoch": 0.39209726443769, - "grad_norm": 1.4117841720581055, - "learning_rate": 4.912358099538476e-06, - "loss": 0.4521017074584961, - "mean_token_accuracy": 0.8208256959915161, - "num_tokens": 4690605.0, - "step": 516 - }, - { - "epoch": 0.39285714285714285, - "grad_norm": 2.3742799758911133, - "learning_rate": 4.911807560944858e-06, - "loss": 0.41572901606559753, - "mean_token_accuracy": 0.8550551533699036, - "num_tokens": 4706437.0, - "step": 517 - }, - { - "epoch": 0.39361702127659576, - "grad_norm": 2.4052202701568604, - "learning_rate": 4.911255329651852e-06, - "loss": 0.6003736257553101, - "mean_token_accuracy": 0.8247885704040527, - "num_tokens": 4712746.0, - "step": 518 - }, - { - "epoch": 0.3943768996960486, - "grad_norm": 1.9335490465164185, - "learning_rate": 4.910701406047037e-06, - "loss": 0.5457713603973389, - "mean_token_accuracy": 0.787429690361023, - "num_tokens": 4731937.0, - "step": 519 - }, - { - "epoch": 0.3951367781155015, - "grad_norm": 2.257706880569458, - "learning_rate": 4.910145790519177e-06, - "loss": 0.5300652980804443, - "mean_token_accuracy": 0.8192912936210632, - "num_tokens": 4739422.0, - "step": 520 - }, - { - "epoch": 0.3958966565349544, - "grad_norm": 1.2099462747573853, - "learning_rate": 4.9095884834582256e-06, - "loss": 0.45872747898101807, - "mean_token_accuracy": 0.8362667560577393, - "num_tokens": 4757113.0, - "step": 521 - }, - { - "epoch": 0.3966565349544073, - "grad_norm": 2.7991135120391846, - "learning_rate": 4.909029485255321e-06, - "loss": 0.49039560556411743, - "mean_token_accuracy": 0.8260016441345215, - "num_tokens": 4761709.0, - "step": 522 - }, - { - "epoch": 0.3974164133738602, - "grad_norm": 2.2360129356384277, - "learning_rate": 4.90846879630279e-06, - "loss": 0.49556830525398254, - "mean_token_accuracy": 0.827864408493042, - "num_tokens": 4769048.0, - "step": 523 - }, - { - "epoch": 0.3981762917933131, - "grad_norm": 2.5953688621520996, - "learning_rate": 4.907906416994146e-06, - "loss": 0.387208491563797, - "mean_token_accuracy": 0.8467001914978027, - "num_tokens": 4774637.0, - "step": 524 - }, - { - "epoch": 0.39893617021276595, - "grad_norm": 2.1046814918518066, - "learning_rate": 4.907342347724088e-06, - "loss": 0.5477259755134583, - "mean_token_accuracy": 0.8060322999954224, - "num_tokens": 4782774.0, - "step": 525 - }, - { - "epoch": 0.39969604863221886, - "grad_norm": 2.5622646808624268, - "learning_rate": 4.906776588888502e-06, - "loss": 0.5684159398078918, - "mean_token_accuracy": 0.8095303177833557, - "num_tokens": 4788766.0, - "step": 526 - }, - { - "epoch": 0.4004559270516717, - "grad_norm": 1.9027913808822632, - "learning_rate": 4.906209140884459e-06, - "loss": 0.535524845123291, - "mean_token_accuracy": 0.815237820148468, - "num_tokens": 4798492.0, - "step": 527 - }, - { - "epoch": 0.4012158054711246, - "grad_norm": 2.1447622776031494, - "learning_rate": 4.905640004110216e-06, - "loss": 0.5628632307052612, - "mean_token_accuracy": 0.8085395097732544, - "num_tokens": 4805737.0, - "step": 528 - }, - { - "epoch": 0.40197568389057753, - "grad_norm": 1.6754741668701172, - "learning_rate": 4.905069178965215e-06, - "loss": 0.5046736598014832, - "mean_token_accuracy": 0.8247535228729248, - "num_tokens": 4816912.0, - "step": 529 - }, - { - "epoch": 0.4027355623100304, - "grad_norm": 2.271230459213257, - "learning_rate": 4.904496665850083e-06, - "loss": 0.6086187958717346, - "mean_token_accuracy": 0.7935276627540588, - "num_tokens": 4824577.0, - "step": 530 - }, - { - "epoch": 0.4034954407294833, - "grad_norm": 2.107595205307007, - "learning_rate": 4.903922465166633e-06, - "loss": 0.5431341528892517, - "mean_token_accuracy": 0.8129537105560303, - "num_tokens": 4831772.0, - "step": 531 - }, - { - "epoch": 0.40425531914893614, - "grad_norm": 1.3860732316970825, - "learning_rate": 4.903346577317859e-06, - "loss": 0.45816320180892944, - "mean_token_accuracy": 0.8328287601470947, - "num_tokens": 4850302.0, - "step": 532 - }, - { - "epoch": 0.40501519756838905, - "grad_norm": 1.9186837673187256, - "learning_rate": 4.902769002707942e-06, - "loss": 0.3294633626937866, - "mean_token_accuracy": 0.8853933811187744, - "num_tokens": 4856624.0, - "step": 533 - }, - { - "epoch": 0.40577507598784196, - "grad_norm": 1.516194462776184, - "learning_rate": 4.902189741742247e-06, - "loss": 0.45482105016708374, - "mean_token_accuracy": 0.8370342254638672, - "num_tokens": 4870395.0, - "step": 534 - }, - { - "epoch": 0.4065349544072948, - "grad_norm": 2.3235628604888916, - "learning_rate": 4.901608794827321e-06, - "loss": 0.40688639879226685, - "mean_token_accuracy": 0.8643521666526794, - "num_tokens": 4875645.0, - "step": 535 - }, - { - "epoch": 0.4072948328267477, - "grad_norm": 2.29286527633667, - "learning_rate": 4.9010261623708945e-06, - "loss": 0.45482826232910156, - "mean_token_accuracy": 0.8429383039474487, - "num_tokens": 4881772.0, - "step": 536 - }, - { - "epoch": 0.40805471124620063, - "grad_norm": 1.5907070636749268, - "learning_rate": 4.900441844781882e-06, - "loss": 0.5266948342323303, - "mean_token_accuracy": 0.8348641395568848, - "num_tokens": 4894289.0, - "step": 537 - }, - { - "epoch": 0.4088145896656535, - "grad_norm": 2.1816294193267822, - "learning_rate": 4.89985584247038e-06, - "loss": 0.4797617793083191, - "mean_token_accuracy": 0.8549500703811646, - "num_tokens": 4901106.0, - "step": 538 - }, - { - "epoch": 0.4095744680851064, - "grad_norm": 1.7347146272659302, - "learning_rate": 4.899268155847667e-06, - "loss": 0.4754739999771118, - "mean_token_accuracy": 0.8278418183326721, - "num_tokens": 4912131.0, - "step": 539 - }, - { - "epoch": 0.41033434650455924, - "grad_norm": 2.0694527626037598, - "learning_rate": 4.898678785326205e-06, - "loss": 0.5071008801460266, - "mean_token_accuracy": 0.8157946467399597, - "num_tokens": 4921141.0, - "step": 540 - }, - { - "epoch": 0.41109422492401215, - "grad_norm": 2.570047616958618, - "learning_rate": 4.898087731319637e-06, - "loss": 0.43639278411865234, - "mean_token_accuracy": 0.8682913780212402, - "num_tokens": 4926182.0, - "step": 541 - }, - { - "epoch": 0.41185410334346506, - "grad_norm": 4.064006805419922, - "learning_rate": 4.8974949942427854e-06, - "loss": 0.539260745048523, - "mean_token_accuracy": 0.8225528001785278, - "num_tokens": 4929449.0, - "step": 542 - }, - { - "epoch": 0.4126139817629179, - "grad_norm": 1.7644332647323608, - "learning_rate": 4.896900574511657e-06, - "loss": 0.472618043422699, - "mean_token_accuracy": 0.8332902193069458, - "num_tokens": 4939443.0, - "step": 543 - }, - { - "epoch": 0.4133738601823708, - "grad_norm": 2.879918336868286, - "learning_rate": 4.89630447254344e-06, - "loss": 0.6360667943954468, - "mean_token_accuracy": 0.8215296268463135, - "num_tokens": 4950838.0, - "step": 544 - }, - { - "epoch": 0.41413373860182373, - "grad_norm": 1.4575570821762085, - "learning_rate": 4.8957066887565005e-06, - "loss": 0.45617997646331787, - "mean_token_accuracy": 0.8373187184333801, - "num_tokens": 4965222.0, - "step": 545 - }, - { - "epoch": 0.4148936170212766, - "grad_norm": 2.4829535484313965, - "learning_rate": 4.895107223570386e-06, - "loss": 0.42285341024398804, - "mean_token_accuracy": 0.8686380386352539, - "num_tokens": 4970724.0, - "step": 546 - }, - { - "epoch": 0.4156534954407295, - "grad_norm": 2.639474630355835, - "learning_rate": 4.894506077405824e-06, - "loss": 0.5906289219856262, - "mean_token_accuracy": 0.8174435496330261, - "num_tokens": 4976766.0, - "step": 547 - }, - { - "epoch": 0.41641337386018235, - "grad_norm": 2.7960562705993652, - "learning_rate": 4.893903250684723e-06, - "loss": 0.4518949091434479, - "mean_token_accuracy": 0.8387585282325745, - "num_tokens": 4980991.0, - "step": 548 - }, - { - "epoch": 0.41717325227963525, - "grad_norm": 2.184176206588745, - "learning_rate": 4.893298743830168e-06, - "loss": 0.5223842859268188, - "mean_token_accuracy": 0.8170937299728394, - "num_tokens": 4987781.0, - "step": 549 - }, - { - "epoch": 0.41793313069908816, - "grad_norm": 2.2393438816070557, - "learning_rate": 4.892692557266429e-06, - "loss": 0.5238431692123413, - "mean_token_accuracy": 0.8217905759811401, - "num_tokens": 4994321.0, - "step": 550 - }, - { - "epoch": 0.418693009118541, - "grad_norm": 3.579047441482544, - "learning_rate": 4.8920846914189465e-06, - "loss": 0.5367584228515625, - "mean_token_accuracy": 0.8312011361122131, - "num_tokens": 4997951.0, - "step": 551 - }, - { - "epoch": 0.4194528875379939, - "grad_norm": 1.6330240964889526, - "learning_rate": 4.891475146714348e-06, - "loss": 0.6054705381393433, - "mean_token_accuracy": 0.7938206791877747, - "num_tokens": 5012726.0, - "step": 552 - }, - { - "epoch": 0.42021276595744683, - "grad_norm": 1.5775716304779053, - "learning_rate": 4.8908639235804324e-06, - "loss": 0.4774656891822815, - "mean_token_accuracy": 0.828762948513031, - "num_tokens": 5026751.0, - "step": 553 - }, - { - "epoch": 0.4209726443768997, - "grad_norm": 1.5719101428985596, - "learning_rate": 4.890251022446181e-06, - "loss": 0.549429178237915, - "mean_token_accuracy": 0.8110791444778442, - "num_tokens": 5041861.0, - "step": 554 - }, - { - "epoch": 0.4217325227963526, - "grad_norm": 1.8585275411605835, - "learning_rate": 4.889636443741752e-06, - "loss": 0.4448118805885315, - "mean_token_accuracy": 0.8462690711021423, - "num_tokens": 5052690.0, - "step": 555 - }, - { - "epoch": 0.42249240121580545, - "grad_norm": 2.189202070236206, - "learning_rate": 4.88902018789848e-06, - "loss": 0.4296762943267822, - "mean_token_accuracy": 0.8488791584968567, - "num_tokens": 5058964.0, - "step": 556 - }, - { - "epoch": 0.42325227963525835, - "grad_norm": 1.9328460693359375, - "learning_rate": 4.888402255348877e-06, - "loss": 0.5369474291801453, - "mean_token_accuracy": 0.8184729814529419, - "num_tokens": 5068465.0, - "step": 557 - }, - { - "epoch": 0.42401215805471126, - "grad_norm": 1.6233323812484741, - "learning_rate": 4.887782646526631e-06, - "loss": 0.5284391641616821, - "mean_token_accuracy": 0.8276044726371765, - "num_tokens": 5081052.0, - "step": 558 - }, - { - "epoch": 0.4247720364741641, - "grad_norm": 2.222813844680786, - "learning_rate": 4.887161361866608e-06, - "loss": 0.5679137706756592, - "mean_token_accuracy": 0.8012375831604004, - "num_tokens": 5090001.0, - "step": 559 - }, - { - "epoch": 0.425531914893617, - "grad_norm": 2.1062207221984863, - "learning_rate": 4.8865384018048494e-06, - "loss": 0.5554201602935791, - "mean_token_accuracy": 0.8128066062927246, - "num_tokens": 5097644.0, - "step": 560 - }, - { - "epoch": 0.42629179331306993, - "grad_norm": 1.5380984544754028, - "learning_rate": 4.8859137667785735e-06, - "loss": 0.4948265850543976, - "mean_token_accuracy": 0.8258291482925415, - "num_tokens": 5110069.0, - "step": 561 - }, - { - "epoch": 0.4270516717325228, - "grad_norm": 2.0290257930755615, - "learning_rate": 4.8852874572261715e-06, - "loss": 0.4969530403614044, - "mean_token_accuracy": 0.8297134637832642, - "num_tokens": 5117452.0, - "step": 562 - }, - { - "epoch": 0.4278115501519757, - "grad_norm": 1.5651452541351318, - "learning_rate": 4.884659473587213e-06, - "loss": 0.5353102087974548, - "mean_token_accuracy": 0.8161719441413879, - "num_tokens": 5133756.0, - "step": 563 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 2.2470998764038086, - "learning_rate": 4.884029816302441e-06, - "loss": 0.5104288458824158, - "mean_token_accuracy": 0.8081635236740112, - "num_tokens": 5140278.0, - "step": 564 - }, - { - "epoch": 0.42933130699088146, - "grad_norm": 1.726891279220581, - "learning_rate": 4.883398485813772e-06, - "loss": 0.4508771002292633, - "mean_token_accuracy": 0.8548800349235535, - "num_tokens": 5150115.0, - "step": 565 - }, - { - "epoch": 0.43009118541033436, - "grad_norm": 1.4779289960861206, - "learning_rate": 4.8827654825642984e-06, - "loss": 0.46861088275909424, - "mean_token_accuracy": 0.8209476470947266, - "num_tokens": 5163225.0, - "step": 566 - }, - { - "epoch": 0.4308510638297872, - "grad_norm": 1.2361034154891968, - "learning_rate": 4.882130806998287e-06, - "loss": 0.4591076672077179, - "mean_token_accuracy": 0.803041934967041, - "num_tokens": 5180342.0, - "step": 567 - }, - { - "epoch": 0.4316109422492401, - "grad_norm": 1.882467269897461, - "learning_rate": 4.881494459561177e-06, - "loss": 0.579258143901825, - "mean_token_accuracy": 0.8007112741470337, - "num_tokens": 5189595.0, - "step": 568 - }, - { - "epoch": 0.43237082066869303, - "grad_norm": 1.095462441444397, - "learning_rate": 4.880856440699582e-06, - "loss": 0.3806574046611786, - "mean_token_accuracy": 0.8650111556053162, - "num_tokens": 5211642.0, - "step": 569 - }, - { - "epoch": 0.4331306990881459, - "grad_norm": 1.6469846963882446, - "learning_rate": 4.880216750861288e-06, - "loss": 0.544589638710022, - "mean_token_accuracy": 0.8060122728347778, - "num_tokens": 5224137.0, - "step": 570 - }, - { - "epoch": 0.4338905775075988, - "grad_norm": 1.8561251163482666, - "learning_rate": 4.879575390495254e-06, - "loss": 0.4094924330711365, - "mean_token_accuracy": 0.8591406345367432, - "num_tokens": 5231588.0, - "step": 571 - }, - { - "epoch": 0.43465045592705165, - "grad_norm": 3.01326847076416, - "learning_rate": 4.878932360051611e-06, - "loss": 0.6139192581176758, - "mean_token_accuracy": 0.8108739852905273, - "num_tokens": 5236853.0, - "step": 572 - }, - { - "epoch": 0.43541033434650456, - "grad_norm": 2.1753034591674805, - "learning_rate": 4.878287659981663e-06, - "loss": 0.49082931876182556, - "mean_token_accuracy": 0.862828254699707, - "num_tokens": 5243264.0, - "step": 573 - }, - { - "epoch": 0.43617021276595747, - "grad_norm": 1.4437755346298218, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.5608728528022766, - "mean_token_accuracy": 0.8271626234054565, - "num_tokens": 5261757.0, - "step": 574 - }, - { - "epoch": 0.4369300911854103, - "grad_norm": 1.786683440208435, - "learning_rate": 4.876993252773923e-06, - "loss": 0.4377627968788147, - "mean_token_accuracy": 0.844936192035675, - "num_tokens": 5271038.0, - "step": 575 - }, - { - "epoch": 0.4376899696048632, - "grad_norm": 1.3425915241241455, - "learning_rate": 4.876343546544596e-06, - "loss": 0.44762521982192993, - "mean_token_accuracy": 0.8397793769836426, - "num_tokens": 5285555.0, - "step": 576 - }, - { - "epoch": 0.43844984802431614, - "grad_norm": 2.1549675464630127, - "learning_rate": 4.8756921725058935e-06, - "loss": 0.5332942008972168, - "mean_token_accuracy": 0.820149302482605, - "num_tokens": 5294595.0, - "step": 577 - }, - { - "epoch": 0.439209726443769, - "grad_norm": 1.5254042148590088, - "learning_rate": 4.875039131114975e-06, - "loss": 0.3646543622016907, - "mean_token_accuracy": 0.8442583084106445, - "num_tokens": 5304955.0, - "step": 578 - }, - { - "epoch": 0.4399696048632219, - "grad_norm": 1.5751557350158691, - "learning_rate": 4.8743844228301676e-06, - "loss": 0.4854734539985657, - "mean_token_accuracy": 0.8317523002624512, - "num_tokens": 5317351.0, - "step": 579 - }, - { - "epoch": 0.44072948328267475, - "grad_norm": 1.6950466632843018, - "learning_rate": 4.873728048110973e-06, - "loss": 0.5907570719718933, - "mean_token_accuracy": 0.7946986556053162, - "num_tokens": 5332542.0, - "step": 580 - }, - { - "epoch": 0.44148936170212766, - "grad_norm": 2.1180708408355713, - "learning_rate": 4.873070007418059e-06, - "loss": 0.5220296382904053, - "mean_token_accuracy": 0.8037363290786743, - "num_tokens": 5341722.0, - "step": 581 - }, - { - "epoch": 0.44224924012158057, - "grad_norm": 1.3643816709518433, - "learning_rate": 4.872410301213265e-06, - "loss": 0.4865502417087555, - "mean_token_accuracy": 0.8377852439880371, - "num_tokens": 5359359.0, - "step": 582 - }, - { - "epoch": 0.4430091185410334, - "grad_norm": 1.483280897140503, - "learning_rate": 4.871748929959598e-06, - "loss": 0.36856764554977417, - "mean_token_accuracy": 0.8709549903869629, - "num_tokens": 5369749.0, - "step": 583 - }, - { - "epoch": 0.44376899696048633, - "grad_norm": 1.6891541481018066, - "learning_rate": 4.871085894121234e-06, - "loss": 0.5768930912017822, - "mean_token_accuracy": 0.8030461668968201, - "num_tokens": 5383912.0, - "step": 584 - }, - { - "epoch": 0.44452887537993924, - "grad_norm": 2.1318740844726562, - "learning_rate": 4.870421194163515e-06, - "loss": 0.4337100386619568, - "mean_token_accuracy": 0.8562518358230591, - "num_tokens": 5389412.0, - "step": 585 - }, - { - "epoch": 0.4452887537993921, - "grad_norm": 2.540255546569824, - "learning_rate": 4.869754830552956e-06, - "loss": 0.4708256125450134, - "mean_token_accuracy": 0.8446552753448486, - "num_tokens": 5394762.0, - "step": 586 - }, - { - "epoch": 0.446048632218845, - "grad_norm": 2.048015594482422, - "learning_rate": 4.869086803757235e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8181137442588806, - "num_tokens": 5402379.0, - "step": 587 - }, - { - "epoch": 0.44680851063829785, - "grad_norm": 2.9821012020111084, - "learning_rate": 4.868417114245199e-06, - "loss": 0.6299797296524048, - "mean_token_accuracy": 0.8237329125404358, - "num_tokens": 5408229.0, - "step": 588 - }, - { - "epoch": 0.44756838905775076, - "grad_norm": 1.7807202339172363, - "learning_rate": 4.867745762486862e-06, - "loss": 0.5176759958267212, - "mean_token_accuracy": 0.8184244632720947, - "num_tokens": 5418383.0, - "step": 589 - }, - { - "epoch": 0.44832826747720367, - "grad_norm": 1.5466399192810059, - "learning_rate": 4.8670727489534035e-06, - "loss": 0.5137228965759277, - "mean_token_accuracy": 0.8365053534507751, - "num_tokens": 5432127.0, - "step": 590 - }, - { - "epoch": 0.4490881458966565, - "grad_norm": 2.9521141052246094, - "learning_rate": 4.866398074117173e-06, - "loss": 0.4056887924671173, - "mean_token_accuracy": 0.8561501502990723, - "num_tokens": 5436062.0, - "step": 591 - }, - { - "epoch": 0.44984802431610943, - "grad_norm": 2.058743953704834, - "learning_rate": 4.86572173845168e-06, - "loss": 0.6124799251556396, - "mean_token_accuracy": 0.8007957339286804, - "num_tokens": 5444989.0, - "step": 592 - }, - { - "epoch": 0.4506079027355623, - "grad_norm": 2.1243767738342285, - "learning_rate": 4.865043742431605e-06, - "loss": 0.5659694671630859, - "mean_token_accuracy": 0.8084750175476074, - "num_tokens": 5453865.0, - "step": 593 - }, - { - "epoch": 0.4513677811550152, - "grad_norm": 1.6732314825057983, - "learning_rate": 4.864364086532792e-06, - "loss": 0.47879064083099365, - "mean_token_accuracy": 0.8346436023712158, - "num_tokens": 5466398.0, - "step": 594 - }, - { - "epoch": 0.4521276595744681, - "grad_norm": 1.3793858289718628, - "learning_rate": 4.863682771232249e-06, - "loss": 0.45989373326301575, - "mean_token_accuracy": 0.8254791498184204, - "num_tokens": 5482121.0, - "step": 595 - }, - { - "epoch": 0.45288753799392095, - "grad_norm": 1.9812315702438354, - "learning_rate": 4.862999797008149e-06, - "loss": 0.5778874754905701, - "mean_token_accuracy": 0.8041508197784424, - "num_tokens": 5493000.0, - "step": 596 - }, - { - "epoch": 0.45364741641337386, - "grad_norm": 3.3065083026885986, - "learning_rate": 4.862315164339829e-06, - "loss": 0.4623975157737732, - "mean_token_accuracy": 0.8426318168640137, - "num_tokens": 5496723.0, - "step": 597 - }, - { - "epoch": 0.45440729483282677, - "grad_norm": 3.167119026184082, - "learning_rate": 4.861628873707792e-06, - "loss": 0.6984533667564392, - "mean_token_accuracy": 0.772136926651001, - "num_tokens": 5501161.0, - "step": 598 - }, - { - "epoch": 0.4551671732522796, - "grad_norm": 2.2130985260009766, - "learning_rate": 4.860940925593703e-06, - "loss": 0.4823192059993744, - "mean_token_accuracy": 0.8462972640991211, - "num_tokens": 5509544.0, - "step": 599 - }, - { - "epoch": 0.45592705167173253, - "grad_norm": 3.029191732406616, - "learning_rate": 4.86025132048039e-06, - "loss": 0.523664116859436, - "mean_token_accuracy": 0.8229140043258667, - "num_tokens": 5514586.0, - "step": 600 - }, - { - "epoch": 0.4566869300911854, - "grad_norm": 1.6983962059020996, - "learning_rate": 4.859560058851844e-06, - "loss": 0.4832698106765747, - "mean_token_accuracy": 0.8403248190879822, - "num_tokens": 5525773.0, - "step": 601 - }, - { - "epoch": 0.4574468085106383, - "grad_norm": 3.0504038333892822, - "learning_rate": 4.8588671411932195e-06, - "loss": 0.5158926248550415, - "mean_token_accuracy": 0.8098392486572266, - "num_tokens": 5529739.0, - "step": 602 - }, - { - "epoch": 0.4582066869300912, - "grad_norm": 2.584836483001709, - "learning_rate": 4.858172567990832e-06, - "loss": 0.5724587440490723, - "mean_token_accuracy": 0.8128519058227539, - "num_tokens": 5535763.0, - "step": 603 - }, - { - "epoch": 0.45896656534954405, - "grad_norm": 2.0514042377471924, - "learning_rate": 4.857476339732162e-06, - "loss": 0.4337679445743561, - "mean_token_accuracy": 0.8405929207801819, - "num_tokens": 5543075.0, - "step": 604 - }, - { - "epoch": 0.45972644376899696, - "grad_norm": 2.2949347496032715, - "learning_rate": 4.856778456905846e-06, - "loss": 0.46532145142555237, - "mean_token_accuracy": 0.8345137238502502, - "num_tokens": 5549035.0, - "step": 605 - }, - { - "epoch": 0.46048632218844987, - "grad_norm": 2.2067551612854004, - "learning_rate": 4.856078920001689e-06, - "loss": 0.5855136513710022, - "mean_token_accuracy": 0.8043795228004456, - "num_tokens": 5555545.0, - "step": 606 - }, - { - "epoch": 0.4612462006079027, - "grad_norm": 2.101945161819458, - "learning_rate": 4.855377729510648e-06, - "loss": 0.6071814298629761, - "mean_token_accuracy": 0.7973253130912781, - "num_tokens": 5563615.0, - "step": 607 - }, - { - "epoch": 0.46200607902735563, - "grad_norm": 2.5958821773529053, - "learning_rate": 4.8546748859248504e-06, - "loss": 0.6278061866760254, - "mean_token_accuracy": 0.7864972352981567, - "num_tokens": 5570078.0, - "step": 608 - }, - { - "epoch": 0.4627659574468085, - "grad_norm": 2.778101921081543, - "learning_rate": 4.853970389737576e-06, - "loss": 0.35521194338798523, - "mean_token_accuracy": 0.8752605319023132, - "num_tokens": 5573995.0, - "step": 609 - }, - { - "epoch": 0.4635258358662614, - "grad_norm": 2.600534677505493, - "learning_rate": 4.8532642414432675e-06, - "loss": 0.6541563868522644, - "mean_token_accuracy": 0.7843613028526306, - "num_tokens": 5580333.0, - "step": 610 - }, - { - "epoch": 0.4642857142857143, - "grad_norm": 1.778337836265564, - "learning_rate": 4.852556441537528e-06, - "loss": 0.3561405837535858, - "mean_token_accuracy": 0.8579353094100952, - "num_tokens": 5588430.0, - "step": 611 - }, - { - "epoch": 0.46504559270516715, - "grad_norm": 1.5653862953186035, - "learning_rate": 4.851846990517118e-06, - "loss": 0.6067906618118286, - "mean_token_accuracy": 0.7919317483901978, - "num_tokens": 5601700.0, - "step": 612 - }, - { - "epoch": 0.46580547112462006, - "grad_norm": 1.6097723245620728, - "learning_rate": 4.851135888879958e-06, - "loss": 0.446664422750473, - "mean_token_accuracy": 0.8441969156265259, - "num_tokens": 5612063.0, - "step": 613 - }, - { - "epoch": 0.46656534954407297, - "grad_norm": 1.961207389831543, - "learning_rate": 4.850423137125126e-06, - "loss": 0.5508605241775513, - "mean_token_accuracy": 0.8240450024604797, - "num_tokens": 5620245.0, - "step": 614 - }, - { - "epoch": 0.4673252279635258, - "grad_norm": 2.2189085483551025, - "learning_rate": 4.8497087357528585e-06, - "loss": 0.6805076599121094, - "mean_token_accuracy": 0.771978497505188, - "num_tokens": 5629590.0, - "step": 615 - }, - { - "epoch": 0.46808510638297873, - "grad_norm": 2.5176279544830322, - "learning_rate": 4.8489926852645505e-06, - "loss": 0.4512156844139099, - "mean_token_accuracy": 0.836459755897522, - "num_tokens": 5635259.0, - "step": 616 - }, - { - "epoch": 0.4688449848024316, - "grad_norm": 1.5327287912368774, - "learning_rate": 4.848274986162754e-06, - "loss": 0.4884302616119385, - "mean_token_accuracy": 0.8194037079811096, - "num_tokens": 5649993.0, - "step": 617 - }, - { - "epoch": 0.4696048632218845, - "grad_norm": 2.184554100036621, - "learning_rate": 4.847555638951177e-06, - "loss": 0.5141451358795166, - "mean_token_accuracy": 0.8245922327041626, - "num_tokens": 5657375.0, - "step": 618 - }, - { - "epoch": 0.4703647416413374, - "grad_norm": 1.6143407821655273, - "learning_rate": 4.846834644134686e-06, - "loss": 0.4276641607284546, - "mean_token_accuracy": 0.8481845855712891, - "num_tokens": 5667941.0, - "step": 619 - }, - { - "epoch": 0.47112462006079026, - "grad_norm": 2.3747270107269287, - "learning_rate": 4.846112002219301e-06, - "loss": 0.5608246922492981, - "mean_token_accuracy": 0.8073011040687561, - "num_tokens": 5675042.0, - "step": 620 - }, - { - "epoch": 0.47188449848024316, - "grad_norm": 2.390404224395752, - "learning_rate": 4.845387713712203e-06, - "loss": 0.46616724133491516, - "mean_token_accuracy": 0.8468319177627563, - "num_tokens": 5680207.0, - "step": 621 - }, - { - "epoch": 0.4726443768996961, - "grad_norm": 1.7245099544525146, - "learning_rate": 4.844661779121723e-06, - "loss": 0.5652435421943665, - "mean_token_accuracy": 0.8010749816894531, - "num_tokens": 5693759.0, - "step": 622 - }, - { - "epoch": 0.4734042553191489, - "grad_norm": 2.6923108100891113, - "learning_rate": 4.843934198957351e-06, - "loss": 0.6254661679267883, - "mean_token_accuracy": 0.8236024975776672, - "num_tokens": 5699916.0, - "step": 623 - }, - { - "epoch": 0.47416413373860183, - "grad_norm": 2.516901969909668, - "learning_rate": 4.84320497372973e-06, - "loss": 0.6334252953529358, - "mean_token_accuracy": 0.7803834676742554, - "num_tokens": 5706554.0, - "step": 624 - }, - { - "epoch": 0.4749240121580547, - "grad_norm": 2.3744447231292725, - "learning_rate": 4.842474103950658e-06, - "loss": 0.4221811890602112, - "mean_token_accuracy": 0.8639545440673828, - "num_tokens": 5711756.0, - "step": 625 - }, - { - "epoch": 0.4756838905775076, - "grad_norm": 3.2373476028442383, - "learning_rate": 4.841741590133089e-06, - "loss": 0.6637828946113586, - "mean_token_accuracy": 0.7968347072601318, - "num_tokens": 5716458.0, - "step": 626 - }, - { - "epoch": 0.4764437689969605, - "grad_norm": 2.153888463973999, - "learning_rate": 4.841007432791129e-06, - "loss": 0.4877486228942871, - "mean_token_accuracy": 0.8345249891281128, - "num_tokens": 5723155.0, - "step": 627 - }, - { - "epoch": 0.47720364741641336, - "grad_norm": 2.120497703552246, - "learning_rate": 4.8402716324400375e-06, - "loss": 0.37323033809661865, - "mean_token_accuracy": 0.8734050393104553, - "num_tokens": 5729171.0, - "step": 628 - }, - { - "epoch": 0.47796352583586627, - "grad_norm": 1.5294172763824463, - "learning_rate": 4.839534189596228e-06, - "loss": 0.4057067334651947, - "mean_token_accuracy": 0.8523319959640503, - "num_tokens": 5740112.0, - "step": 629 - }, - { - "epoch": 0.4787234042553192, - "grad_norm": 2.1913886070251465, - "learning_rate": 4.8387951047772656e-06, - "loss": 0.4835960865020752, - "mean_token_accuracy": 0.8438145518302917, - "num_tokens": 5746838.0, - "step": 630 - }, - { - "epoch": 0.479483282674772, - "grad_norm": 1.482897162437439, - "learning_rate": 4.838054378501868e-06, - "loss": 0.46967992186546326, - "mean_token_accuracy": 0.8315759897232056, - "num_tokens": 5760428.0, - "step": 631 - }, - { - "epoch": 0.48024316109422494, - "grad_norm": 1.38850998878479, - "learning_rate": 4.837312011289907e-06, - "loss": 0.41845446825027466, - "mean_token_accuracy": 0.8557186126708984, - "num_tokens": 5773437.0, - "step": 632 - }, - { - "epoch": 0.4810030395136778, - "grad_norm": 3.8337457180023193, - "learning_rate": 4.836568003662403e-06, - "loss": 0.5102912187576294, - "mean_token_accuracy": 0.830644965171814, - "num_tokens": 5776367.0, - "step": 633 - }, - { - "epoch": 0.4817629179331307, - "grad_norm": 1.2084007263183594, - "learning_rate": 4.8358223561415304e-06, - "loss": 0.3835333585739136, - "mean_token_accuracy": 0.8639016151428223, - "num_tokens": 5792246.0, - "step": 634 - }, - { - "epoch": 0.4825227963525836, - "grad_norm": 1.939408540725708, - "learning_rate": 4.835075069250613e-06, - "loss": 0.4044850468635559, - "mean_token_accuracy": 0.8488376140594482, - "num_tokens": 5799853.0, - "step": 635 - }, - { - "epoch": 0.48328267477203646, - "grad_norm": 1.345870852470398, - "learning_rate": 4.8343261435141245e-06, - "loss": 0.46660199761390686, - "mean_token_accuracy": 0.8371681571006775, - "num_tokens": 5817478.0, - "step": 636 - }, - { - "epoch": 0.48404255319148937, - "grad_norm": 1.6531339883804321, - "learning_rate": 4.833575579457691e-06, - "loss": 0.3886989951133728, - "mean_token_accuracy": 0.8763507008552551, - "num_tokens": 5825739.0, - "step": 637 - }, - { - "epoch": 0.4848024316109423, - "grad_norm": 1.6443969011306763, - "learning_rate": 4.832823377608088e-06, - "loss": 0.4070289731025696, - "mean_token_accuracy": 0.8586630821228027, - "num_tokens": 5837917.0, - "step": 638 - }, - { - "epoch": 0.48556231003039513, - "grad_norm": 2.005136013031006, - "learning_rate": 4.832069538493237e-06, - "loss": 0.40616685152053833, - "mean_token_accuracy": 0.8571510314941406, - "num_tokens": 5845250.0, - "step": 639 - }, - { - "epoch": 0.48632218844984804, - "grad_norm": 1.5244266986846924, - "learning_rate": 4.831314062642213e-06, - "loss": 0.49530288577079773, - "mean_token_accuracy": 0.8328841924667358, - "num_tokens": 5857407.0, - "step": 640 - }, - { - "epoch": 0.4870820668693009, - "grad_norm": 1.9876971244812012, - "learning_rate": 4.830556950585239e-06, - "loss": 0.4583776593208313, - "mean_token_accuracy": 0.8427221179008484, - "num_tokens": 5865391.0, - "step": 641 - }, - { - "epoch": 0.4878419452887538, - "grad_norm": 3.023336172103882, - "learning_rate": 4.829798202853683e-06, - "loss": 0.6134771108627319, - "mean_token_accuracy": 0.7981935739517212, - "num_tokens": 5870729.0, - "step": 642 - }, - { - "epoch": 0.4886018237082067, - "grad_norm": 1.8889515399932861, - "learning_rate": 4.829037819980065e-06, - "loss": 0.4420135021209717, - "mean_token_accuracy": 0.8480775356292725, - "num_tokens": 5878982.0, - "step": 643 - }, - { - "epoch": 0.48936170212765956, - "grad_norm": 2.2408435344696045, - "learning_rate": 4.828275802498051e-06, - "loss": 0.525706946849823, - "mean_token_accuracy": 0.8271557092666626, - "num_tokens": 5885097.0, - "step": 644 - }, - { - "epoch": 0.49012158054711247, - "grad_norm": 1.9734224081039429, - "learning_rate": 4.827512150942454e-06, - "loss": 0.44246578216552734, - "mean_token_accuracy": 0.8456668257713318, - "num_tokens": 5893941.0, - "step": 645 - }, - { - "epoch": 0.4908814589665654, - "grad_norm": 1.9618173837661743, - "learning_rate": 4.8267468658492335e-06, - "loss": 0.5119768381118774, - "mean_token_accuracy": 0.8355510830879211, - "num_tokens": 5902829.0, - "step": 646 - }, - { - "epoch": 0.49164133738601823, - "grad_norm": 1.7181587219238281, - "learning_rate": 4.825979947755496e-06, - "loss": 0.5666520595550537, - "mean_token_accuracy": 0.7951971888542175, - "num_tokens": 5915212.0, - "step": 647 - }, - { - "epoch": 0.49240121580547114, - "grad_norm": 3.0121164321899414, - "learning_rate": 4.8252113971994955e-06, - "loss": 0.628632128238678, - "mean_token_accuracy": 0.8041050434112549, - "num_tokens": 5921410.0, - "step": 648 - }, - { - "epoch": 0.493161094224924, - "grad_norm": 2.9980475902557373, - "learning_rate": 4.824441214720629e-06, - "loss": 0.4507424831390381, - "mean_token_accuracy": 0.8636263608932495, - "num_tokens": 5925179.0, - "step": 649 - }, - { - "epoch": 0.4939209726443769, - "grad_norm": 2.0096445083618164, - "learning_rate": 4.823669400859441e-06, - "loss": 0.602759838104248, - "mean_token_accuracy": 0.8104915618896484, - "num_tokens": 5934160.0, - "step": 650 - }, - { - "epoch": 0.4946808510638298, - "grad_norm": 1.1186442375183105, - "learning_rate": 4.8228959561576195e-06, - "loss": 0.41168469190597534, - "mean_token_accuracy": 0.8461419939994812, - "num_tokens": 5954163.0, - "step": 651 - }, - { - "epoch": 0.49544072948328266, - "grad_norm": 1.855465054512024, - "learning_rate": 4.822120881157998e-06, - "loss": 0.5049735307693481, - "mean_token_accuracy": 0.8225747346878052, - "num_tokens": 5963840.0, - "step": 652 - }, - { - "epoch": 0.49620060790273557, - "grad_norm": 3.550563335418701, - "learning_rate": 4.821344176404554e-06, - "loss": 0.49025264382362366, - "mean_token_accuracy": 0.8265978693962097, - "num_tokens": 5967358.0, - "step": 653 - }, - { - "epoch": 0.4969604863221885, - "grad_norm": 3.063910484313965, - "learning_rate": 4.820565842442408e-06, - "loss": 0.5652767419815063, - "mean_token_accuracy": 0.811700701713562, - "num_tokens": 5971858.0, - "step": 654 - }, - { - "epoch": 0.49772036474164133, - "grad_norm": 2.4613308906555176, - "learning_rate": 4.819785879817827e-06, - "loss": 0.5296125411987305, - "mean_token_accuracy": 0.8336488008499146, - "num_tokens": 5977442.0, - "step": 655 - }, - { - "epoch": 0.49848024316109424, - "grad_norm": 2.342519760131836, - "learning_rate": 4.819004289078217e-06, - "loss": 0.5753380060195923, - "mean_token_accuracy": 0.7922406792640686, - "num_tokens": 5984531.0, - "step": 656 - }, - { - "epoch": 0.4992401215805471, - "grad_norm": 2.0410680770874023, - "learning_rate": 4.818221070772129e-06, - "loss": 0.5433275699615479, - "mean_token_accuracy": 0.8043830990791321, - "num_tokens": 5992642.0, - "step": 657 - }, - { - "epoch": 0.5, - "grad_norm": 1.4999698400497437, - "learning_rate": 4.8174362254492555e-06, - "loss": 0.5248899459838867, - "mean_token_accuracy": 0.8107168674468994, - "num_tokens": 6005543.0, - "step": 658 - }, - { - "epoch": 0.5007598784194529, - "grad_norm": 1.9494401216506958, - "learning_rate": 4.816649753660431e-06, - "loss": 0.41291385889053345, - "mean_token_accuracy": 0.8650569915771484, - "num_tokens": 6012185.0, - "step": 659 - }, - { - "epoch": 0.5015197568389058, - "grad_norm": 2.7514095306396484, - "learning_rate": 4.815861655957632e-06, - "loss": 0.4244142770767212, - "mean_token_accuracy": 0.8485112190246582, - "num_tokens": 6016809.0, - "step": 660 - }, - { - "epoch": 0.5022796352583586, - "grad_norm": 1.4354928731918335, - "learning_rate": 4.815071932893976e-06, - "loss": 0.4332060217857361, - "mean_token_accuracy": 0.8386815786361694, - "num_tokens": 6034795.0, - "step": 661 - }, - { - "epoch": 0.5030395136778115, - "grad_norm": 1.3113417625427246, - "learning_rate": 4.81428058502372e-06, - "loss": 0.5415540933609009, - "mean_token_accuracy": 0.8115285038948059, - "num_tokens": 6053624.0, - "step": 662 - }, - { - "epoch": 0.5037993920972644, - "grad_norm": 1.820868730545044, - "learning_rate": 4.813487612902265e-06, - "loss": 0.5360245108604431, - "mean_token_accuracy": 0.8313555717468262, - "num_tokens": 6063399.0, - "step": 663 - }, - { - "epoch": 0.5045592705167173, - "grad_norm": 2.347001552581787, - "learning_rate": 4.812693017086145e-06, - "loss": 0.4926982820034027, - "mean_token_accuracy": 0.8137006759643555, - "num_tokens": 6070111.0, - "step": 664 - }, - { - "epoch": 0.5053191489361702, - "grad_norm": 1.8830888271331787, - "learning_rate": 4.811896798133042e-06, - "loss": 0.5419014692306519, - "mean_token_accuracy": 0.8027454614639282, - "num_tokens": 6081090.0, - "step": 665 - }, - { - "epoch": 0.506079027355623, - "grad_norm": 2.3258056640625, - "learning_rate": 4.811098956601772e-06, - "loss": 0.4629337787628174, - "mean_token_accuracy": 0.8416580557823181, - "num_tokens": 6087921.0, - "step": 666 - }, - { - "epoch": 0.506838905775076, - "grad_norm": 1.9578291177749634, - "learning_rate": 4.810299493052289e-06, - "loss": 0.40305402874946594, - "mean_token_accuracy": 0.8529061079025269, - "num_tokens": 6100034.0, - "step": 667 - }, - { - "epoch": 0.5075987841945289, - "grad_norm": 2.800635576248169, - "learning_rate": 4.809498408045691e-06, - "loss": 0.5087342262268066, - "mean_token_accuracy": 0.8214689493179321, - "num_tokens": 6104742.0, - "step": 668 - }, - { - "epoch": 0.5083586626139818, - "grad_norm": 1.5318149328231812, - "learning_rate": 4.808695702144206e-06, - "loss": 0.4733222723007202, - "mean_token_accuracy": 0.837577223777771, - "num_tokens": 6117242.0, - "step": 669 - }, - { - "epoch": 0.5091185410334347, - "grad_norm": 1.2368661165237427, - "learning_rate": 4.807891375911207e-06, - "loss": 0.3929097056388855, - "mean_token_accuracy": 0.8331400752067566, - "num_tokens": 6133509.0, - "step": 670 - }, - { - "epoch": 0.5098784194528876, - "grad_norm": 2.4711415767669678, - "learning_rate": 4.8070854299112e-06, - "loss": 0.6294851303100586, - "mean_token_accuracy": 0.7956781983375549, - "num_tokens": 6140294.0, - "step": 671 - }, - { - "epoch": 0.5106382978723404, - "grad_norm": 2.590961217880249, - "learning_rate": 4.806277864709828e-06, - "loss": 0.580160915851593, - "mean_token_accuracy": 0.809589684009552, - "num_tokens": 6145803.0, - "step": 672 - }, - { - "epoch": 0.5113981762917933, - "grad_norm": 2.4653842449188232, - "learning_rate": 4.805468680873874e-06, - "loss": 0.5262120366096497, - "mean_token_accuracy": 0.822458803653717, - "num_tokens": 6151236.0, - "step": 673 - }, - { - "epoch": 0.5121580547112462, - "grad_norm": 2.860720157623291, - "learning_rate": 4.804657878971252e-06, - "loss": 0.4007391035556793, - "mean_token_accuracy": 0.8637382984161377, - "num_tokens": 6155310.0, - "step": 674 - }, - { - "epoch": 0.5129179331306991, - "grad_norm": 2.520282030105591, - "learning_rate": 4.803845459571014e-06, - "loss": 0.45798182487487793, - "mean_token_accuracy": 0.8270114660263062, - "num_tokens": 6160326.0, - "step": 675 - }, - { - "epoch": 0.513677811550152, - "grad_norm": 2.7290921211242676, - "learning_rate": 4.803031423243349e-06, - "loss": 0.5745848417282104, - "mean_token_accuracy": 0.8401234745979309, - "num_tokens": 6165709.0, - "step": 676 - }, - { - "epoch": 0.5144376899696048, - "grad_norm": 1.6678650379180908, - "learning_rate": 4.802215770559578e-06, - "loss": 0.5257721543312073, - "mean_token_accuracy": 0.8241991996765137, - "num_tokens": 6177875.0, - "step": 677 - }, - { - "epoch": 0.5151975683890577, - "grad_norm": 2.1720468997955322, - "learning_rate": 4.801398502092156e-06, - "loss": 0.45342206954956055, - "mean_token_accuracy": 0.8463799953460693, - "num_tokens": 6185415.0, - "step": 678 - }, - { - "epoch": 0.5159574468085106, - "grad_norm": 2.282259702682495, - "learning_rate": 4.800579618414677e-06, - "loss": 0.4864169955253601, - "mean_token_accuracy": 0.8300632238388062, - "num_tokens": 6191832.0, - "step": 679 - }, - { - "epoch": 0.5167173252279635, - "grad_norm": 2.0092248916625977, - "learning_rate": 4.799759120101861e-06, - "loss": 0.5781463980674744, - "mean_token_accuracy": 0.8267031908035278, - "num_tokens": 6199440.0, - "step": 680 - }, - { - "epoch": 0.5174772036474165, - "grad_norm": 1.396580696105957, - "learning_rate": 4.798937007729568e-06, - "loss": 0.49689239263534546, - "mean_token_accuracy": 0.8257499933242798, - "num_tokens": 6213840.0, - "step": 681 - }, - { - "epoch": 0.5182370820668692, - "grad_norm": 1.9060769081115723, - "learning_rate": 4.798113281874788e-06, - "loss": 0.48969539999961853, - "mean_token_accuracy": 0.8171790838241577, - "num_tokens": 6223006.0, - "step": 682 - }, - { - "epoch": 0.5189969604863222, - "grad_norm": 1.6255282163619995, - "learning_rate": 4.797287943115642e-06, - "loss": 0.5532330870628357, - "mean_token_accuracy": 0.8173393607139587, - "num_tokens": 6234857.0, - "step": 683 - }, - { - "epoch": 0.5197568389057751, - "grad_norm": 1.6923905611038208, - "learning_rate": 4.796460992031386e-06, - "loss": 0.4880887269973755, - "mean_token_accuracy": 0.834983229637146, - "num_tokens": 6245252.0, - "step": 684 - }, - { - "epoch": 0.520516717325228, - "grad_norm": 2.13161301612854, - "learning_rate": 4.7956324292024045e-06, - "loss": 0.5687593817710876, - "mean_token_accuracy": 0.7996571063995361, - "num_tokens": 6253726.0, - "step": 685 - }, - { - "epoch": 0.5212765957446809, - "grad_norm": 2.509375810623169, - "learning_rate": 4.794802255210217e-06, - "loss": 0.5396929979324341, - "mean_token_accuracy": 0.8007107973098755, - "num_tokens": 6259238.0, - "step": 686 - }, - { - "epoch": 0.5220364741641338, - "grad_norm": 2.393710136413574, - "learning_rate": 4.793970470637469e-06, - "loss": 0.6165191531181335, - "mean_token_accuracy": 0.7891418933868408, - "num_tokens": 6266325.0, - "step": 687 - }, - { - "epoch": 0.5227963525835866, - "grad_norm": 1.511647343635559, - "learning_rate": 4.7931370760679415e-06, - "loss": 0.4773876965045929, - "mean_token_accuracy": 0.8381044864654541, - "num_tokens": 6277447.0, - "step": 688 - }, - { - "epoch": 0.5235562310030395, - "grad_norm": 2.206587314605713, - "learning_rate": 4.792302072086542e-06, - "loss": 0.5482058525085449, - "mean_token_accuracy": 0.8239108920097351, - "num_tokens": 6285163.0, - "step": 689 - }, - { - "epoch": 0.5243161094224924, - "grad_norm": 3.018146514892578, - "learning_rate": 4.7914654592793065e-06, - "loss": 0.4880615472793579, - "mean_token_accuracy": 0.8361308574676514, - "num_tokens": 6289386.0, - "step": 690 - }, - { - "epoch": 0.5250759878419453, - "grad_norm": 1.6469231843948364, - "learning_rate": 4.790627238233405e-06, - "loss": 0.4164774715900421, - "mean_token_accuracy": 0.8496290445327759, - "num_tokens": 6298915.0, - "step": 691 - }, - { - "epoch": 0.5258358662613982, - "grad_norm": 2.352505922317505, - "learning_rate": 4.789787409537131e-06, - "loss": 0.5366303324699402, - "mean_token_accuracy": 0.8350417613983154, - "num_tokens": 6306130.0, - "step": 692 - }, - { - "epoch": 0.526595744680851, - "grad_norm": 1.7463021278381348, - "learning_rate": 4.7889459737799105e-06, - "loss": 0.4389137923717499, - "mean_token_accuracy": 0.8463300466537476, - "num_tokens": 6315503.0, - "step": 693 - }, - { - "epoch": 0.5273556231003039, - "grad_norm": 2.257706642150879, - "learning_rate": 4.788102931552294e-06, - "loss": 0.5309344530105591, - "mean_token_accuracy": 0.8164352178573608, - "num_tokens": 6321852.0, - "step": 694 - }, - { - "epoch": 0.5281155015197568, - "grad_norm": 2.392732620239258, - "learning_rate": 4.787258283445962e-06, - "loss": 0.3956204056739807, - "mean_token_accuracy": 0.8671456575393677, - "num_tokens": 6327380.0, - "step": 695 - }, - { - "epoch": 0.5288753799392097, - "grad_norm": 2.210514545440674, - "learning_rate": 4.786412030053721e-06, - "loss": 0.4842875003814697, - "mean_token_accuracy": 0.8508446216583252, - "num_tokens": 6334898.0, - "step": 696 - }, - { - "epoch": 0.5296352583586627, - "grad_norm": 1.8678946495056152, - "learning_rate": 4.785564171969503e-06, - "loss": 0.47399595379829407, - "mean_token_accuracy": 0.8514996767044067, - "num_tokens": 6346374.0, - "step": 697 - }, - { - "epoch": 0.5303951367781155, - "grad_norm": 2.604079484939575, - "learning_rate": 4.784714709788368e-06, - "loss": 0.5950228571891785, - "mean_token_accuracy": 0.7983481884002686, - "num_tokens": 6351648.0, - "step": 698 - }, - { - "epoch": 0.5311550151975684, - "grad_norm": 1.662381649017334, - "learning_rate": 4.783863644106502e-06, - "loss": 0.41616758704185486, - "mean_token_accuracy": 0.8554803133010864, - "num_tokens": 6360506.0, - "step": 699 - }, - { - "epoch": 0.5319148936170213, - "grad_norm": 1.6300342082977295, - "learning_rate": 4.783010975521216e-06, - "loss": 0.43029269576072693, - "mean_token_accuracy": 0.8443028926849365, - "num_tokens": 6370675.0, - "step": 700 - }, - { - "epoch": 0.5326747720364742, - "grad_norm": 1.731873869895935, - "learning_rate": 4.782156704630944e-06, - "loss": 0.4383814334869385, - "mean_token_accuracy": 0.8443183898925781, - "num_tokens": 6381803.0, - "step": 701 - }, - { - "epoch": 0.5334346504559271, - "grad_norm": 3.1788413524627686, - "learning_rate": 4.7813008320352475e-06, - "loss": 0.32194480299949646, - "mean_token_accuracy": 0.8870962858200073, - "num_tokens": 6389263.0, - "step": 702 - }, - { - "epoch": 0.53419452887538, - "grad_norm": 2.099513530731201, - "learning_rate": 4.78044335833481e-06, - "loss": 0.36962923407554626, - "mean_token_accuracy": 0.8661133646965027, - "num_tokens": 6395589.0, - "step": 703 - }, - { - "epoch": 0.5349544072948328, - "grad_norm": 1.4859435558319092, - "learning_rate": 4.77958428413144e-06, - "loss": 0.4619954824447632, - "mean_token_accuracy": 0.8438555002212524, - "num_tokens": 6407470.0, - "step": 704 - }, - { - "epoch": 0.5357142857142857, - "grad_norm": 1.2561073303222656, - "learning_rate": 4.7787236100280685e-06, - "loss": 0.3770977258682251, - "mean_token_accuracy": 0.8515733480453491, - "num_tokens": 6422888.0, - "step": 705 - }, - { - "epoch": 0.5364741641337386, - "grad_norm": 1.4455817937850952, - "learning_rate": 4.777861336628751e-06, - "loss": 0.46481069922447205, - "mean_token_accuracy": 0.8502002954483032, - "num_tokens": 6441266.0, - "step": 706 - }, - { - "epoch": 0.5372340425531915, - "grad_norm": 1.1387295722961426, - "learning_rate": 4.7769974645386616e-06, - "loss": 0.36964765191078186, - "mean_token_accuracy": 0.8719524145126343, - "num_tokens": 6463686.0, - "step": 707 - }, - { - "epoch": 0.5379939209726444, - "grad_norm": 1.7179663181304932, - "learning_rate": 4.776131994364102e-06, - "loss": 0.4231719970703125, - "mean_token_accuracy": 0.8416585922241211, - "num_tokens": 6472956.0, - "step": 708 - }, - { - "epoch": 0.5387537993920972, - "grad_norm": 1.6328502893447876, - "learning_rate": 4.775264926712489e-06, - "loss": 0.5836569666862488, - "mean_token_accuracy": 0.8039724230766296, - "num_tokens": 6485773.0, - "step": 709 - }, - { - "epoch": 0.5395136778115501, - "grad_norm": 1.8515360355377197, - "learning_rate": 4.774396262192368e-06, - "loss": 0.5477553009986877, - "mean_token_accuracy": 0.8136521577835083, - "num_tokens": 6496379.0, - "step": 710 - }, - { - "epoch": 0.540273556231003, - "grad_norm": 1.741858959197998, - "learning_rate": 4.7735260014133986e-06, - "loss": 0.4663267731666565, - "mean_token_accuracy": 0.8473691940307617, - "num_tokens": 6507652.0, - "step": 711 - }, - { - "epoch": 0.541033434650456, - "grad_norm": 1.7516659498214722, - "learning_rate": 4.772654144986364e-06, - "loss": 0.374914288520813, - "mean_token_accuracy": 0.8600220680236816, - "num_tokens": 6519030.0, - "step": 712 - }, - { - "epoch": 0.5417933130699089, - "grad_norm": 2.662343978881836, - "learning_rate": 4.7717806935231665e-06, - "loss": 0.4206875264644623, - "mean_token_accuracy": 0.8544126749038696, - "num_tokens": 6523669.0, - "step": 713 - }, - { - "epoch": 0.5425531914893617, - "grad_norm": 1.4088834524154663, - "learning_rate": 4.770905647636828e-06, - "loss": 0.5824331045150757, - "mean_token_accuracy": 0.7857901453971863, - "num_tokens": 6540560.0, - "step": 714 - }, - { - "epoch": 0.5433130699088146, - "grad_norm": 2.173656940460205, - "learning_rate": 4.77002900794149e-06, - "loss": 0.555023729801178, - "mean_token_accuracy": 0.8067290782928467, - "num_tokens": 6548946.0, - "step": 715 - }, - { - "epoch": 0.5440729483282675, - "grad_norm": 2.121018648147583, - "learning_rate": 4.769150775052411e-06, - "loss": 0.559730052947998, - "mean_token_accuracy": 0.8166372776031494, - "num_tokens": 6556065.0, - "step": 716 - }, - { - "epoch": 0.5448328267477204, - "grad_norm": 3.335866928100586, - "learning_rate": 4.768270949585968e-06, - "loss": 0.6442267894744873, - "mean_token_accuracy": 0.7858607769012451, - "num_tokens": 6560615.0, - "step": 717 - }, - { - "epoch": 0.5455927051671733, - "grad_norm": 2.3813695907592773, - "learning_rate": 4.767389532159659e-06, - "loss": 0.4027421474456787, - "mean_token_accuracy": 0.8635619282722473, - "num_tokens": 6565841.0, - "step": 718 - }, - { - "epoch": 0.5463525835866262, - "grad_norm": 2.0657708644866943, - "learning_rate": 4.766506523392095e-06, - "loss": 0.38899827003479004, - "mean_token_accuracy": 0.8660480380058289, - "num_tokens": 6572362.0, - "step": 719 - }, - { - "epoch": 0.547112462006079, - "grad_norm": 1.093705415725708, - "learning_rate": 4.765621923903005e-06, - "loss": 0.45967352390289307, - "mean_token_accuracy": 0.8338102102279663, - "num_tokens": 6595998.0, - "step": 720 - }, - { - "epoch": 0.5478723404255319, - "grad_norm": 2.942065954208374, - "learning_rate": 4.764735734313236e-06, - "loss": 0.42910510301589966, - "mean_token_accuracy": 0.8406122922897339, - "num_tokens": 6601075.0, - "step": 721 - }, - { - "epoch": 0.5486322188449848, - "grad_norm": 2.049011707305908, - "learning_rate": 4.763847955244749e-06, - "loss": 0.5584231615066528, - "mean_token_accuracy": 0.8171684741973877, - "num_tokens": 6609310.0, - "step": 722 - }, - { - "epoch": 0.5493920972644377, - "grad_norm": 2.485543966293335, - "learning_rate": 4.762958587320623e-06, - "loss": 0.5396170020103455, - "mean_token_accuracy": 0.8158525824546814, - "num_tokens": 6616185.0, - "step": 723 - }, - { - "epoch": 0.5501519756838906, - "grad_norm": 1.87015962600708, - "learning_rate": 4.762067631165049e-06, - "loss": 0.49739527702331543, - "mean_token_accuracy": 0.8303765654563904, - "num_tokens": 6625629.0, - "step": 724 - }, - { - "epoch": 0.5509118541033434, - "grad_norm": 4.239654541015625, - "learning_rate": 4.761175087403336e-06, - "loss": 0.6029239296913147, - "mean_token_accuracy": 0.8123486042022705, - "num_tokens": 6629194.0, - "step": 725 - }, - { - "epoch": 0.5516717325227963, - "grad_norm": 2.0134730339050293, - "learning_rate": 4.760280956661904e-06, - "loss": 0.4777873754501343, - "mean_token_accuracy": 0.8283513784408569, - "num_tokens": 6636929.0, - "step": 726 - }, - { - "epoch": 0.5524316109422492, - "grad_norm": 1.991780400276184, - "learning_rate": 4.75938523956829e-06, - "loss": 0.4631248116493225, - "mean_token_accuracy": 0.8275107741355896, - "num_tokens": 6645135.0, - "step": 727 - }, - { - "epoch": 0.5531914893617021, - "grad_norm": 1.423792839050293, - "learning_rate": 4.75848793675114e-06, - "loss": 0.49630722403526306, - "mean_token_accuracy": 0.8388000130653381, - "num_tokens": 6662690.0, - "step": 728 - }, - { - "epoch": 0.5539513677811551, - "grad_norm": 2.345294952392578, - "learning_rate": 4.757589048840219e-06, - "loss": 0.37830638885498047, - "mean_token_accuracy": 0.8782080411911011, - "num_tokens": 6667285.0, - "step": 729 - }, - { - "epoch": 0.5547112462006079, - "grad_norm": 2.7452144622802734, - "learning_rate": 4.756688576466398e-06, - "loss": 0.51595538854599, - "mean_token_accuracy": 0.8441770672798157, - "num_tokens": 6672324.0, - "step": 730 - }, - { - "epoch": 0.5554711246200608, - "grad_norm": 1.5247859954833984, - "learning_rate": 4.755786520261666e-06, - "loss": 0.48365193605422974, - "mean_token_accuracy": 0.8276445269584656, - "num_tokens": 6685296.0, - "step": 731 - }, - { - "epoch": 0.5562310030395137, - "grad_norm": 1.4018276929855347, - "learning_rate": 4.75488288085912e-06, - "loss": 0.3876481354236603, - "mean_token_accuracy": 0.8612343072891235, - "num_tokens": 6697515.0, - "step": 732 - }, - { - "epoch": 0.5569908814589666, - "grad_norm": 2.9570324420928955, - "learning_rate": 4.753977658892967e-06, - "loss": 0.5468149185180664, - "mean_token_accuracy": 0.8054271340370178, - "num_tokens": 6702194.0, - "step": 733 - }, - { - "epoch": 0.5577507598784195, - "grad_norm": 1.9282715320587158, - "learning_rate": 4.753070854998529e-06, - "loss": 0.4758574962615967, - "mean_token_accuracy": 0.8379775285720825, - "num_tokens": 6709938.0, - "step": 734 - }, - { - "epoch": 0.5585106382978723, - "grad_norm": 1.981264591217041, - "learning_rate": 4.752162469812234e-06, - "loss": 0.48461222648620605, - "mean_token_accuracy": 0.833509087562561, - "num_tokens": 6718125.0, - "step": 735 - }, - { - "epoch": 0.5592705167173252, - "grad_norm": 1.1643427610397339, - "learning_rate": 4.751252503971624e-06, - "loss": 0.410121887922287, - "mean_token_accuracy": 0.8221402764320374, - "num_tokens": 6735125.0, - "step": 736 - }, - { - "epoch": 0.5600303951367781, - "grad_norm": 1.786566972732544, - "learning_rate": 4.750340958115346e-06, - "loss": 0.5964341163635254, - "mean_token_accuracy": 0.8038164377212524, - "num_tokens": 6747369.0, - "step": 737 - }, - { - "epoch": 0.560790273556231, - "grad_norm": 1.7256991863250732, - "learning_rate": 4.749427832883158e-06, - "loss": 0.48737066984176636, - "mean_token_accuracy": 0.830894947052002, - "num_tokens": 6758115.0, - "step": 738 - }, - { - "epoch": 0.5615501519756839, - "grad_norm": 1.997747540473938, - "learning_rate": 4.748513128915928e-06, - "loss": 0.5238886475563049, - "mean_token_accuracy": 0.8066858053207397, - "num_tokens": 6766111.0, - "step": 739 - }, - { - "epoch": 0.5623100303951368, - "grad_norm": 2.127016305923462, - "learning_rate": 4.747596846855629e-06, - "loss": 0.5045586228370667, - "mean_token_accuracy": 0.821424126625061, - "num_tokens": 6772893.0, - "step": 740 - }, - { - "epoch": 0.5630699088145896, - "grad_norm": 1.7664796113967896, - "learning_rate": 4.7466789873453446e-06, - "loss": 0.42954835295677185, - "mean_token_accuracy": 0.8533384799957275, - "num_tokens": 6785133.0, - "step": 741 - }, - { - "epoch": 0.5638297872340425, - "grad_norm": 1.4987404346466064, - "learning_rate": 4.7457595510292615e-06, - "loss": 0.5378558039665222, - "mean_token_accuracy": 0.8184819221496582, - "num_tokens": 6799563.0, - "step": 742 - }, - { - "epoch": 0.5645896656534954, - "grad_norm": 1.4444655179977417, - "learning_rate": 4.744838538552678e-06, - "loss": 0.42193782329559326, - "mean_token_accuracy": 0.837514340877533, - "num_tokens": 6812470.0, - "step": 743 - }, - { - "epoch": 0.5653495440729484, - "grad_norm": 3.867751121520996, - "learning_rate": 4.7439159505619946e-06, - "loss": 0.4457814693450928, - "mean_token_accuracy": 0.8630104660987854, - "num_tokens": 6815652.0, - "step": 744 - }, - { - "epoch": 0.5661094224924013, - "grad_norm": 2.1250710487365723, - "learning_rate": 4.74299178770472e-06, - "loss": 0.5638922452926636, - "mean_token_accuracy": 0.7969781160354614, - "num_tokens": 6824566.0, - "step": 745 - }, - { - "epoch": 0.5668693009118541, - "grad_norm": 2.547072410583496, - "learning_rate": 4.742066050629465e-06, - "loss": 0.5516207814216614, - "mean_token_accuracy": 0.8160669803619385, - "num_tokens": 6830589.0, - "step": 746 - }, - { - "epoch": 0.567629179331307, - "grad_norm": 1.2975233793258667, - "learning_rate": 4.741138739985951e-06, - "loss": 0.3823344111442566, - "mean_token_accuracy": 0.8668368458747864, - "num_tokens": 6842707.0, - "step": 747 - }, - { - "epoch": 0.5683890577507599, - "grad_norm": 1.3410450220108032, - "learning_rate": 4.740209856424998e-06, - "loss": 0.5148671269416809, - "mean_token_accuracy": 0.8188045024871826, - "num_tokens": 6857624.0, - "step": 748 - }, - { - "epoch": 0.5691489361702128, - "grad_norm": 1.219467282295227, - "learning_rate": 4.7392794005985324e-06, - "loss": 0.3998957872390747, - "mean_token_accuracy": 0.855175256729126, - "num_tokens": 6875064.0, - "step": 749 - }, - { - "epoch": 0.5699088145896657, - "grad_norm": 1.3530343770980835, - "learning_rate": 4.738347373159585e-06, - "loss": 0.5359633564949036, - "mean_token_accuracy": 0.8178457021713257, - "num_tokens": 6890911.0, - "step": 750 - }, - { - "epoch": 0.5706686930091185, - "grad_norm": 2.146988868713379, - "learning_rate": 4.737413774762287e-06, - "loss": 0.4460008144378662, - "mean_token_accuracy": 0.8172903060913086, - "num_tokens": 6896959.0, - "step": 751 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 1.456023097038269, - "learning_rate": 4.736478606061876e-06, - "loss": 0.43616920709609985, - "mean_token_accuracy": 0.8465108871459961, - "num_tokens": 6908904.0, - "step": 752 - }, - { - "epoch": 0.5721884498480243, - "grad_norm": 2.9696967601776123, - "learning_rate": 4.735541867714687e-06, - "loss": 0.43464532494544983, - "mean_token_accuracy": 0.8608652353286743, - "num_tokens": 6913026.0, - "step": 753 - }, - { - "epoch": 0.5729483282674772, - "grad_norm": 2.2990667819976807, - "learning_rate": 4.73460356037816e-06, - "loss": 0.6619116067886353, - "mean_token_accuracy": 0.7821142673492432, - "num_tokens": 6920588.0, - "step": 754 - }, - { - "epoch": 0.5737082066869301, - "grad_norm": 2.054746389389038, - "learning_rate": 4.733663684710835e-06, - "loss": 0.5304250717163086, - "mean_token_accuracy": 0.8265531063079834, - "num_tokens": 6928910.0, - "step": 755 - }, - { - "epoch": 0.574468085106383, - "grad_norm": 2.0050594806671143, - "learning_rate": 4.732722241372354e-06, - "loss": 0.6393026113510132, - "mean_token_accuracy": 0.796819806098938, - "num_tokens": 6940217.0, - "step": 756 - }, - { - "epoch": 0.5752279635258358, - "grad_norm": 1.4285320043563843, - "learning_rate": 4.731779231023456e-06, - "loss": 0.5432837009429932, - "mean_token_accuracy": 0.8104778528213501, - "num_tokens": 6959101.0, - "step": 757 - }, - { - "epoch": 0.5759878419452887, - "grad_norm": 2.3941943645477295, - "learning_rate": 4.730834654325984e-06, - "loss": 0.46550673246383667, - "mean_token_accuracy": 0.8444503545761108, - "num_tokens": 6965036.0, - "step": 758 - }, - { - "epoch": 0.5767477203647416, - "grad_norm": 2.3850574493408203, - "learning_rate": 4.729888511942877e-06, - "loss": 0.4916389584541321, - "mean_token_accuracy": 0.8228527307510376, - "num_tokens": 6971184.0, - "step": 759 - }, - { - "epoch": 0.5775075987841946, - "grad_norm": 1.627480149269104, - "learning_rate": 4.728940804538176e-06, - "loss": 0.5863215923309326, - "mean_token_accuracy": 0.7995302677154541, - "num_tokens": 6982569.0, - "step": 760 - }, - { - "epoch": 0.5782674772036475, - "grad_norm": 1.1723195314407349, - "learning_rate": 4.727991532777016e-06, - "loss": 0.36908864974975586, - "mean_token_accuracy": 0.8355655670166016, - "num_tokens": 6998659.0, - "step": 761 - }, - { - "epoch": 0.5790273556231003, - "grad_norm": 1.5324925184249878, - "learning_rate": 4.727040697325634e-06, - "loss": 0.557658851146698, - "mean_token_accuracy": 0.8141458034515381, - "num_tokens": 7012969.0, - "step": 762 - }, - { - "epoch": 0.5797872340425532, - "grad_norm": 2.4106390476226807, - "learning_rate": 4.726088298851362e-06, - "loss": 0.5004243850708008, - "mean_token_accuracy": 0.8376860618591309, - "num_tokens": 7018301.0, - "step": 763 - }, - { - "epoch": 0.5805471124620061, - "grad_norm": 2.2594921588897705, - "learning_rate": 4.725134338022631e-06, - "loss": 0.6067016124725342, - "mean_token_accuracy": 0.8100241422653198, - "num_tokens": 7025201.0, - "step": 764 - }, - { - "epoch": 0.581306990881459, - "grad_norm": 1.4649826288223267, - "learning_rate": 4.724178815508967e-06, - "loss": 0.36200693249702454, - "mean_token_accuracy": 0.8621826171875, - "num_tokens": 7035112.0, - "step": 765 - }, - { - "epoch": 0.5820668693009119, - "grad_norm": 2.3634560108184814, - "learning_rate": 4.723221731980993e-06, - "loss": 0.41862213611602783, - "mean_token_accuracy": 0.8541463613510132, - "num_tokens": 7040339.0, - "step": 766 - }, - { - "epoch": 0.5828267477203647, - "grad_norm": 2.7798104286193848, - "learning_rate": 4.722263088110426e-06, - "loss": 0.4647108018398285, - "mean_token_accuracy": 0.8505672216415405, - "num_tokens": 7044880.0, - "step": 767 - }, - { - "epoch": 0.5835866261398176, - "grad_norm": 2.070528507232666, - "learning_rate": 4.721302884570079e-06, - "loss": 0.5147565007209778, - "mean_token_accuracy": 0.8113877773284912, - "num_tokens": 7052433.0, - "step": 768 - }, - { - "epoch": 0.5843465045592705, - "grad_norm": 2.1953284740448, - "learning_rate": 4.720341122033862e-06, - "loss": 0.5075466632843018, - "mean_token_accuracy": 0.8474211096763611, - "num_tokens": 7058686.0, - "step": 769 - }, - { - "epoch": 0.5851063829787234, - "grad_norm": 1.9287755489349365, - "learning_rate": 4.719377801176774e-06, - "loss": 0.5382202863693237, - "mean_token_accuracy": 0.8148090243339539, - "num_tokens": 7067538.0, - "step": 770 - }, - { - "epoch": 0.5858662613981763, - "grad_norm": 1.5574456453323364, - "learning_rate": 4.718412922674913e-06, - "loss": 0.43406790494918823, - "mean_token_accuracy": 0.8477081060409546, - "num_tokens": 7077853.0, - "step": 771 - }, - { - "epoch": 0.5866261398176292, - "grad_norm": 1.5490336418151855, - "learning_rate": 4.717446487205466e-06, - "loss": 0.43164271116256714, - "mean_token_accuracy": 0.8504570126533508, - "num_tokens": 7091728.0, - "step": 772 - }, - { - "epoch": 0.587386018237082, - "grad_norm": 1.6945984363555908, - "learning_rate": 4.716478495446717e-06, - "loss": 0.5153743624687195, - "mean_token_accuracy": 0.8213579058647156, - "num_tokens": 7108680.0, - "step": 773 - }, - { - "epoch": 0.5881458966565349, - "grad_norm": 2.2633883953094482, - "learning_rate": 4.715508948078037e-06, - "loss": 0.45254790782928467, - "mean_token_accuracy": 0.8392219543457031, - "num_tokens": 7115546.0, - "step": 774 - }, - { - "epoch": 0.5889057750759878, - "grad_norm": 1.5731090307235718, - "learning_rate": 4.714537845779894e-06, - "loss": 0.38678881525993347, - "mean_token_accuracy": 0.8800252676010132, - "num_tokens": 7126360.0, - "step": 775 - }, - { - "epoch": 0.5896656534954408, - "grad_norm": 2.4873392581939697, - "learning_rate": 4.7135651892338445e-06, - "loss": 0.5190927386283875, - "mean_token_accuracy": 0.8145407438278198, - "num_tokens": 7135705.0, - "step": 776 - }, - { - "epoch": 0.5904255319148937, - "grad_norm": 1.2931004762649536, - "learning_rate": 4.712590979122534e-06, - "loss": 0.3686544895172119, - "mean_token_accuracy": 0.8720537424087524, - "num_tokens": 7150688.0, - "step": 777 - }, - { - "epoch": 0.5911854103343465, - "grad_norm": 1.6353671550750732, - "learning_rate": 4.7116152161297045e-06, - "loss": 0.49065062403678894, - "mean_token_accuracy": 0.8203760385513306, - "num_tokens": 7161040.0, - "step": 778 - }, - { - "epoch": 0.5919452887537994, - "grad_norm": 1.2345483303070068, - "learning_rate": 4.710637900940181e-06, - "loss": 0.4004976451396942, - "mean_token_accuracy": 0.8302007913589478, - "num_tokens": 7178074.0, - "step": 779 - }, - { - "epoch": 0.5927051671732523, - "grad_norm": 2.2506837844848633, - "learning_rate": 4.7096590342398825e-06, - "loss": 0.45142874121665955, - "mean_token_accuracy": 0.8481036424636841, - "num_tokens": 7184153.0, - "step": 780 - }, - { - "epoch": 0.5934650455927052, - "grad_norm": 1.420479416847229, - "learning_rate": 4.708678616715815e-06, - "loss": 0.4802100360393524, - "mean_token_accuracy": 0.8586992025375366, - "num_tokens": 7202810.0, - "step": 781 - }, - { - "epoch": 0.5942249240121581, - "grad_norm": 3.457632303237915, - "learning_rate": 4.707696649056073e-06, - "loss": 0.5265094041824341, - "mean_token_accuracy": 0.8260114192962646, - "num_tokens": 7206396.0, - "step": 782 - }, - { - "epoch": 0.5949848024316109, - "grad_norm": 1.1592093706130981, - "learning_rate": 4.706713131949839e-06, - "loss": 0.3708173632621765, - "mean_token_accuracy": 0.8476542234420776, - "num_tokens": 7225034.0, - "step": 783 - }, - { - "epoch": 0.5957446808510638, - "grad_norm": 1.6761400699615479, - "learning_rate": 4.705728066087384e-06, - "loss": 0.4137252867221832, - "mean_token_accuracy": 0.8462049961090088, - "num_tokens": 7237101.0, - "step": 784 - }, - { - "epoch": 0.5965045592705167, - "grad_norm": 2.320185422897339, - "learning_rate": 4.704741452160064e-06, - "loss": 0.5157154202461243, - "mean_token_accuracy": 0.8391785621643066, - "num_tokens": 7243826.0, - "step": 785 - }, - { - "epoch": 0.5972644376899696, - "grad_norm": 2.079423427581787, - "learning_rate": 4.703753290860323e-06, - "loss": 0.4734993278980255, - "mean_token_accuracy": 0.8353281021118164, - "num_tokens": 7250175.0, - "step": 786 - }, - { - "epoch": 0.5980243161094225, - "grad_norm": 1.8215159177780151, - "learning_rate": 4.702763582881692e-06, - "loss": 0.520193338394165, - "mean_token_accuracy": 0.844062864780426, - "num_tokens": 7258868.0, - "step": 787 - }, - { - "epoch": 0.5987841945288754, - "grad_norm": 1.3823071718215942, - "learning_rate": 4.701772328918784e-06, - "loss": 0.4177844822406769, - "mean_token_accuracy": 0.8363165259361267, - "num_tokens": 7271744.0, - "step": 788 - }, - { - "epoch": 0.5995440729483282, - "grad_norm": 2.4749298095703125, - "learning_rate": 4.700779529667301e-06, - "loss": 0.5115069150924683, - "mean_token_accuracy": 0.8473520278930664, - "num_tokens": 7277040.0, - "step": 789 - }, - { - "epoch": 0.6003039513677811, - "grad_norm": 1.7072296142578125, - "learning_rate": 4.699785185824026e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8161447048187256, - "num_tokens": 7288288.0, - "step": 790 - }, - { - "epoch": 0.601063829787234, - "grad_norm": 1.6479384899139404, - "learning_rate": 4.69878929808683e-06, - "loss": 0.4445168972015381, - "mean_token_accuracy": 0.8381255865097046, - "num_tokens": 7298640.0, - "step": 791 - }, - { - "epoch": 0.601823708206687, - "grad_norm": 1.9095896482467651, - "learning_rate": 4.6977918671546635e-06, - "loss": 0.5841238498687744, - "mean_token_accuracy": 0.7971454858779907, - "num_tokens": 7307220.0, - "step": 792 - }, - { - "epoch": 0.6025835866261399, - "grad_norm": 1.9614146947860718, - "learning_rate": 4.696792893727562e-06, - "loss": 0.34684082865715027, - "mean_token_accuracy": 0.8739526271820068, - "num_tokens": 7313875.0, - "step": 793 - }, - { - "epoch": 0.6033434650455927, - "grad_norm": 2.015570640563965, - "learning_rate": 4.695792378506645e-06, - "loss": 0.42779117822647095, - "mean_token_accuracy": 0.8625012636184692, - "num_tokens": 7321439.0, - "step": 794 - }, - { - "epoch": 0.6041033434650456, - "grad_norm": 2.8581228256225586, - "learning_rate": 4.694790322194111e-06, - "loss": 0.6519991159439087, - "mean_token_accuracy": 0.7629562616348267, - "num_tokens": 7326916.0, - "step": 795 - }, - { - "epoch": 0.6048632218844985, - "grad_norm": 2.482715368270874, - "learning_rate": 4.693786725493242e-06, - "loss": 0.532963216304779, - "mean_token_accuracy": 0.832184910774231, - "num_tokens": 7333311.0, - "step": 796 - }, - { - "epoch": 0.6056231003039514, - "grad_norm": 1.6076741218566895, - "learning_rate": 4.692781589108402e-06, - "loss": 0.43381205201148987, - "mean_token_accuracy": 0.8402494192123413, - "num_tokens": 7343731.0, - "step": 797 - }, - { - "epoch": 0.6063829787234043, - "grad_norm": 2.2133216857910156, - "learning_rate": 4.691774913745033e-06, - "loss": 0.4380851089954376, - "mean_token_accuracy": 0.8600908517837524, - "num_tokens": 7350224.0, - "step": 798 - }, - { - "epoch": 0.6071428571428571, - "grad_norm": 2.046280860900879, - "learning_rate": 4.690766700109659e-06, - "loss": 0.3821919560432434, - "mean_token_accuracy": 0.8691814541816711, - "num_tokens": 7356717.0, - "step": 799 - }, - { - "epoch": 0.60790273556231, - "grad_norm": 1.8482693433761597, - "learning_rate": 4.689756948909884e-06, - "loss": 0.5217651128768921, - "mean_token_accuracy": 0.803473711013794, - "num_tokens": 7365806.0, - "step": 800 - }, - { - "epoch": 0.6086626139817629, - "grad_norm": 2.192134141921997, - "learning_rate": 4.688745660854388e-06, - "loss": 0.573980987071991, - "mean_token_accuracy": 0.8198676109313965, - "num_tokens": 7380281.0, - "step": 801 - }, - { - "epoch": 0.6094224924012158, - "grad_norm": 2.363626718521118, - "learning_rate": 4.687732836652935e-06, - "loss": 0.5204599499702454, - "mean_token_accuracy": 0.8373252153396606, - "num_tokens": 7386938.0, - "step": 802 - }, - { - "epoch": 0.6101823708206687, - "grad_norm": 1.9320523738861084, - "learning_rate": 4.686718477016361e-06, - "loss": 0.47316622734069824, - "mean_token_accuracy": 0.830596923828125, - "num_tokens": 7395069.0, - "step": 803 - }, - { - "epoch": 0.6109422492401215, - "grad_norm": 2.6573057174682617, - "learning_rate": 4.6857025826565845e-06, - "loss": 0.5495861768722534, - "mean_token_accuracy": 0.8187421560287476, - "num_tokens": 7400563.0, - "step": 804 - }, - { - "epoch": 0.6117021276595744, - "grad_norm": 2.0893123149871826, - "learning_rate": 4.684685154286599e-06, - "loss": 0.5362675786018372, - "mean_token_accuracy": 0.8394701480865479, - "num_tokens": 7406973.0, - "step": 805 - }, - { - "epoch": 0.6124620060790273, - "grad_norm": 2.455130100250244, - "learning_rate": 4.683666192620474e-06, - "loss": 0.5405995845794678, - "mean_token_accuracy": 0.8079100847244263, - "num_tokens": 7412931.0, - "step": 806 - }, - { - "epoch": 0.6132218844984803, - "grad_norm": 2.311915636062622, - "learning_rate": 4.682645698373357e-06, - "loss": 0.5395106077194214, - "mean_token_accuracy": 0.8156260251998901, - "num_tokens": 7419699.0, - "step": 807 - }, - { - "epoch": 0.6139817629179332, - "grad_norm": 1.686838984489441, - "learning_rate": 4.6816236722614694e-06, - "loss": 0.6034521460533142, - "mean_token_accuracy": 0.7855954170227051, - "num_tokens": 7431899.0, - "step": 808 - }, - { - "epoch": 0.6147416413373861, - "grad_norm": 1.682759165763855, - "learning_rate": 4.680600115002109e-06, - "loss": 0.48593831062316895, - "mean_token_accuracy": 0.8229435682296753, - "num_tokens": 7443187.0, - "step": 809 - }, - { - "epoch": 0.6155015197568389, - "grad_norm": 2.064589738845825, - "learning_rate": 4.679575027313649e-06, - "loss": 0.5098468661308289, - "mean_token_accuracy": 0.8234638571739197, - "num_tokens": 7450868.0, - "step": 810 - }, - { - "epoch": 0.6162613981762918, - "grad_norm": 2.2063486576080322, - "learning_rate": 4.6785484099155324e-06, - "loss": 0.5138497352600098, - "mean_token_accuracy": 0.8152111172676086, - "num_tokens": 7457176.0, - "step": 811 - }, - { - "epoch": 0.6170212765957447, - "grad_norm": 1.6258726119995117, - "learning_rate": 4.67752026352828e-06, - "loss": 0.4064181447029114, - "mean_token_accuracy": 0.8720619678497314, - "num_tokens": 7466557.0, - "step": 812 - }, - { - "epoch": 0.6177811550151976, - "grad_norm": 2.3309383392333984, - "learning_rate": 4.676490588873486e-06, - "loss": 0.5180112719535828, - "mean_token_accuracy": 0.8233879804611206, - "num_tokens": 7472650.0, - "step": 813 - }, - { - "epoch": 0.6185410334346505, - "grad_norm": 1.4545246362686157, - "learning_rate": 4.675459386673815e-06, - "loss": 0.37917959690093994, - "mean_token_accuracy": 0.8598103523254395, - "num_tokens": 7485171.0, - "step": 814 - }, - { - "epoch": 0.6193009118541033, - "grad_norm": 2.654231071472168, - "learning_rate": 4.674426657653003e-06, - "loss": 0.554074227809906, - "mean_token_accuracy": 0.8026446104049683, - "num_tokens": 7490787.0, - "step": 815 - }, - { - "epoch": 0.6200607902735562, - "grad_norm": 1.5543994903564453, - "learning_rate": 4.67339240253586e-06, - "loss": 0.6335440278053284, - "mean_token_accuracy": 0.783241868019104, - "num_tokens": 7505975.0, - "step": 816 - }, - { - "epoch": 0.6208206686930091, - "grad_norm": 2.079998016357422, - "learning_rate": 4.672356622048266e-06, - "loss": 0.5169394016265869, - "mean_token_accuracy": 0.8088761568069458, - "num_tokens": 7513470.0, - "step": 817 - }, - { - "epoch": 0.621580547112462, - "grad_norm": 1.5971896648406982, - "learning_rate": 4.671319316917172e-06, - "loss": 0.44588586688041687, - "mean_token_accuracy": 0.8518649339675903, - "num_tokens": 7524352.0, - "step": 818 - }, - { - "epoch": 0.6223404255319149, - "grad_norm": 2.477579116821289, - "learning_rate": 4.670280487870599e-06, - "loss": 0.5713893175125122, - "mean_token_accuracy": 0.8116940259933472, - "num_tokens": 7530359.0, - "step": 819 - }, - { - "epoch": 0.6231003039513677, - "grad_norm": 2.066211700439453, - "learning_rate": 4.669240135637635e-06, - "loss": 0.5295331478118896, - "mean_token_accuracy": 0.819536566734314, - "num_tokens": 7536963.0, - "step": 820 - }, - { - "epoch": 0.6238601823708206, - "grad_norm": 2.1217997074127197, - "learning_rate": 4.668198260948442e-06, - "loss": 0.6146406531333923, - "mean_token_accuracy": 0.7932635545730591, - "num_tokens": 7545800.0, - "step": 821 - }, - { - "epoch": 0.6246200607902735, - "grad_norm": 2.0173542499542236, - "learning_rate": 4.667154864534245e-06, - "loss": 0.6240535974502563, - "mean_token_accuracy": 0.7883644104003906, - "num_tokens": 7556165.0, - "step": 822 - }, - { - "epoch": 0.6253799392097265, - "grad_norm": 2.014526128768921, - "learning_rate": 4.666109947127343e-06, - "loss": 0.40367332100868225, - "mean_token_accuracy": 0.8653522729873657, - "num_tokens": 7562665.0, - "step": 823 - }, - { - "epoch": 0.6261398176291794, - "grad_norm": 2.5078861713409424, - "learning_rate": 4.665063509461098e-06, - "loss": 0.5903617739677429, - "mean_token_accuracy": 0.7902897596359253, - "num_tokens": 7568922.0, - "step": 824 - }, - { - "epoch": 0.6268996960486323, - "grad_norm": 2.454622745513916, - "learning_rate": 4.664015552269938e-06, - "loss": 0.5238361358642578, - "mean_token_accuracy": 0.838546872138977, - "num_tokens": 7575965.0, - "step": 825 - }, - { - "epoch": 0.6276595744680851, - "grad_norm": 2.920919418334961, - "learning_rate": 4.662966076289363e-06, - "loss": 0.5028782486915588, - "mean_token_accuracy": 0.8311152458190918, - "num_tokens": 7580193.0, - "step": 826 - }, - { - "epoch": 0.628419452887538, - "grad_norm": 1.545382022857666, - "learning_rate": 4.661915082255932e-06, - "loss": 0.4817378520965576, - "mean_token_accuracy": 0.8373227119445801, - "num_tokens": 7593024.0, - "step": 827 - }, - { - "epoch": 0.6291793313069909, - "grad_norm": 1.5152469873428345, - "learning_rate": 4.6608625709072766e-06, - "loss": 0.4693033695220947, - "mean_token_accuracy": 0.8150848150253296, - "num_tokens": 7606459.0, - "step": 828 - }, - { - "epoch": 0.6299392097264438, - "grad_norm": 2.1310224533081055, - "learning_rate": 4.659808542982089e-06, - "loss": 0.4653395414352417, - "mean_token_accuracy": 0.8286294341087341, - "num_tokens": 7613036.0, - "step": 829 - }, - { - "epoch": 0.6306990881458967, - "grad_norm": 2.1949679851531982, - "learning_rate": 4.658752999220125e-06, - "loss": 0.3698633909225464, - "mean_token_accuracy": 0.871590793132782, - "num_tokens": 7618527.0, - "step": 830 - }, - { - "epoch": 0.6314589665653495, - "grad_norm": 2.2770416736602783, - "learning_rate": 4.657695940362207e-06, - "loss": 0.5202419757843018, - "mean_token_accuracy": 0.817577600479126, - "num_tokens": 7624459.0, - "step": 831 - }, - { - "epoch": 0.6322188449848024, - "grad_norm": 1.402042269706726, - "learning_rate": 4.65663736715022e-06, - "loss": 0.51531583070755, - "mean_token_accuracy": 0.8228116631507874, - "num_tokens": 7639371.0, - "step": 832 - }, - { - "epoch": 0.6329787234042553, - "grad_norm": 3.3554883003234863, - "learning_rate": 4.65557728032711e-06, - "loss": 0.6771188378334045, - "mean_token_accuracy": 0.7880028486251831, - "num_tokens": 7643924.0, - "step": 833 - }, - { - "epoch": 0.6337386018237082, - "grad_norm": 2.081040143966675, - "learning_rate": 4.654515680636888e-06, - "loss": 0.5712796449661255, - "mean_token_accuracy": 0.8177868127822876, - "num_tokens": 7651881.0, - "step": 834 - }, - { - "epoch": 0.6344984802431611, - "grad_norm": 0.9128716588020325, - "learning_rate": 4.653452568824625e-06, - "loss": 0.3423936069011688, - "mean_token_accuracy": 0.8782886266708374, - "num_tokens": 7677829.0, - "step": 835 - }, - { - "epoch": 0.6352583586626139, - "grad_norm": 3.49015736579895, - "learning_rate": 4.652387945636454e-06, - "loss": 0.34657734632492065, - "mean_token_accuracy": 0.8770567178726196, - "num_tokens": 7680796.0, - "step": 836 - }, - { - "epoch": 0.6360182370820668, - "grad_norm": 2.026247501373291, - "learning_rate": 4.651321811819568e-06, - "loss": 0.5098431706428528, - "mean_token_accuracy": 0.8216961622238159, - "num_tokens": 7688746.0, - "step": 837 - }, - { - "epoch": 0.6367781155015197, - "grad_norm": 2.444343090057373, - "learning_rate": 4.650254168122222e-06, - "loss": 0.5490090250968933, - "mean_token_accuracy": 0.8092857599258423, - "num_tokens": 7695220.0, - "step": 838 - }, - { - "epoch": 0.6375379939209727, - "grad_norm": 2.0171122550964355, - "learning_rate": 4.649185015293728e-06, - "loss": 0.47221142053604126, - "mean_token_accuracy": 0.8514408469200134, - "num_tokens": 7702759.0, - "step": 839 - }, - { - "epoch": 0.6382978723404256, - "grad_norm": 1.9800984859466553, - "learning_rate": 4.64811435408446e-06, - "loss": 0.5238803625106812, - "mean_token_accuracy": 0.8479194641113281, - "num_tokens": 7714017.0, - "step": 840 - }, - { - "epoch": 0.6390577507598785, - "grad_norm": 3.0674357414245605, - "learning_rate": 4.647042185245848e-06, - "loss": 0.4668245315551758, - "mean_token_accuracy": 0.8381714820861816, - "num_tokens": 7717801.0, - "step": 841 - }, - { - "epoch": 0.6398176291793313, - "grad_norm": 1.5672820806503296, - "learning_rate": 4.645968509530381e-06, - "loss": 0.4428741931915283, - "mean_token_accuracy": 0.8416479825973511, - "num_tokens": 7728342.0, - "step": 842 - }, - { - "epoch": 0.6405775075987842, - "grad_norm": 2.3042354583740234, - "learning_rate": 4.644893327691608e-06, - "loss": 0.49937760829925537, - "mean_token_accuracy": 0.827070951461792, - "num_tokens": 7734576.0, - "step": 843 - }, - { - "epoch": 0.6413373860182371, - "grad_norm": 2.057772159576416, - "learning_rate": 4.6438166404841316e-06, - "loss": 0.5912986993789673, - "mean_token_accuracy": 0.805509090423584, - "num_tokens": 7742481.0, - "step": 844 - }, - { - "epoch": 0.64209726443769, - "grad_norm": 1.9688186645507812, - "learning_rate": 4.6427384486636115e-06, - "loss": 0.482401967048645, - "mean_token_accuracy": 0.8358086347579956, - "num_tokens": 7750002.0, - "step": 845 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 2.6852948665618896, - "learning_rate": 4.6416587529867665e-06, - "loss": 0.5479315519332886, - "mean_token_accuracy": 0.8091106414794922, - "num_tokens": 7755578.0, - "step": 846 - }, - { - "epoch": 0.6436170212765957, - "grad_norm": 2.0547337532043457, - "learning_rate": 4.640577554211366e-06, - "loss": 0.5327274203300476, - "mean_token_accuracy": 0.8280376195907593, - "num_tokens": 7763513.0, - "step": 847 - }, - { - "epoch": 0.6443768996960486, - "grad_norm": 2.0328633785247803, - "learning_rate": 4.63949485309624e-06, - "loss": 0.4814409613609314, - "mean_token_accuracy": 0.8527672290802002, - "num_tokens": 7771131.0, - "step": 848 - }, - { - "epoch": 0.6451367781155015, - "grad_norm": 1.5892863273620605, - "learning_rate": 4.638410650401267e-06, - "loss": 0.4492785334587097, - "mean_token_accuracy": 0.846997857093811, - "num_tokens": 7781572.0, - "step": 849 - }, - { - "epoch": 0.6458966565349544, - "grad_norm": 1.8295910358428955, - "learning_rate": 4.637324946887384e-06, - "loss": 0.37088239192962646, - "mean_token_accuracy": 0.8616628646850586, - "num_tokens": 7788604.0, - "step": 850 - }, - { - "epoch": 0.6466565349544073, - "grad_norm": 3.380040168762207, - "learning_rate": 4.636237743316578e-06, - "loss": 0.4737280607223511, - "mean_token_accuracy": 0.855940580368042, - "num_tokens": 7792504.0, - "step": 851 - }, - { - "epoch": 0.6474164133738601, - "grad_norm": 2.8790009021759033, - "learning_rate": 4.635149040451891e-06, - "loss": 0.39790448546409607, - "mean_token_accuracy": 0.8710698485374451, - "num_tokens": 7796333.0, - "step": 852 - }, - { - "epoch": 0.648176291793313, - "grad_norm": 1.914914608001709, - "learning_rate": 4.634058839057417e-06, - "loss": 0.2954312562942505, - "mean_token_accuracy": 0.8880234956741333, - "num_tokens": 7802456.0, - "step": 853 - }, - { - "epoch": 0.648936170212766, - "grad_norm": 1.3709120750427246, - "learning_rate": 4.632967139898301e-06, - "loss": 0.43224576115608215, - "mean_token_accuracy": 0.8446190357208252, - "num_tokens": 7816770.0, - "step": 854 - }, - { - "epoch": 0.6496960486322189, - "grad_norm": 1.6579312086105347, - "learning_rate": 4.63187394374074e-06, - "loss": 0.3535553514957428, - "mean_token_accuracy": 0.8738704919815063, - "num_tokens": 7824963.0, - "step": 855 - }, - { - "epoch": 0.6504559270516718, - "grad_norm": 2.4055678844451904, - "learning_rate": 4.63077925135198e-06, - "loss": 0.5078744292259216, - "mean_token_accuracy": 0.8430874347686768, - "num_tokens": 7830962.0, - "step": 856 - }, - { - "epoch": 0.6512158054711246, - "grad_norm": 2.5171499252319336, - "learning_rate": 4.629683063500319e-06, - "loss": 0.5172419548034668, - "mean_token_accuracy": 0.8087141513824463, - "num_tokens": 7836638.0, - "step": 857 - }, - { - "epoch": 0.6519756838905775, - "grad_norm": 1.7588486671447754, - "learning_rate": 4.628585380955104e-06, - "loss": 0.5759496092796326, - "mean_token_accuracy": 0.8043236136436462, - "num_tokens": 7844654.0, - "step": 858 - }, - { - "epoch": 0.6527355623100304, - "grad_norm": 1.5887070894241333, - "learning_rate": 4.62748620448673e-06, - "loss": 0.41849038004875183, - "mean_token_accuracy": 0.8556643724441528, - "num_tokens": 7855642.0, - "step": 859 - }, - { - "epoch": 0.6534954407294833, - "grad_norm": 3.227942705154419, - "learning_rate": 4.626385534866642e-06, - "loss": 0.5279449224472046, - "mean_token_accuracy": 0.8250958323478699, - "num_tokens": 7859890.0, - "step": 860 - }, - { - "epoch": 0.6542553191489362, - "grad_norm": 2.440467119216919, - "learning_rate": 4.625283372867333e-06, - "loss": 0.5294933319091797, - "mean_token_accuracy": 0.8235013484954834, - "num_tokens": 7866766.0, - "step": 861 - }, - { - "epoch": 0.6550151975683891, - "grad_norm": 2.4106903076171875, - "learning_rate": 4.624179719262342e-06, - "loss": 0.5662813186645508, - "mean_token_accuracy": 0.8061668872833252, - "num_tokens": 7872809.0, - "step": 862 - }, - { - "epoch": 0.6557750759878419, - "grad_norm": 3.5151145458221436, - "learning_rate": 4.623074574826254e-06, - "loss": 0.5471097230911255, - "mean_token_accuracy": 0.8220691084861755, - "num_tokens": 7876136.0, - "step": 863 - }, - { - "epoch": 0.6565349544072948, - "grad_norm": 1.5319840908050537, - "learning_rate": 4.621967940334705e-06, - "loss": 0.4178982377052307, - "mean_token_accuracy": 0.8517135977745056, - "num_tokens": 7886113.0, - "step": 864 - }, - { - "epoch": 0.6572948328267477, - "grad_norm": 1.63701331615448, - "learning_rate": 4.620859816564371e-06, - "loss": 0.4666512608528137, - "mean_token_accuracy": 0.8223508596420288, - "num_tokens": 7897982.0, - "step": 865 - }, - { - "epoch": 0.6580547112462006, - "grad_norm": 2.1515414714813232, - "learning_rate": 4.619750204292978e-06, - "loss": 0.5359305143356323, - "mean_token_accuracy": 0.8192868232727051, - "num_tokens": 7904947.0, - "step": 866 - }, - { - "epoch": 0.6588145896656535, - "grad_norm": 2.2140955924987793, - "learning_rate": 4.618639104299294e-06, - "loss": 0.5275633931159973, - "mean_token_accuracy": 0.8120715618133545, - "num_tokens": 7913913.0, - "step": 867 - }, - { - "epoch": 0.6595744680851063, - "grad_norm": 1.3956893682479858, - "learning_rate": 4.6175265173631304e-06, - "loss": 0.4378768503665924, - "mean_token_accuracy": 0.8479125499725342, - "num_tokens": 7927979.0, - "step": 868 - }, - { - "epoch": 0.6603343465045592, - "grad_norm": 2.98103928565979, - "learning_rate": 4.616412444265344e-06, - "loss": 0.42614591121673584, - "mean_token_accuracy": 0.8595094680786133, - "num_tokens": 7934293.0, - "step": 869 - }, - { - "epoch": 0.6610942249240122, - "grad_norm": 2.554845094680786, - "learning_rate": 4.6152968857878365e-06, - "loss": 0.3698030412197113, - "mean_token_accuracy": 0.8717041015625, - "num_tokens": 7938547.0, - "step": 870 - }, - { - "epoch": 0.6618541033434651, - "grad_norm": 3.0901825428009033, - "learning_rate": 4.6141798427135475e-06, - "loss": 0.5037497282028198, - "mean_token_accuracy": 0.8354041576385498, - "num_tokens": 7942829.0, - "step": 871 - }, - { - "epoch": 0.662613981762918, - "grad_norm": 2.8692073822021484, - "learning_rate": 4.6130613158264605e-06, - "loss": 0.5418164134025574, - "mean_token_accuracy": 0.8298909664154053, - "num_tokens": 7949303.0, - "step": 872 - }, - { - "epoch": 0.6633738601823708, - "grad_norm": 3.960404396057129, - "learning_rate": 4.611941305911602e-06, - "loss": 0.6284480094909668, - "mean_token_accuracy": 0.837495744228363, - "num_tokens": 7952486.0, - "step": 873 - }, - { - "epoch": 0.6641337386018237, - "grad_norm": 2.6690115928649902, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5214360952377319, - "mean_token_accuracy": 0.8213508129119873, - "num_tokens": 7957559.0, - "step": 874 - }, - { - "epoch": 0.6648936170212766, - "grad_norm": 2.3376171588897705, - "learning_rate": 4.609696840143875e-06, - "loss": 0.46887528896331787, - "mean_token_accuracy": 0.8438819646835327, - "num_tokens": 7962826.0, - "step": 875 - }, - { - "epoch": 0.6656534954407295, - "grad_norm": 2.2222683429718018, - "learning_rate": 4.6085723858662575e-06, - "loss": 0.5607719421386719, - "mean_token_accuracy": 0.8128405809402466, - "num_tokens": 7970131.0, - "step": 876 - }, - { - "epoch": 0.6664133738601824, - "grad_norm": 2.069091558456421, - "learning_rate": 4.607446451711372e-06, - "loss": 0.506301760673523, - "mean_token_accuracy": 0.8256827592849731, - "num_tokens": 7977524.0, - "step": 877 - }, - { - "epoch": 0.6671732522796353, - "grad_norm": 1.3724967241287231, - "learning_rate": 4.606319038469443e-06, - "loss": 0.43285101652145386, - "mean_token_accuracy": 0.8525032997131348, - "num_tokens": 7989174.0, - "step": 878 - }, - { - "epoch": 0.6679331306990881, - "grad_norm": 2.278205156326294, - "learning_rate": 4.605190146931731e-06, - "loss": 0.4845905303955078, - "mean_token_accuracy": 0.8284652829170227, - "num_tokens": 7998524.0, - "step": 879 - }, - { - "epoch": 0.668693009118541, - "grad_norm": 1.3871766328811646, - "learning_rate": 4.604059777890537e-06, - "loss": 0.5736679434776306, - "mean_token_accuracy": 0.8223285675048828, - "num_tokens": 8015776.0, - "step": 880 - }, - { - "epoch": 0.6694528875379939, - "grad_norm": 1.926164984703064, - "learning_rate": 4.602927932139197e-06, - "loss": 0.4133230447769165, - "mean_token_accuracy": 0.8653768301010132, - "num_tokens": 8022979.0, - "step": 881 - }, - { - "epoch": 0.6702127659574468, - "grad_norm": 2.109272003173828, - "learning_rate": 4.601794610472083e-06, - "loss": 0.7005600929260254, - "mean_token_accuracy": 0.7777010202407837, - "num_tokens": 8032618.0, - "step": 882 - }, - { - "epoch": 0.6709726443768997, - "grad_norm": 2.077977418899536, - "learning_rate": 4.6006598136846056e-06, - "loss": 0.5278208255767822, - "mean_token_accuracy": 0.8230358958244324, - "num_tokens": 8040534.0, - "step": 883 - }, - { - "epoch": 0.6717325227963525, - "grad_norm": 1.678581714630127, - "learning_rate": 4.599523542573207e-06, - "loss": 0.4955351650714874, - "mean_token_accuracy": 0.8270003795623779, - "num_tokens": 8052249.0, - "step": 884 - }, - { - "epoch": 0.6724924012158054, - "grad_norm": 2.0751662254333496, - "learning_rate": 4.598385797935368e-06, - "loss": 0.5266247987747192, - "mean_token_accuracy": 0.8263581991195679, - "num_tokens": 8060600.0, - "step": 885 - }, - { - "epoch": 0.6732522796352584, - "grad_norm": 2.418405771255493, - "learning_rate": 4.5972465805696e-06, - "loss": 0.4481425881385803, - "mean_token_accuracy": 0.846164345741272, - "num_tokens": 8066025.0, - "step": 886 - }, - { - "epoch": 0.6740121580547113, - "grad_norm": 2.3936474323272705, - "learning_rate": 4.596105891275449e-06, - "loss": 0.4553404450416565, - "mean_token_accuracy": 0.8412896394729614, - "num_tokens": 8071544.0, - "step": 887 - }, - { - "epoch": 0.6747720364741642, - "grad_norm": 2.2024407386779785, - "learning_rate": 4.594963730853497e-06, - "loss": 0.6218541860580444, - "mean_token_accuracy": 0.7890232801437378, - "num_tokens": 8079061.0, - "step": 888 - }, - { - "epoch": 0.675531914893617, - "grad_norm": 2.51015567779541, - "learning_rate": 4.593820100105355e-06, - "loss": 0.5149124264717102, - "mean_token_accuracy": 0.8241918087005615, - "num_tokens": 8084293.0, - "step": 889 - }, - { - "epoch": 0.6762917933130699, - "grad_norm": 1.8748939037322998, - "learning_rate": 4.5926749998336665e-06, - "loss": 0.50836181640625, - "mean_token_accuracy": 0.8067223429679871, - "num_tokens": 8092511.0, - "step": 890 - }, - { - "epoch": 0.6770516717325228, - "grad_norm": 1.801193118095398, - "learning_rate": 4.5915284308421075e-06, - "loss": 0.4372861683368683, - "mean_token_accuracy": 0.8510604500770569, - "num_tokens": 8101174.0, - "step": 891 - }, - { - "epoch": 0.6778115501519757, - "grad_norm": 2.6476457118988037, - "learning_rate": 4.590380393935383e-06, - "loss": 0.38700711727142334, - "mean_token_accuracy": 0.8659796714782715, - "num_tokens": 8105398.0, - "step": 892 - }, - { - "epoch": 0.6785714285714286, - "grad_norm": 1.1147183179855347, - "learning_rate": 4.589230889919232e-06, - "loss": 0.38546115159988403, - "mean_token_accuracy": 0.8570581674575806, - "num_tokens": 8127394.0, - "step": 893 - }, - { - "epoch": 0.6793313069908815, - "grad_norm": 2.908905506134033, - "learning_rate": 4.588079919600419e-06, - "loss": 0.5108504295349121, - "mean_token_accuracy": 0.8121406435966492, - "num_tokens": 8131801.0, - "step": 894 - }, - { - "epoch": 0.6800911854103343, - "grad_norm": 3.1522326469421387, - "learning_rate": 4.586927483786739e-06, - "loss": 0.44059112668037415, - "mean_token_accuracy": 0.8448011875152588, - "num_tokens": 8154416.0, - "step": 895 - }, - { - "epoch": 0.6808510638297872, - "grad_norm": 1.5142440795898438, - "learning_rate": 4.585773583287017e-06, - "loss": 0.513217568397522, - "mean_token_accuracy": 0.8386049270629883, - "num_tokens": 8171156.0, - "step": 896 - }, - { - "epoch": 0.6816109422492401, - "grad_norm": 2.597881317138672, - "learning_rate": 4.584618218911104e-06, - "loss": 0.4937712550163269, - "mean_token_accuracy": 0.8223681449890137, - "num_tokens": 8176124.0, - "step": 897 - }, - { - "epoch": 0.682370820668693, - "grad_norm": 1.8185619115829468, - "learning_rate": 4.583461391469879e-06, - "loss": 0.519811749458313, - "mean_token_accuracy": 0.8169777393341064, - "num_tokens": 8185136.0, - "step": 898 - }, - { - "epoch": 0.6831306990881459, - "grad_norm": 3.2061994075775146, - "learning_rate": 4.582303101775249e-06, - "loss": 0.4655115008354187, - "mean_token_accuracy": 0.8425977230072021, - "num_tokens": 8188864.0, - "step": 899 - }, - { - "epoch": 0.6838905775075987, - "grad_norm": 1.3485229015350342, - "learning_rate": 4.581143350640146e-06, - "loss": 0.5014470815658569, - "mean_token_accuracy": 0.8273109197616577, - "num_tokens": 8203460.0, - "step": 900 - }, - { - "epoch": 0.6846504559270516, - "grad_norm": 1.3264713287353516, - "learning_rate": 4.579982138878527e-06, - "loss": 0.5073703527450562, - "mean_token_accuracy": 0.8259357213973999, - "num_tokens": 8219348.0, - "step": 901 - }, - { - "epoch": 0.6854103343465046, - "grad_norm": 2.4436347484588623, - "learning_rate": 4.578819467305375e-06, - "loss": 0.47020310163497925, - "mean_token_accuracy": 0.8567265272140503, - "num_tokens": 8224427.0, - "step": 902 - }, - { - "epoch": 0.6861702127659575, - "grad_norm": 1.921749234199524, - "learning_rate": 4.5776553367367e-06, - "loss": 0.622514009475708, - "mean_token_accuracy": 0.7863982319831848, - "num_tokens": 8233151.0, - "step": 903 - }, - { - "epoch": 0.6869300911854104, - "grad_norm": 1.8815616369247437, - "learning_rate": 4.576489747989532e-06, - "loss": 0.4910545349121094, - "mean_token_accuracy": 0.8147122859954834, - "num_tokens": 8240762.0, - "step": 904 - }, - { - "epoch": 0.6876899696048632, - "grad_norm": 1.2366989850997925, - "learning_rate": 4.575322701881926e-06, - "loss": 0.3947566747665405, - "mean_token_accuracy": 0.873993992805481, - "num_tokens": 8259381.0, - "step": 905 - }, - { - "epoch": 0.6884498480243161, - "grad_norm": 1.5767735242843628, - "learning_rate": 4.57415419923296e-06, - "loss": 0.57136070728302, - "mean_token_accuracy": 0.8028088808059692, - "num_tokens": 8273296.0, - "step": 906 - }, - { - "epoch": 0.689209726443769, - "grad_norm": 2.378675699234009, - "learning_rate": 4.572984240862733e-06, - "loss": 0.5894849896430969, - "mean_token_accuracy": 0.7977708578109741, - "num_tokens": 8280083.0, - "step": 907 - }, - { - "epoch": 0.6899696048632219, - "grad_norm": 2.0401132106781006, - "learning_rate": 4.57181282759237e-06, - "loss": 0.5524613261222839, - "mean_token_accuracy": 0.8138598203659058, - "num_tokens": 8288236.0, - "step": 908 - }, - { - "epoch": 0.6907294832826748, - "grad_norm": 2.293701648712158, - "learning_rate": 4.570639960244011e-06, - "loss": 0.5154546499252319, - "mean_token_accuracy": 0.8234660625457764, - "num_tokens": 8294493.0, - "step": 909 - }, - { - "epoch": 0.6914893617021277, - "grad_norm": 1.9286527633666992, - "learning_rate": 4.56946563964082e-06, - "loss": 0.5364264845848083, - "mean_token_accuracy": 0.8147368431091309, - "num_tokens": 8303441.0, - "step": 910 - }, - { - "epoch": 0.6922492401215805, - "grad_norm": 1.2571251392364502, - "learning_rate": 4.5682898666069815e-06, - "loss": 0.43535223603248596, - "mean_token_accuracy": 0.859239935874939, - "num_tokens": 8321548.0, - "step": 911 - }, - { - "epoch": 0.6930091185410334, - "grad_norm": 1.2224860191345215, - "learning_rate": 4.567112641967697e-06, - "loss": 0.40205076336860657, - "mean_token_accuracy": 0.8724711537361145, - "num_tokens": 8335205.0, - "step": 912 - }, - { - "epoch": 0.6937689969604863, - "grad_norm": 1.2064491510391235, - "learning_rate": 4.5659339665491894e-06, - "loss": 0.37790587544441223, - "mean_token_accuracy": 0.8464339971542358, - "num_tokens": 8350926.0, - "step": 913 - }, - { - "epoch": 0.6945288753799392, - "grad_norm": 2.1755270957946777, - "learning_rate": 4.5647538411786965e-06, - "loss": 0.42034298181533813, - "mean_token_accuracy": 0.84148108959198, - "num_tokens": 8356739.0, - "step": 914 - }, - { - "epoch": 0.6952887537993921, - "grad_norm": 1.234864592552185, - "learning_rate": 4.563572266684478e-06, - "loss": 0.5062938332557678, - "mean_token_accuracy": 0.8132052421569824, - "num_tokens": 8373660.0, - "step": 915 - }, - { - "epoch": 0.6960486322188449, - "grad_norm": 2.4250621795654297, - "learning_rate": 4.562389243895807e-06, - "loss": 0.4907791018486023, - "mean_token_accuracy": 0.8337979912757874, - "num_tokens": 8378661.0, - "step": 916 - }, - { - "epoch": 0.6968085106382979, - "grad_norm": 1.5018314123153687, - "learning_rate": 4.561204773642974e-06, - "loss": 0.41041281819343567, - "mean_token_accuracy": 0.8569784164428711, - "num_tokens": 8390322.0, - "step": 917 - }, - { - "epoch": 0.6975683890577508, - "grad_norm": 2.797269344329834, - "learning_rate": 4.5600188567572874e-06, - "loss": 0.3146931529045105, - "mean_token_accuracy": 0.8913302421569824, - "num_tokens": 8393567.0, - "step": 918 - }, - { - "epoch": 0.6983282674772037, - "grad_norm": 1.4002827405929565, - "learning_rate": 4.558831494071069e-06, - "loss": 0.4275597333908081, - "mean_token_accuracy": 0.8504893779754639, - "num_tokens": 8407119.0, - "step": 919 - }, - { - "epoch": 0.6990881458966566, - "grad_norm": 1.7045831680297852, - "learning_rate": 4.557642686417654e-06, - "loss": 0.49593430757522583, - "mean_token_accuracy": 0.8185091018676758, - "num_tokens": 8417408.0, - "step": 920 - }, - { - "epoch": 0.6998480243161094, - "grad_norm": 2.8818066120147705, - "learning_rate": 4.556452434631396e-06, - "loss": 0.637908935546875, - "mean_token_accuracy": 0.7883946895599365, - "num_tokens": 8422319.0, - "step": 921 - }, - { - "epoch": 0.7006079027355623, - "grad_norm": 2.3587265014648438, - "learning_rate": 4.555260739547657e-06, - "loss": 0.38749319314956665, - "mean_token_accuracy": 0.8774704933166504, - "num_tokens": 8427315.0, - "step": 922 - }, - { - "epoch": 0.7013677811550152, - "grad_norm": 1.6648749113082886, - "learning_rate": 4.554067602002815e-06, - "loss": 0.4044865369796753, - "mean_token_accuracy": 0.8524141311645508, - "num_tokens": 8438662.0, - "step": 923 - }, - { - "epoch": 0.7021276595744681, - "grad_norm": 3.467787742614746, - "learning_rate": 4.55287302283426e-06, - "loss": 0.591016411781311, - "mean_token_accuracy": 0.81184983253479, - "num_tokens": 8442237.0, - "step": 924 - }, - { - "epoch": 0.702887537993921, - "grad_norm": 2.1458635330200195, - "learning_rate": 4.551677002880395e-06, - "loss": 0.5017476677894592, - "mean_token_accuracy": 0.822914183139801, - "num_tokens": 8449494.0, - "step": 925 - }, - { - "epoch": 0.7036474164133738, - "grad_norm": 2.521714448928833, - "learning_rate": 4.550479542980632e-06, - "loss": 0.531912088394165, - "mean_token_accuracy": 0.8225687742233276, - "num_tokens": 8454983.0, - "step": 926 - }, - { - "epoch": 0.7044072948328267, - "grad_norm": 3.5248100757598877, - "learning_rate": 4.549280643975394e-06, - "loss": 0.4631815254688263, - "mean_token_accuracy": 0.8443771600723267, - "num_tokens": 8458504.0, - "step": 927 - }, - { - "epoch": 0.7051671732522796, - "grad_norm": 2.5105819702148438, - "learning_rate": 4.548080306706114e-06, - "loss": 0.30487123131752014, - "mean_token_accuracy": 0.9018767476081848, - "num_tokens": 8462589.0, - "step": 928 - }, - { - "epoch": 0.7059270516717325, - "grad_norm": 1.3367713689804077, - "learning_rate": 4.5468785320152365e-06, - "loss": 0.4355026185512543, - "mean_token_accuracy": 0.8323584794998169, - "num_tokens": 8478450.0, - "step": 929 - }, - { - "epoch": 0.7066869300911854, - "grad_norm": 2.2506282329559326, - "learning_rate": 4.545675320746212e-06, - "loss": 0.5082957744598389, - "mean_token_accuracy": 0.823430597782135, - "num_tokens": 8485991.0, - "step": 930 - }, - { - "epoch": 0.7074468085106383, - "grad_norm": 1.7164632081985474, - "learning_rate": 4.544470673743502e-06, - "loss": 0.3960164785385132, - "mean_token_accuracy": 0.8592486381530762, - "num_tokens": 8495217.0, - "step": 931 - }, - { - "epoch": 0.7082066869300911, - "grad_norm": 1.5864969491958618, - "learning_rate": 4.543264591852572e-06, - "loss": 0.49114471673965454, - "mean_token_accuracy": 0.8330780267715454, - "num_tokens": 8508904.0, - "step": 932 - }, - { - "epoch": 0.708966565349544, - "grad_norm": 2.1707003116607666, - "learning_rate": 4.542057075919898e-06, - "loss": 0.49895772337913513, - "mean_token_accuracy": 0.8327431082725525, - "num_tokens": 8515792.0, - "step": 933 - }, - { - "epoch": 0.709726443768997, - "grad_norm": 1.9002083539962769, - "learning_rate": 4.54084812679296e-06, - "loss": 0.4548531472682953, - "mean_token_accuracy": 0.834532618522644, - "num_tokens": 8524006.0, - "step": 934 - }, - { - "epoch": 0.7104863221884499, - "grad_norm": 1.8505141735076904, - "learning_rate": 4.539637745320247e-06, - "loss": 0.35716521739959717, - "mean_token_accuracy": 0.872222900390625, - "num_tokens": 8533647.0, - "step": 935 - }, - { - "epoch": 0.7112462006079028, - "grad_norm": 2.092620849609375, - "learning_rate": 4.53842593235125e-06, - "loss": 0.4673694372177124, - "mean_token_accuracy": 0.8460999131202698, - "num_tokens": 8540734.0, - "step": 936 - }, - { - "epoch": 0.7120060790273556, - "grad_norm": 2.689514636993408, - "learning_rate": 4.537212688736466e-06, - "loss": 0.45461273193359375, - "mean_token_accuracy": 0.8450704216957092, - "num_tokens": 8544948.0, - "step": 937 - }, - { - "epoch": 0.7127659574468085, - "grad_norm": 2.4507734775543213, - "learning_rate": 4.535998015327396e-06, - "loss": 0.4571906626224518, - "mean_token_accuracy": 0.8429360389709473, - "num_tokens": 8550445.0, - "step": 938 - }, - { - "epoch": 0.7135258358662614, - "grad_norm": 1.8960013389587402, - "learning_rate": 4.534781912976546e-06, - "loss": 0.4461391568183899, - "mean_token_accuracy": 0.8487973213195801, - "num_tokens": 8557630.0, - "step": 939 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 1.602611780166626, - "learning_rate": 4.533564382537421e-06, - "loss": 0.5277102589607239, - "mean_token_accuracy": 0.8330916166305542, - "num_tokens": 8570397.0, - "step": 940 - }, - { - "epoch": 0.7150455927051672, - "grad_norm": 1.8936395645141602, - "learning_rate": 4.532345424864533e-06, - "loss": 0.38619571924209595, - "mean_token_accuracy": 0.8514572381973267, - "num_tokens": 8582673.0, - "step": 941 - }, - { - "epoch": 0.71580547112462, - "grad_norm": 1.3898619413375854, - "learning_rate": 4.531125040813392e-06, - "loss": 0.4825032949447632, - "mean_token_accuracy": 0.833012580871582, - "num_tokens": 8597239.0, - "step": 942 - }, - { - "epoch": 0.7165653495440729, - "grad_norm": 2.128230571746826, - "learning_rate": 4.529903231240511e-06, - "loss": 0.4862118065357208, - "mean_token_accuracy": 0.8210917711257935, - "num_tokens": 8605877.0, - "step": 943 - }, - { - "epoch": 0.7173252279635258, - "grad_norm": 1.6552259922027588, - "learning_rate": 4.528679997003403e-06, - "loss": 0.5092059373855591, - "mean_token_accuracy": 0.8247389793395996, - "num_tokens": 8617060.0, - "step": 944 - }, - { - "epoch": 0.7180851063829787, - "grad_norm": 2.1174771785736084, - "learning_rate": 4.52745533896058e-06, - "loss": 0.39110174775123596, - "mean_token_accuracy": 0.8672944903373718, - "num_tokens": 8623306.0, - "step": 945 - }, - { - "epoch": 0.7188449848024316, - "grad_norm": 2.8648383617401123, - "learning_rate": 4.526229257971556e-06, - "loss": 0.49864327907562256, - "mean_token_accuracy": 0.8305130004882812, - "num_tokens": 8627466.0, - "step": 946 - }, - { - "epoch": 0.7196048632218845, - "grad_norm": 2.155514717102051, - "learning_rate": 4.52500175489684e-06, - "loss": 0.5070191025733948, - "mean_token_accuracy": 0.8311188817024231, - "num_tokens": 8634759.0, - "step": 947 - }, - { - "epoch": 0.7203647416413373, - "grad_norm": 1.8432683944702148, - "learning_rate": 4.523772830597942e-06, - "loss": 0.5569252371788025, - "mean_token_accuracy": 0.8070821762084961, - "num_tokens": 8644160.0, - "step": 948 - }, - { - "epoch": 0.7211246200607903, - "grad_norm": 2.8912241458892822, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4799427390098572, - "mean_token_accuracy": 0.8443552851676941, - "num_tokens": 8648377.0, - "step": 949 - }, - { - "epoch": 0.7218844984802432, - "grad_norm": 3.3449625968933105, - "learning_rate": 4.521310721778622e-06, - "loss": 0.44043463468551636, - "mean_token_accuracy": 0.8521315455436707, - "num_tokens": 8651846.0, - "step": 950 - }, - { - "epoch": 0.7226443768996961, - "grad_norm": 1.4127917289733887, - "learning_rate": 4.520077538986203e-06, - "loss": 0.4700999855995178, - "mean_token_accuracy": 0.8377952575683594, - "num_tokens": 8665199.0, - "step": 951 - }, - { - "epoch": 0.723404255319149, - "grad_norm": 2.1607301235198975, - "learning_rate": 4.518842938425606e-06, - "loss": 0.4374256730079651, - "mean_token_accuracy": 0.8448896408081055, - "num_tokens": 8672158.0, - "step": 952 - }, - { - "epoch": 0.7241641337386018, - "grad_norm": 1.3442779779434204, - "learning_rate": 4.51760692096332e-06, - "loss": 0.38948923349380493, - "mean_token_accuracy": 0.8598923683166504, - "num_tokens": 8684532.0, - "step": 953 - }, - { - "epoch": 0.7249240121580547, - "grad_norm": 2.0003178119659424, - "learning_rate": 4.516369487466832e-06, - "loss": 0.3797217011451721, - "mean_token_accuracy": 0.8652102947235107, - "num_tokens": 8691460.0, - "step": 954 - }, - { - "epoch": 0.7256838905775076, - "grad_norm": 1.8196535110473633, - "learning_rate": 4.5151306388046175e-06, - "loss": 0.5676811933517456, - "mean_token_accuracy": 0.818500816822052, - "num_tokens": 8701624.0, - "step": 955 - }, - { - "epoch": 0.7264437689969605, - "grad_norm": 2.1962296962738037, - "learning_rate": 4.513890375846152e-06, - "loss": 0.45399484038352966, - "mean_token_accuracy": 0.8463879227638245, - "num_tokens": 8707410.0, - "step": 956 - }, - { - "epoch": 0.7272036474164134, - "grad_norm": 1.8798872232437134, - "learning_rate": 4.512648699461897e-06, - "loss": 0.5679811239242554, - "mean_token_accuracy": 0.8089900016784668, - "num_tokens": 8715630.0, - "step": 957 - }, - { - "epoch": 0.7279635258358662, - "grad_norm": 2.3540258407592773, - "learning_rate": 4.511405610523309e-06, - "loss": 0.5282865762710571, - "mean_token_accuracy": 0.8196114301681519, - "num_tokens": 8721934.0, - "step": 958 - }, - { - "epoch": 0.7287234042553191, - "grad_norm": 2.5630908012390137, - "learning_rate": 4.510161109902837e-06, - "loss": 0.39442378282546997, - "mean_token_accuracy": 0.8400980830192566, - "num_tokens": 8726511.0, - "step": 959 - }, - { - "epoch": 0.729483282674772, - "grad_norm": 1.9829226732254028, - "learning_rate": 4.508915198473919e-06, - "loss": 0.4611976742744446, - "mean_token_accuracy": 0.8439624309539795, - "num_tokens": 8733460.0, - "step": 960 - }, - { - "epoch": 0.7302431610942249, - "grad_norm": 3.0291950702667236, - "learning_rate": 4.507667877110982e-06, - "loss": 0.5158340930938721, - "mean_token_accuracy": 0.8300060033798218, - "num_tokens": 8737629.0, - "step": 961 - }, - { - "epoch": 0.7310030395136778, - "grad_norm": 1.9208252429962158, - "learning_rate": 4.506419146689445e-06, - "loss": 0.3807099163532257, - "mean_token_accuracy": 0.871469259262085, - "num_tokens": 8744615.0, - "step": 962 - }, - { - "epoch": 0.7317629179331308, - "grad_norm": 3.051565408706665, - "learning_rate": 4.505169008085717e-06, - "loss": 0.38461726903915405, - "mean_token_accuracy": 0.874465823173523, - "num_tokens": 8748154.0, - "step": 963 - }, - { - "epoch": 0.7325227963525835, - "grad_norm": 1.375466227531433, - "learning_rate": 4.503917462177192e-06, - "loss": 0.42490679025650024, - "mean_token_accuracy": 0.8457326889038086, - "num_tokens": 8760965.0, - "step": 964 - }, - { - "epoch": 0.7332826747720365, - "grad_norm": 2.216681957244873, - "learning_rate": 4.5026645098422515e-06, - "loss": 0.43149900436401367, - "mean_token_accuracy": 0.8527278900146484, - "num_tokens": 8766996.0, - "step": 965 - }, - { - "epoch": 0.7340425531914894, - "grad_norm": 1.9422595500946045, - "learning_rate": 4.5014101519602684e-06, - "loss": 0.4964504539966583, - "mean_token_accuracy": 0.8137556314468384, - "num_tokens": 8774411.0, - "step": 966 - }, - { - "epoch": 0.7348024316109423, - "grad_norm": 2.058887004852295, - "learning_rate": 4.500154389411598e-06, - "loss": 0.4977570176124573, - "mean_token_accuracy": 0.8254626989364624, - "num_tokens": 8782220.0, - "step": 967 - }, - { - "epoch": 0.7355623100303952, - "grad_norm": 2.9977786540985107, - "learning_rate": 4.498897223077582e-06, - "loss": 0.4061415195465088, - "mean_token_accuracy": 0.8752427101135254, - "num_tokens": 8786120.0, - "step": 968 - }, - { - "epoch": 0.736322188449848, - "grad_norm": 2.2636303901672363, - "learning_rate": 4.49763865384055e-06, - "loss": 0.5062161087989807, - "mean_token_accuracy": 0.8171653747558594, - "num_tokens": 8792459.0, - "step": 969 - }, - { - "epoch": 0.7370820668693009, - "grad_norm": 1.8850842714309692, - "learning_rate": 4.496378682583813e-06, - "loss": 0.5014280676841736, - "mean_token_accuracy": 0.8547511100769043, - "num_tokens": 8800675.0, - "step": 970 - }, - { - "epoch": 0.7378419452887538, - "grad_norm": 1.191985011100769, - "learning_rate": 4.495117310191667e-06, - "loss": 0.4713883101940155, - "mean_token_accuracy": 0.8213596343994141, - "num_tokens": 8820740.0, - "step": 971 - }, - { - "epoch": 0.7386018237082067, - "grad_norm": 1.823000192642212, - "learning_rate": 4.493854537549393e-06, - "loss": 0.46332645416259766, - "mean_token_accuracy": 0.8359860777854919, - "num_tokens": 8828884.0, - "step": 972 - }, - { - "epoch": 0.7393617021276596, - "grad_norm": 2.590446949005127, - "learning_rate": 4.492590365543253e-06, - "loss": 0.49074703454971313, - "mean_token_accuracy": 0.8433758020401001, - "num_tokens": 8833859.0, - "step": 973 - }, - { - "epoch": 0.7401215805471124, - "grad_norm": 2.2762670516967773, - "learning_rate": 4.491324795060491e-06, - "loss": 0.39465656876564026, - "mean_token_accuracy": 0.8734766244888306, - "num_tokens": 8839350.0, - "step": 974 - }, - { - "epoch": 0.7408814589665653, - "grad_norm": 2.698725461959839, - "learning_rate": 4.490057826989333e-06, - "loss": 0.5552085041999817, - "mean_token_accuracy": 0.8132266998291016, - "num_tokens": 8844373.0, - "step": 975 - }, - { - "epoch": 0.7416413373860182, - "grad_norm": 2.704606294631958, - "learning_rate": 4.488789462218988e-06, - "loss": 0.3447791635990143, - "mean_token_accuracy": 0.8736170530319214, - "num_tokens": 8848236.0, - "step": 976 - }, - { - "epoch": 0.7424012158054711, - "grad_norm": 3.1260716915130615, - "learning_rate": 4.487519701639641e-06, - "loss": 0.5945233702659607, - "mean_token_accuracy": 0.7997599840164185, - "num_tokens": 8852935.0, - "step": 977 - }, - { - "epoch": 0.743161094224924, - "grad_norm": 1.6895452737808228, - "learning_rate": 4.486248546142459e-06, - "loss": 0.4823892116546631, - "mean_token_accuracy": 0.8279662132263184, - "num_tokens": 8861743.0, - "step": 978 - }, - { - "epoch": 0.743920972644377, - "grad_norm": 1.9161452054977417, - "learning_rate": 4.4849759966195885e-06, - "loss": 0.5266581773757935, - "mean_token_accuracy": 0.8218623399734497, - "num_tokens": 8870601.0, - "step": 979 - }, - { - "epoch": 0.7446808510638298, - "grad_norm": 1.6894301176071167, - "learning_rate": 4.483702053964154e-06, - "loss": 0.4186219573020935, - "mean_token_accuracy": 0.8471781015396118, - "num_tokens": 8885617.0, - "step": 980 - }, - { - "epoch": 0.7454407294832827, - "grad_norm": 1.6319992542266846, - "learning_rate": 4.482426719070258e-06, - "loss": 0.541317880153656, - "mean_token_accuracy": 0.8216162323951721, - "num_tokens": 8897595.0, - "step": 981 - }, - { - "epoch": 0.7462006079027356, - "grad_norm": 5.102413177490234, - "learning_rate": 4.4811499928329775e-06, - "loss": 0.3928517699241638, - "mean_token_accuracy": 0.858033299446106, - "num_tokens": 8901682.0, - "step": 982 - }, - { - "epoch": 0.7469604863221885, - "grad_norm": 2.213860273361206, - "learning_rate": 4.479871876148368e-06, - "loss": 0.4276347756385803, - "mean_token_accuracy": 0.8529798984527588, - "num_tokens": 8908088.0, - "step": 983 - }, - { - "epoch": 0.7477203647416414, - "grad_norm": 1.2180038690567017, - "learning_rate": 4.478592369913464e-06, - "loss": 0.3941590189933777, - "mean_token_accuracy": 0.8608149290084839, - "num_tokens": 8925876.0, - "step": 984 - }, - { - "epoch": 0.7484802431610942, - "grad_norm": 2.849802255630493, - "learning_rate": 4.477311475026271e-06, - "loss": 0.42190325260162354, - "mean_token_accuracy": 0.860505223274231, - "num_tokens": 8930190.0, - "step": 985 - }, - { - "epoch": 0.7492401215805471, - "grad_norm": 1.704128384590149, - "learning_rate": 4.476029192385769e-06, - "loss": 0.4786282777786255, - "mean_token_accuracy": 0.8302322626113892, - "num_tokens": 8938340.0, - "step": 986 - }, - { - "epoch": 0.75, - "grad_norm": 2.06322979927063, - "learning_rate": 4.474745522891915e-06, - "loss": 0.4648786187171936, - "mean_token_accuracy": 0.8366481065750122, - "num_tokens": 8944633.0, - "step": 987 - }, - { - "epoch": 0.7507598784194529, - "grad_norm": 2.0745396614074707, - "learning_rate": 4.473460467445637e-06, - "loss": 0.5744885206222534, - "mean_token_accuracy": 0.8357284069061279, - "num_tokens": 8954457.0, - "step": 988 - }, - { - "epoch": 0.7515197568389058, - "grad_norm": 1.9281407594680786, - "learning_rate": 4.472174026948836e-06, - "loss": 0.528974175453186, - "mean_token_accuracy": 0.8083580732345581, - "num_tokens": 8962701.0, - "step": 989 - }, - { - "epoch": 0.7522796352583586, - "grad_norm": 3.012381076812744, - "learning_rate": 4.470886202304385e-06, - "loss": 0.48754751682281494, - "mean_token_accuracy": 0.8368391990661621, - "num_tokens": 8967272.0, - "step": 990 - }, - { - "epoch": 0.7530395136778115, - "grad_norm": 1.691826581954956, - "learning_rate": 4.469596994416131e-06, - "loss": 0.484740674495697, - "mean_token_accuracy": 0.8500643968582153, - "num_tokens": 8976615.0, - "step": 991 - }, - { - "epoch": 0.7537993920972644, - "grad_norm": 2.4961965084075928, - "learning_rate": 4.468306404188887e-06, - "loss": 0.50777268409729, - "mean_token_accuracy": 0.8168395757675171, - "num_tokens": 8983235.0, - "step": 992 - }, - { - "epoch": 0.7545592705167173, - "grad_norm": 1.512007713317871, - "learning_rate": 4.467014432528441e-06, - "loss": 0.4583340287208557, - "mean_token_accuracy": 0.8465162515640259, - "num_tokens": 8993815.0, - "step": 993 - }, - { - "epoch": 0.7553191489361702, - "grad_norm": 1.9362257719039917, - "learning_rate": 4.465721080341547e-06, - "loss": 0.6027892827987671, - "mean_token_accuracy": 0.8052380084991455, - "num_tokens": 9002697.0, - "step": 994 - }, - { - "epoch": 0.756079027355623, - "grad_norm": 2.473632335662842, - "learning_rate": 4.4644263485359316e-06, - "loss": 0.5394320487976074, - "mean_token_accuracy": 0.834665834903717, - "num_tokens": 9007428.0, - "step": 995 - }, - { - "epoch": 0.756838905775076, - "grad_norm": 2.2527434825897217, - "learning_rate": 4.463130238020284e-06, - "loss": 0.5485198497772217, - "mean_token_accuracy": 0.8090173006057739, - "num_tokens": 9013570.0, - "step": 996 - }, - { - "epoch": 0.7575987841945289, - "grad_norm": 1.4130940437316895, - "learning_rate": 4.4618327497042676e-06, - "loss": 0.37994423508644104, - "mean_token_accuracy": 0.8625167012214661, - "num_tokens": 9025485.0, - "step": 997 - }, - { - "epoch": 0.7583586626139818, - "grad_norm": 2.685115098953247, - "learning_rate": 4.460533884498509e-06, - "loss": 0.447973370552063, - "mean_token_accuracy": 0.8564165234565735, - "num_tokens": 9030355.0, - "step": 998 - }, - { - "epoch": 0.7591185410334347, - "grad_norm": 3.2743139266967773, - "learning_rate": 4.4592336433146e-06, - "loss": 0.45275989174842834, - "mean_token_accuracy": 0.8462578058242798, - "num_tokens": 9034406.0, - "step": 999 - }, - { - "epoch": 0.7598784194528876, - "grad_norm": 1.9383049011230469, - "learning_rate": 4.457932027065102e-06, - "loss": 0.5387729406356812, - "mean_token_accuracy": 0.8357330560684204, - "num_tokens": 9041502.0, - "step": 1000 - } - ], - "logging_steps": 1.0, - "max_steps": 3948, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 9.855721706985882e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin deleted file mode 100644 index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000 --- a/checkpoint-1000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 -size 6225 diff --git a/checkpoint-2000/chat_template.jinja b/checkpoint-2000/chat_template.jinja deleted file mode 100644 index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000 --- a/checkpoint-2000/chat_template.jinja +++ /dev/null @@ -1,61 +0,0 @@ -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} - {%- endif %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} \ No newline at end of file diff --git a/checkpoint-2000/config.json b/checkpoint-2000/config.json deleted file mode 100644 index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000 --- a/checkpoint-2000/config.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": null, - "dtype": "float32", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 262144, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "pad_token_id": 151662, - "rms_norm_eps": 1e-06, - "rope_parameters": { - "rope_theta": 5000000, - "rope_type": "default" - }, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "5.5.3", - "use_cache": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/checkpoint-2000/generation_config.json b/checkpoint-2000/generation_config.json deleted file mode 100644 index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000 --- a/checkpoint-2000/generation_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "do_sample": true, - "eos_token_id": [ - 151645, - 151643 - ], - "pad_token_id": 151662, - "temperature": 0.7, - "top_k": 20, - "top_p": 0.8, - "transformers_version": "5.5.3" -} diff --git a/checkpoint-2000/model.safetensors b/checkpoint-2000/model.safetensors deleted file mode 100644 index 7d26c4ee9aa6752602ff6db19d02edfff6e062f7..0000000000000000000000000000000000000000 --- a/checkpoint-2000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b1ce241be74f81ade1793d7d1184e1cf7ce2e9afe46f5dd9418012bd1861b43 -size 17645743048 diff --git a/checkpoint-2000/optimizer.bin b/checkpoint-2000/optimizer.bin deleted file mode 100644 index 9bf26616282816435a39edb78ec22ebe2461696f..0000000000000000000000000000000000000000 --- a/checkpoint-2000/optimizer.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07e07657f743306d7736d8218c799dfc731283d7dedfca7eb48d4dcc64c64623 -size 32180124005 diff --git a/checkpoint-2000/pytorch_model_fsdp.bin b/checkpoint-2000/pytorch_model_fsdp.bin deleted file mode 100644 index 675400f377bfee7718a7693c8e10f410f7ec7242..0000000000000000000000000000000000000000 --- a/checkpoint-2000/pytorch_model_fsdp.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27df8f98b77baf9afbd9bdac0a9ff6cc9e53f4d44310a5d8c665d45656911b2e -size 17645897996 diff --git a/checkpoint-2000/rng_state_0.pth b/checkpoint-2000/rng_state_0.pth deleted file mode 100644 index 870021e3fa5ac35c2f711adf0c93a556ab4842da..0000000000000000000000000000000000000000 --- a/checkpoint-2000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95e5fc2074c0df31522a514f862c86cb00d71c946a7f15cc9ec0e53a69fb28a7 -size 14917 diff --git a/checkpoint-2000/rng_state_1.pth b/checkpoint-2000/rng_state_1.pth deleted file mode 100644 index 21f20da5eb1da017f08aaa88bd19cf24d40e3fbf..0000000000000000000000000000000000000000 --- a/checkpoint-2000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e7153eae67b6c9232a41bc996a2bf5b83229b8c7230d61911ac0fd40e64154e -size 14917 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt deleted file mode 100644 index 19b70f1f9806f4fbfc99dc7c3b253116b6abd505..0000000000000000000000000000000000000000 --- a/checkpoint-2000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c70c34042f727a1ef06eb662d77f90fe87f01cf21415dce97c8cb4c779b5625 -size 1465 diff --git a/checkpoint-2000/tokenizer.json b/checkpoint-2000/tokenizer.json deleted file mode 100644 index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000 --- a/checkpoint-2000/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 -size 11422650 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json deleted file mode 100644 index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000 --- a/checkpoint-2000/tokenizer_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "add_prefix_space": false, - "backend": "tokenizers", - "bos_token": null, - "clean_up_tokenization_spaces": false, - "eos_token": "<|im_end|>", - "errors": "replace", - "extra_special_tokens": [ - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>" - ], - "is_local": false, - "model_max_length": 1010000, - "pad_token": "<|fim_pad|>", - "split_special_tokens": false, - "tokenizer_class": "Qwen2Tokenizer", - "unk_token": null -} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json deleted file mode 100644 index 18d493bffa53c88bc213582a98da8699d575acdc..0000000000000000000000000000000000000000 --- a/checkpoint-2000/trainer_state.json +++ /dev/null @@ -1,18034 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.5197568389057752, - "eval_steps": 500, - "global_step": 2000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0007598784194528875, - "grad_norm": 11.767926216125488, - "learning_rate": 0.0, - "loss": 0.7937269806861877, - "mean_token_accuracy": 0.7822731137275696, - "num_tokens": 10507.0, - "step": 1 - }, - { - "epoch": 0.001519756838905775, - "grad_norm": 14.9199800491333, - "learning_rate": 2.5252525252525256e-08, - "loss": 0.7665389776229858, - "mean_token_accuracy": 0.8342233300209045, - "num_tokens": 14806.0, - "step": 2 - }, - { - "epoch": 0.0022796352583586625, - "grad_norm": 11.991217613220215, - "learning_rate": 5.050505050505051e-08, - "loss": 0.9597002267837524, - "mean_token_accuracy": 0.7054992318153381, - "num_tokens": 27170.0, - "step": 3 - }, - { - "epoch": 0.00303951367781155, - "grad_norm": 12.958333015441895, - "learning_rate": 7.575757575757576e-08, - "loss": 0.9971482753753662, - "mean_token_accuracy": 0.7261134386062622, - "num_tokens": 33729.0, - "step": 4 - }, - { - "epoch": 0.003799392097264438, - "grad_norm": 13.5665283203125, - "learning_rate": 1.0101010101010103e-07, - "loss": 0.9504883885383606, - "mean_token_accuracy": 0.745307445526123, - "num_tokens": 41174.0, - "step": 5 - }, - { - "epoch": 0.004559270516717325, - "grad_norm": 10.09444808959961, - "learning_rate": 1.2626262626262626e-07, - "loss": 0.759548008441925, - "mean_token_accuracy": 0.7842121124267578, - "num_tokens": 47943.0, - "step": 6 - }, - { - "epoch": 0.005319148936170213, - "grad_norm": 10.741650581359863, - "learning_rate": 1.5151515151515152e-07, - "loss": 0.8231598138809204, - "mean_token_accuracy": 0.7550969123840332, - "num_tokens": 56665.0, - "step": 7 - }, - { - "epoch": 0.0060790273556231, - "grad_norm": 12.250170707702637, - "learning_rate": 1.767676767676768e-07, - "loss": 0.8576581478118896, - "mean_token_accuracy": 0.7568671703338623, - "num_tokens": 67606.0, - "step": 8 - }, - { - "epoch": 0.006838905775075988, - "grad_norm": 12.828629493713379, - "learning_rate": 2.0202020202020205e-07, - "loss": 0.9886435866355896, - "mean_token_accuracy": 0.733400285243988, - "num_tokens": 74272.0, - "step": 9 - }, - { - "epoch": 0.007598784194528876, - "grad_norm": 15.966923713684082, - "learning_rate": 2.2727272727272729e-07, - "loss": 1.064985990524292, - "mean_token_accuracy": 0.7101132869720459, - "num_tokens": 80524.0, - "step": 10 - }, - { - "epoch": 0.008358662613981762, - "grad_norm": 10.864850044250488, - "learning_rate": 2.525252525252525e-07, - "loss": 0.8311550617218018, - "mean_token_accuracy": 0.7431639432907104, - "num_tokens": 96292.0, - "step": 11 - }, - { - "epoch": 0.00911854103343465, - "grad_norm": 16.438785552978516, - "learning_rate": 2.7777777777777776e-07, - "loss": 1.0579866170883179, - "mean_token_accuracy": 0.7222976684570312, - "num_tokens": 102992.0, - "step": 12 - }, - { - "epoch": 0.009878419452887538, - "grad_norm": 11.179214477539062, - "learning_rate": 3.0303030303030305e-07, - "loss": 0.9816144704818726, - "mean_token_accuracy": 0.7206371426582336, - "num_tokens": 113571.0, - "step": 13 - }, - { - "epoch": 0.010638297872340425, - "grad_norm": 12.780299186706543, - "learning_rate": 3.2828282828282834e-07, - "loss": 0.847449004650116, - "mean_token_accuracy": 0.7826199531555176, - "num_tokens": 119568.0, - "step": 14 - }, - { - "epoch": 0.011398176291793313, - "grad_norm": 14.800421714782715, - "learning_rate": 3.535353535353536e-07, - "loss": 0.9275516271591187, - "mean_token_accuracy": 0.7655045986175537, - "num_tokens": 126258.0, - "step": 15 - }, - { - "epoch": 0.0121580547112462, - "grad_norm": 11.267602920532227, - "learning_rate": 3.787878787878788e-07, - "loss": 0.8464037179946899, - "mean_token_accuracy": 0.7606508731842041, - "num_tokens": 136831.0, - "step": 16 - }, - { - "epoch": 0.012917933130699088, - "grad_norm": 12.891013145446777, - "learning_rate": 4.040404040404041e-07, - "loss": 0.9903074502944946, - "mean_token_accuracy": 0.7247487306594849, - "num_tokens": 150434.0, - "step": 17 - }, - { - "epoch": 0.013677811550151976, - "grad_norm": 11.13957691192627, - "learning_rate": 4.2929292929292934e-07, - "loss": 0.8287211656570435, - "mean_token_accuracy": 0.7621913552284241, - "num_tokens": 158516.0, - "step": 18 - }, - { - "epoch": 0.014437689969604863, - "grad_norm": 18.39569664001465, - "learning_rate": 4.5454545454545457e-07, - "loss": 1.150015115737915, - "mean_token_accuracy": 0.7349498271942139, - "num_tokens": 162214.0, - "step": 19 - }, - { - "epoch": 0.015197568389057751, - "grad_norm": 9.353750228881836, - "learning_rate": 4.797979797979798e-07, - "loss": 0.7228299379348755, - "mean_token_accuracy": 0.7969573736190796, - "num_tokens": 173035.0, - "step": 20 - }, - { - "epoch": 0.015957446808510637, - "grad_norm": 8.267163276672363, - "learning_rate": 5.05050505050505e-07, - "loss": 0.7358136177062988, - "mean_token_accuracy": 0.7903937101364136, - "num_tokens": 183568.0, - "step": 21 - }, - { - "epoch": 0.016717325227963525, - "grad_norm": 11.137128829956055, - "learning_rate": 5.303030303030304e-07, - "loss": 1.0075397491455078, - "mean_token_accuracy": 0.702807605266571, - "num_tokens": 192759.0, - "step": 22 - }, - { - "epoch": 0.017477203647416412, - "grad_norm": 10.734103202819824, - "learning_rate": 5.555555555555555e-07, - "loss": 0.8925919532775879, - "mean_token_accuracy": 0.7475671768188477, - "num_tokens": 201280.0, - "step": 23 - }, - { - "epoch": 0.0182370820668693, - "grad_norm": 11.945566177368164, - "learning_rate": 5.808080808080809e-07, - "loss": 0.7260514497756958, - "mean_token_accuracy": 0.7859152555465698, - "num_tokens": 218053.0, - "step": 24 - }, - { - "epoch": 0.018996960486322188, - "grad_norm": 18.610652923583984, - "learning_rate": 6.060606060606061e-07, - "loss": 0.8995465636253357, - "mean_token_accuracy": 0.7931990623474121, - "num_tokens": 220953.0, - "step": 25 - }, - { - "epoch": 0.019756838905775075, - "grad_norm": 10.51898193359375, - "learning_rate": 6.313131313131314e-07, - "loss": 0.9532671570777893, - "mean_token_accuracy": 0.7257645726203918, - "num_tokens": 231200.0, - "step": 26 - }, - { - "epoch": 0.020516717325227963, - "grad_norm": 9.581812858581543, - "learning_rate": 6.565656565656567e-07, - "loss": 0.9038010239601135, - "mean_token_accuracy": 0.7390379905700684, - "num_tokens": 237711.0, - "step": 27 - }, - { - "epoch": 0.02127659574468085, - "grad_norm": 12.297484397888184, - "learning_rate": 6.818181818181818e-07, - "loss": 1.048936367034912, - "mean_token_accuracy": 0.7175670862197876, - "num_tokens": 242503.0, - "step": 28 - }, - { - "epoch": 0.022036474164133738, - "grad_norm": 7.437953472137451, - "learning_rate": 7.070707070707071e-07, - "loss": 0.8308826684951782, - "mean_token_accuracy": 0.7415335774421692, - "num_tokens": 250842.0, - "step": 29 - }, - { - "epoch": 0.022796352583586626, - "grad_norm": 6.134475231170654, - "learning_rate": 7.323232323232324e-07, - "loss": 0.647913932800293, - "mean_token_accuracy": 0.8124054670333862, - "num_tokens": 267453.0, - "step": 30 - }, - { - "epoch": 0.023556231003039513, - "grad_norm": 6.678966045379639, - "learning_rate": 7.575757575757576e-07, - "loss": 0.7052810192108154, - "mean_token_accuracy": 0.7908754348754883, - "num_tokens": 284416.0, - "step": 31 - }, - { - "epoch": 0.0243161094224924, - "grad_norm": 7.42232084274292, - "learning_rate": 7.82828282828283e-07, - "loss": 1.022383213043213, - "mean_token_accuracy": 0.7053230404853821, - "num_tokens": 292073.0, - "step": 32 - }, - { - "epoch": 0.02507598784194529, - "grad_norm": 6.463219165802002, - "learning_rate": 8.080808080808082e-07, - "loss": 0.7603012323379517, - "mean_token_accuracy": 0.7728140354156494, - "num_tokens": 298550.0, - "step": 33 - }, - { - "epoch": 0.025835866261398176, - "grad_norm": 5.668411731719971, - "learning_rate": 8.333333333333333e-07, - "loss": 0.7707852721214294, - "mean_token_accuracy": 0.7827773094177246, - "num_tokens": 306683.0, - "step": 34 - }, - { - "epoch": 0.026595744680851064, - "grad_norm": 4.984964847564697, - "learning_rate": 8.585858585858587e-07, - "loss": 0.6317349672317505, - "mean_token_accuracy": 0.8106861114501953, - "num_tokens": 318842.0, - "step": 35 - }, - { - "epoch": 0.02735562310030395, - "grad_norm": 4.421732425689697, - "learning_rate": 8.838383838383839e-07, - "loss": 0.6228617429733276, - "mean_token_accuracy": 0.8023355603218079, - "num_tokens": 329850.0, - "step": 36 - }, - { - "epoch": 0.02811550151975684, - "grad_norm": 5.970808029174805, - "learning_rate": 9.090909090909091e-07, - "loss": 0.8443238139152527, - "mean_token_accuracy": 0.7462409734725952, - "num_tokens": 335844.0, - "step": 37 - }, - { - "epoch": 0.028875379939209727, - "grad_norm": 4.5389084815979, - "learning_rate": 9.343434343434345e-07, - "loss": 0.6976436376571655, - "mean_token_accuracy": 0.790410041809082, - "num_tokens": 348768.0, - "step": 38 - }, - { - "epoch": 0.029635258358662615, - "grad_norm": 4.116631507873535, - "learning_rate": 9.595959595959596e-07, - "loss": 0.6698519587516785, - "mean_token_accuracy": 0.7818127870559692, - "num_tokens": 355460.0, - "step": 39 - }, - { - "epoch": 0.030395136778115502, - "grad_norm": 3.3714773654937744, - "learning_rate": 9.84848484848485e-07, - "loss": 0.5723201036453247, - "mean_token_accuracy": 0.8100086450576782, - "num_tokens": 368507.0, - "step": 40 - }, - { - "epoch": 0.03115501519756839, - "grad_norm": 4.4438347816467285, - "learning_rate": 1.01010101010101e-06, - "loss": 0.7508786916732788, - "mean_token_accuracy": 0.7711942791938782, - "num_tokens": 376467.0, - "step": 41 - }, - { - "epoch": 0.031914893617021274, - "grad_norm": 5.609974384307861, - "learning_rate": 1.0353535353535354e-06, - "loss": 0.566256046295166, - "mean_token_accuracy": 0.8319284319877625, - "num_tokens": 381399.0, - "step": 42 - }, - { - "epoch": 0.03267477203647416, - "grad_norm": 5.124386787414551, - "learning_rate": 1.0606060606060608e-06, - "loss": 0.8151067495346069, - "mean_token_accuracy": 0.7537785768508911, - "num_tokens": 387389.0, - "step": 43 - }, - { - "epoch": 0.03343465045592705, - "grad_norm": 3.6318116188049316, - "learning_rate": 1.085858585858586e-06, - "loss": 0.5989949107170105, - "mean_token_accuracy": 0.8129256963729858, - "num_tokens": 395302.0, - "step": 44 - }, - { - "epoch": 0.03419452887537994, - "grad_norm": 2.694424629211426, - "learning_rate": 1.111111111111111e-06, - "loss": 0.5831396579742432, - "mean_token_accuracy": 0.8056820631027222, - "num_tokens": 409920.0, - "step": 45 - }, - { - "epoch": 0.034954407294832825, - "grad_norm": 2.2949178218841553, - "learning_rate": 1.1363636363636364e-06, - "loss": 0.472550630569458, - "mean_token_accuracy": 0.8343006372451782, - "num_tokens": 428323.0, - "step": 46 - }, - { - "epoch": 0.03571428571428571, - "grad_norm": 3.3930575847625732, - "learning_rate": 1.1616161616161617e-06, - "loss": 0.6246505379676819, - "mean_token_accuracy": 0.783149003982544, - "num_tokens": 435889.0, - "step": 47 - }, - { - "epoch": 0.0364741641337386, - "grad_norm": 3.692598819732666, - "learning_rate": 1.186868686868687e-06, - "loss": 0.46132946014404297, - "mean_token_accuracy": 0.8583089113235474, - "num_tokens": 441192.0, - "step": 48 - }, - { - "epoch": 0.03723404255319149, - "grad_norm": 6.571533203125, - "learning_rate": 1.2121212121212122e-06, - "loss": 0.9351121783256531, - "mean_token_accuracy": 0.7580878734588623, - "num_tokens": 444277.0, - "step": 49 - }, - { - "epoch": 0.037993920972644375, - "grad_norm": 5.029570579528809, - "learning_rate": 1.2373737373737375e-06, - "loss": 0.6921554803848267, - "mean_token_accuracy": 0.8131166100502014, - "num_tokens": 447646.0, - "step": 50 - }, - { - "epoch": 0.03875379939209726, - "grad_norm": 2.9174208641052246, - "learning_rate": 1.2626262626262629e-06, - "loss": 0.591706395149231, - "mean_token_accuracy": 0.8108617067337036, - "num_tokens": 461397.0, - "step": 51 - }, - { - "epoch": 0.03951367781155015, - "grad_norm": 4.315536022186279, - "learning_rate": 1.287878787878788e-06, - "loss": 0.6986310482025146, - "mean_token_accuracy": 0.7710754871368408, - "num_tokens": 472047.0, - "step": 52 - }, - { - "epoch": 0.04027355623100304, - "grad_norm": 2.6216275691986084, - "learning_rate": 1.3131313131313134e-06, - "loss": 0.5553690791130066, - "mean_token_accuracy": 0.8167896866798401, - "num_tokens": 482795.0, - "step": 53 - }, - { - "epoch": 0.041033434650455926, - "grad_norm": 3.0562477111816406, - "learning_rate": 1.3383838383838385e-06, - "loss": 0.6909202337265015, - "mean_token_accuracy": 0.7859863638877869, - "num_tokens": 494818.0, - "step": 54 - }, - { - "epoch": 0.04179331306990881, - "grad_norm": 2.1420412063598633, - "learning_rate": 1.3636363636363636e-06, - "loss": 0.5415265560150146, - "mean_token_accuracy": 0.818886399269104, - "num_tokens": 513695.0, - "step": 55 - }, - { - "epoch": 0.0425531914893617, - "grad_norm": 2.9610488414764404, - "learning_rate": 1.3888888888888892e-06, - "loss": 0.6602212190628052, - "mean_token_accuracy": 0.7830734252929688, - "num_tokens": 523784.0, - "step": 56 - }, - { - "epoch": 0.04331306990881459, - "grad_norm": 2.511972665786743, - "learning_rate": 1.4141414141414143e-06, - "loss": 0.5717809796333313, - "mean_token_accuracy": 0.8053616285324097, - "num_tokens": 546308.0, - "step": 57 - }, - { - "epoch": 0.044072948328267476, - "grad_norm": 3.52642822265625, - "learning_rate": 1.4393939393939396e-06, - "loss": 0.6242594718933105, - "mean_token_accuracy": 0.8162082433700562, - "num_tokens": 552019.0, - "step": 58 - }, - { - "epoch": 0.044832826747720364, - "grad_norm": 3.02362322807312, - "learning_rate": 1.4646464646464648e-06, - "loss": 0.6634255647659302, - "mean_token_accuracy": 0.7682032585144043, - "num_tokens": 560009.0, - "step": 59 - }, - { - "epoch": 0.04559270516717325, - "grad_norm": 2.3910107612609863, - "learning_rate": 1.48989898989899e-06, - "loss": 0.5519146919250488, - "mean_token_accuracy": 0.8270269632339478, - "num_tokens": 571005.0, - "step": 60 - }, - { - "epoch": 0.04635258358662614, - "grad_norm": 4.28154993057251, - "learning_rate": 1.5151515151515152e-06, - "loss": 0.7437789440155029, - "mean_token_accuracy": 0.7782418131828308, - "num_tokens": 574950.0, - "step": 61 - }, - { - "epoch": 0.04711246200607903, - "grad_norm": 3.4078686237335205, - "learning_rate": 1.5404040404040404e-06, - "loss": 0.6345915198326111, - "mean_token_accuracy": 0.7903392314910889, - "num_tokens": 581657.0, - "step": 62 - }, - { - "epoch": 0.047872340425531915, - "grad_norm": 2.6834158897399902, - "learning_rate": 1.565656565656566e-06, - "loss": 0.5981127023696899, - "mean_token_accuracy": 0.7911489605903625, - "num_tokens": 591267.0, - "step": 63 - }, - { - "epoch": 0.0486322188449848, - "grad_norm": 2.1054461002349854, - "learning_rate": 1.590909090909091e-06, - "loss": 0.5523523688316345, - "mean_token_accuracy": 0.8194501399993896, - "num_tokens": 606787.0, - "step": 64 - }, - { - "epoch": 0.04939209726443769, - "grad_norm": 3.322596788406372, - "learning_rate": 1.6161616161616164e-06, - "loss": 0.48417025804519653, - "mean_token_accuracy": 0.8293706178665161, - "num_tokens": 611068.0, - "step": 65 - }, - { - "epoch": 0.05015197568389058, - "grad_norm": 2.302450180053711, - "learning_rate": 1.6414141414141415e-06, - "loss": 0.6498389840126038, - "mean_token_accuracy": 0.7728497385978699, - "num_tokens": 624452.0, - "step": 66 - }, - { - "epoch": 0.050911854103343465, - "grad_norm": 2.680191993713379, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.6347037553787231, - "mean_token_accuracy": 0.8108306527137756, - "num_tokens": 638049.0, - "step": 67 - }, - { - "epoch": 0.05167173252279635, - "grad_norm": 3.0297021865844727, - "learning_rate": 1.6919191919191922e-06, - "loss": 0.5344363451004028, - "mean_token_accuracy": 0.8113535046577454, - "num_tokens": 643892.0, - "step": 68 - }, - { - "epoch": 0.05243161094224924, - "grad_norm": 2.9283676147460938, - "learning_rate": 1.7171717171717173e-06, - "loss": 0.6999260187149048, - "mean_token_accuracy": 0.7782022356987, - "num_tokens": 654418.0, - "step": 69 - }, - { - "epoch": 0.05319148936170213, - "grad_norm": 3.4098572731018066, - "learning_rate": 1.7424242424242427e-06, - "loss": 0.6508946418762207, - "mean_token_accuracy": 0.7942900657653809, - "num_tokens": 659837.0, - "step": 70 - }, - { - "epoch": 0.053951367781155016, - "grad_norm": 2.6756019592285156, - "learning_rate": 1.7676767676767678e-06, - "loss": 0.603486180305481, - "mean_token_accuracy": 0.8015457391738892, - "num_tokens": 668361.0, - "step": 71 - }, - { - "epoch": 0.0547112462006079, - "grad_norm": 2.2630293369293213, - "learning_rate": 1.792929292929293e-06, - "loss": 0.6608274579048157, - "mean_token_accuracy": 0.7753809690475464, - "num_tokens": 679025.0, - "step": 72 - }, - { - "epoch": 0.05547112462006079, - "grad_norm": 2.123962879180908, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.4525482654571533, - "mean_token_accuracy": 0.8425612449645996, - "num_tokens": 688574.0, - "step": 73 - }, - { - "epoch": 0.05623100303951368, - "grad_norm": 7.90519905090332, - "learning_rate": 1.8434343434343434e-06, - "loss": 0.6507195830345154, - "mean_token_accuracy": 0.7714964151382446, - "num_tokens": 694534.0, - "step": 74 - }, - { - "epoch": 0.056990881458966566, - "grad_norm": 2.372203826904297, - "learning_rate": 1.868686868686869e-06, - "loss": 0.4458143413066864, - "mean_token_accuracy": 0.7991449236869812, - "num_tokens": 703114.0, - "step": 75 - }, - { - "epoch": 0.057750759878419454, - "grad_norm": 2.918677568435669, - "learning_rate": 1.8939393939393941e-06, - "loss": 0.5614339113235474, - "mean_token_accuracy": 0.8211464881896973, - "num_tokens": 709038.0, - "step": 76 - }, - { - "epoch": 0.05851063829787234, - "grad_norm": 1.6106709241867065, - "learning_rate": 1.9191919191919192e-06, - "loss": 0.5802098512649536, - "mean_token_accuracy": 0.8055065870285034, - "num_tokens": 730482.0, - "step": 77 - }, - { - "epoch": 0.05927051671732523, - "grad_norm": 2.8069989681243896, - "learning_rate": 1.944444444444445e-06, - "loss": 0.5709059238433838, - "mean_token_accuracy": 0.8024872541427612, - "num_tokens": 751817.0, - "step": 78 - }, - { - "epoch": 0.06003039513677812, - "grad_norm": 2.641667127609253, - "learning_rate": 1.96969696969697e-06, - "loss": 0.6480152606964111, - "mean_token_accuracy": 0.7912271618843079, - "num_tokens": 759236.0, - "step": 79 - }, - { - "epoch": 0.060790273556231005, - "grad_norm": 2.6034350395202637, - "learning_rate": 1.994949494949495e-06, - "loss": 0.5535176396369934, - "mean_token_accuracy": 0.7980542778968811, - "num_tokens": 766496.0, - "step": 80 - }, - { - "epoch": 0.06155015197568389, - "grad_norm": 1.7095069885253906, - "learning_rate": 2.02020202020202e-06, - "loss": 0.4545496106147766, - "mean_token_accuracy": 0.8229660391807556, - "num_tokens": 780124.0, - "step": 81 - }, - { - "epoch": 0.06231003039513678, - "grad_norm": 3.788830518722534, - "learning_rate": 2.0454545454545457e-06, - "loss": 0.6679391264915466, - "mean_token_accuracy": 0.7942397594451904, - "num_tokens": 784555.0, - "step": 82 - }, - { - "epoch": 0.06306990881458967, - "grad_norm": 2.009831666946411, - "learning_rate": 2.070707070707071e-06, - "loss": 0.5067101120948792, - "mean_token_accuracy": 0.8276634216308594, - "num_tokens": 797459.0, - "step": 83 - }, - { - "epoch": 0.06382978723404255, - "grad_norm": 2.201627731323242, - "learning_rate": 2.095959595959596e-06, - "loss": 0.5012127161026001, - "mean_token_accuracy": 0.8432504534721375, - "num_tokens": 810817.0, - "step": 84 - }, - { - "epoch": 0.06458966565349544, - "grad_norm": 2.492568016052246, - "learning_rate": 2.1212121212121216e-06, - "loss": 0.6142797470092773, - "mean_token_accuracy": 0.8338661193847656, - "num_tokens": 818191.0, - "step": 85 - }, - { - "epoch": 0.06534954407294832, - "grad_norm": 2.8360862731933594, - "learning_rate": 2.1464646464646467e-06, - "loss": 0.5569300651550293, - "mean_token_accuracy": 0.8121030330657959, - "num_tokens": 825325.0, - "step": 86 - }, - { - "epoch": 0.06610942249240122, - "grad_norm": 2.407548427581787, - "learning_rate": 2.171717171717172e-06, - "loss": 0.6442930102348328, - "mean_token_accuracy": 0.792514443397522, - "num_tokens": 834439.0, - "step": 87 - }, - { - "epoch": 0.0668693009118541, - "grad_norm": 2.340728759765625, - "learning_rate": 2.196969696969697e-06, - "loss": 0.6494365930557251, - "mean_token_accuracy": 0.7746615409851074, - "num_tokens": 843078.0, - "step": 88 - }, - { - "epoch": 0.067629179331307, - "grad_norm": 1.7703697681427002, - "learning_rate": 2.222222222222222e-06, - "loss": 0.598991870880127, - "mean_token_accuracy": 0.7992157340049744, - "num_tokens": 860171.0, - "step": 89 - }, - { - "epoch": 0.06838905775075987, - "grad_norm": 2.5779271125793457, - "learning_rate": 2.2474747474747476e-06, - "loss": 0.5693082809448242, - "mean_token_accuracy": 0.8093700408935547, - "num_tokens": 866669.0, - "step": 90 - }, - { - "epoch": 0.06914893617021277, - "grad_norm": 2.014092206954956, - "learning_rate": 2.2727272727272728e-06, - "loss": 0.5346695780754089, - "mean_token_accuracy": 0.8165590763092041, - "num_tokens": 876698.0, - "step": 91 - }, - { - "epoch": 0.06990881458966565, - "grad_norm": 1.7555919885635376, - "learning_rate": 2.2979797979797983e-06, - "loss": 0.5321458578109741, - "mean_token_accuracy": 0.8166656494140625, - "num_tokens": 889488.0, - "step": 92 - }, - { - "epoch": 0.07066869300911854, - "grad_norm": 1.8631824254989624, - "learning_rate": 2.3232323232323234e-06, - "loss": 0.5246532559394836, - "mean_token_accuracy": 0.8088107705116272, - "num_tokens": 901322.0, - "step": 93 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 3.2332139015197754, - "learning_rate": 2.348484848484849e-06, - "loss": 0.5141711235046387, - "mean_token_accuracy": 0.8382217884063721, - "num_tokens": 905792.0, - "step": 94 - }, - { - "epoch": 0.07218844984802432, - "grad_norm": 1.7806555032730103, - "learning_rate": 2.373737373737374e-06, - "loss": 0.5233149528503418, - "mean_token_accuracy": 0.8101529479026794, - "num_tokens": 917320.0, - "step": 95 - }, - { - "epoch": 0.0729483282674772, - "grad_norm": 1.8169859647750854, - "learning_rate": 2.3989898989898993e-06, - "loss": 0.578881561756134, - "mean_token_accuracy": 0.8044873476028442, - "num_tokens": 931062.0, - "step": 96 - }, - { - "epoch": 0.0737082066869301, - "grad_norm": 4.677402496337891, - "learning_rate": 2.4242424242424244e-06, - "loss": 0.7842556238174438, - "mean_token_accuracy": 0.7579764127731323, - "num_tokens": 934712.0, - "step": 97 - }, - { - "epoch": 0.07446808510638298, - "grad_norm": 2.6987264156341553, - "learning_rate": 2.4494949494949495e-06, - "loss": 0.5669287443161011, - "mean_token_accuracy": 0.8186933994293213, - "num_tokens": 941058.0, - "step": 98 - }, - { - "epoch": 0.07522796352583587, - "grad_norm": 1.6906023025512695, - "learning_rate": 2.474747474747475e-06, - "loss": 0.4976363778114319, - "mean_token_accuracy": 0.8198553323745728, - "num_tokens": 956509.0, - "step": 99 - }, - { - "epoch": 0.07598784194528875, - "grad_norm": 2.7256152629852295, - "learning_rate": 2.5e-06, - "loss": 0.7138420343399048, - "mean_token_accuracy": 0.7752805948257446, - "num_tokens": 963920.0, - "step": 100 - }, - { - "epoch": 0.07674772036474165, - "grad_norm": 2.174870491027832, - "learning_rate": 2.5252525252525258e-06, - "loss": 0.6733541488647461, - "mean_token_accuracy": 0.7745175361633301, - "num_tokens": 975268.0, - "step": 101 - }, - { - "epoch": 0.07750759878419453, - "grad_norm": 1.5587213039398193, - "learning_rate": 2.5505050505050505e-06, - "loss": 0.44223445653915405, - "mean_token_accuracy": 0.8278359174728394, - "num_tokens": 991837.0, - "step": 102 - }, - { - "epoch": 0.07826747720364742, - "grad_norm": 2.181840658187866, - "learning_rate": 2.575757575757576e-06, - "loss": 0.625128448009491, - "mean_token_accuracy": 0.7941786050796509, - "num_tokens": 1004325.0, - "step": 103 - }, - { - "epoch": 0.0790273556231003, - "grad_norm": 1.4986687898635864, - "learning_rate": 2.601010101010101e-06, - "loss": 0.39262527227401733, - "mean_token_accuracy": 0.8412648439407349, - "num_tokens": 1018331.0, - "step": 104 - }, - { - "epoch": 0.0797872340425532, - "grad_norm": 2.3416061401367188, - "learning_rate": 2.6262626262626267e-06, - "loss": 0.5495132803916931, - "mean_token_accuracy": 0.8193322420120239, - "num_tokens": 1026090.0, - "step": 105 - }, - { - "epoch": 0.08054711246200608, - "grad_norm": 3.8168859481811523, - "learning_rate": 2.6515151515151514e-06, - "loss": 0.4898706376552582, - "mean_token_accuracy": 0.8467956185340881, - "num_tokens": 1029955.0, - "step": 106 - }, - { - "epoch": 0.08130699088145897, - "grad_norm": 4.113908767700195, - "learning_rate": 2.676767676767677e-06, - "loss": 0.6189584732055664, - "mean_token_accuracy": 0.8019394278526306, - "num_tokens": 1033598.0, - "step": 107 - }, - { - "epoch": 0.08206686930091185, - "grad_norm": 2.50003981590271, - "learning_rate": 2.7020202020202025e-06, - "loss": 0.6479471921920776, - "mean_token_accuracy": 0.7790026664733887, - "num_tokens": 1042533.0, - "step": 108 - }, - { - "epoch": 0.08282674772036475, - "grad_norm": 1.408934473991394, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.3909248113632202, - "mean_token_accuracy": 0.8477586507797241, - "num_tokens": 1061755.0, - "step": 109 - }, - { - "epoch": 0.08358662613981763, - "grad_norm": 3.360633611679077, - "learning_rate": 2.7525252525252528e-06, - "loss": 0.6952459812164307, - "mean_token_accuracy": 0.777535080909729, - "num_tokens": 1067316.0, - "step": 110 - }, - { - "epoch": 0.08434650455927052, - "grad_norm": 1.8631696701049805, - "learning_rate": 2.7777777777777783e-06, - "loss": 0.5420593023300171, - "mean_token_accuracy": 0.8157662749290466, - "num_tokens": 1079930.0, - "step": 111 - }, - { - "epoch": 0.0851063829787234, - "grad_norm": 2.4308314323425293, - "learning_rate": 2.803030303030303e-06, - "loss": 0.5863882303237915, - "mean_token_accuracy": 0.8206346035003662, - "num_tokens": 1088069.0, - "step": 112 - }, - { - "epoch": 0.0858662613981763, - "grad_norm": 2.922808885574341, - "learning_rate": 2.8282828282828286e-06, - "loss": 0.5217319130897522, - "mean_token_accuracy": 0.8253234028816223, - "num_tokens": 1093607.0, - "step": 113 - }, - { - "epoch": 0.08662613981762918, - "grad_norm": 2.3596107959747314, - "learning_rate": 2.8535353535353537e-06, - "loss": 0.5070714950561523, - "mean_token_accuracy": 0.8258323669433594, - "num_tokens": 1100405.0, - "step": 114 - }, - { - "epoch": 0.08738601823708207, - "grad_norm": 3.0853066444396973, - "learning_rate": 2.8787878787878793e-06, - "loss": 0.591964840888977, - "mean_token_accuracy": 0.8047322630882263, - "num_tokens": 1107535.0, - "step": 115 - }, - { - "epoch": 0.08814589665653495, - "grad_norm": 1.9251092672348022, - "learning_rate": 2.904040404040404e-06, - "loss": 0.5226191878318787, - "mean_token_accuracy": 0.8022720217704773, - "num_tokens": 1118716.0, - "step": 116 - }, - { - "epoch": 0.08890577507598785, - "grad_norm": 1.9692988395690918, - "learning_rate": 2.9292929292929295e-06, - "loss": 0.5462069511413574, - "mean_token_accuracy": 0.8157015442848206, - "num_tokens": 1131917.0, - "step": 117 - }, - { - "epoch": 0.08966565349544073, - "grad_norm": 1.4738909006118774, - "learning_rate": 2.954545454545455e-06, - "loss": 0.4564219117164612, - "mean_token_accuracy": 0.849632978439331, - "num_tokens": 1148534.0, - "step": 118 - }, - { - "epoch": 0.09042553191489362, - "grad_norm": 2.72646164894104, - "learning_rate": 2.97979797979798e-06, - "loss": 0.6654808521270752, - "mean_token_accuracy": 0.7752684354782104, - "num_tokens": 1155438.0, - "step": 119 - }, - { - "epoch": 0.0911854103343465, - "grad_norm": 2.7843852043151855, - "learning_rate": 3.0050505050505054e-06, - "loss": 0.5354680418968201, - "mean_token_accuracy": 0.8196378946304321, - "num_tokens": 1161815.0, - "step": 120 - }, - { - "epoch": 0.0919452887537994, - "grad_norm": 2.8052573204040527, - "learning_rate": 3.0303030303030305e-06, - "loss": 0.6366757154464722, - "mean_token_accuracy": 0.7967483997344971, - "num_tokens": 1168295.0, - "step": 121 - }, - { - "epoch": 0.09270516717325228, - "grad_norm": 2.7462735176086426, - "learning_rate": 3.055555555555556e-06, - "loss": 0.59470534324646, - "mean_token_accuracy": 0.8023771047592163, - "num_tokens": 1174502.0, - "step": 122 - }, - { - "epoch": 0.09346504559270517, - "grad_norm": 2.2743821144104004, - "learning_rate": 3.0808080808080807e-06, - "loss": 0.5720560550689697, - "mean_token_accuracy": 0.8162771463394165, - "num_tokens": 1183615.0, - "step": 123 - }, - { - "epoch": 0.09422492401215805, - "grad_norm": 1.8669533729553223, - "learning_rate": 3.1060606060606063e-06, - "loss": 0.4655378758907318, - "mean_token_accuracy": 0.8360732793807983, - "num_tokens": 1193761.0, - "step": 124 - }, - { - "epoch": 0.09498480243161095, - "grad_norm": 1.7666901350021362, - "learning_rate": 3.131313131313132e-06, - "loss": 0.5524153709411621, - "mean_token_accuracy": 0.8252713680267334, - "num_tokens": 1207870.0, - "step": 125 - }, - { - "epoch": 0.09574468085106383, - "grad_norm": 2.4720070362091064, - "learning_rate": 3.1565656565656566e-06, - "loss": 0.5003011226654053, - "mean_token_accuracy": 0.8491042852401733, - "num_tokens": 1214603.0, - "step": 126 - }, - { - "epoch": 0.09650455927051672, - "grad_norm": 1.6500422954559326, - "learning_rate": 3.181818181818182e-06, - "loss": 0.5137069225311279, - "mean_token_accuracy": 0.8273531198501587, - "num_tokens": 1228717.0, - "step": 127 - }, - { - "epoch": 0.0972644376899696, - "grad_norm": 3.402543067932129, - "learning_rate": 3.2070707070707072e-06, - "loss": 0.708167552947998, - "mean_token_accuracy": 0.7705385684967041, - "num_tokens": 1234361.0, - "step": 128 - }, - { - "epoch": 0.0980243161094225, - "grad_norm": 2.547285795211792, - "learning_rate": 3.232323232323233e-06, - "loss": 0.6020137071609497, - "mean_token_accuracy": 0.7981340289115906, - "num_tokens": 1244169.0, - "step": 129 - }, - { - "epoch": 0.09878419452887538, - "grad_norm": 2.0578792095184326, - "learning_rate": 3.257575757575758e-06, - "loss": 0.4425000250339508, - "mean_token_accuracy": 0.8567807674407959, - "num_tokens": 1252709.0, - "step": 130 - }, - { - "epoch": 0.09954407294832827, - "grad_norm": 1.672614336013794, - "learning_rate": 3.282828282828283e-06, - "loss": 0.4860966205596924, - "mean_token_accuracy": 0.8393139243125916, - "num_tokens": 1265766.0, - "step": 131 - }, - { - "epoch": 0.10030395136778116, - "grad_norm": 3.2560198307037354, - "learning_rate": 3.3080808080808086e-06, - "loss": 0.624736487865448, - "mean_token_accuracy": 0.7875322699546814, - "num_tokens": 1270779.0, - "step": 132 - }, - { - "epoch": 0.10106382978723404, - "grad_norm": 2.4468185901641846, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.5062227249145508, - "mean_token_accuracy": 0.8217229843139648, - "num_tokens": 1277113.0, - "step": 133 - }, - { - "epoch": 0.10182370820668693, - "grad_norm": 2.6371328830718994, - "learning_rate": 3.358585858585859e-06, - "loss": 0.477113276720047, - "mean_token_accuracy": 0.8605583906173706, - "num_tokens": 1282514.0, - "step": 134 - }, - { - "epoch": 0.10258358662613981, - "grad_norm": 2.48421311378479, - "learning_rate": 3.3838383838383844e-06, - "loss": 0.40855684876441956, - "mean_token_accuracy": 0.864548921585083, - "num_tokens": 1287859.0, - "step": 135 - }, - { - "epoch": 0.1033434650455927, - "grad_norm": 1.993099331855774, - "learning_rate": 3.409090909090909e-06, - "loss": 0.5913145542144775, - "mean_token_accuracy": 0.8248485922813416, - "num_tokens": 1301074.0, - "step": 136 - }, - { - "epoch": 0.10410334346504559, - "grad_norm": 3.5947680473327637, - "learning_rate": 3.4343434343434347e-06, - "loss": 0.5028599500656128, - "mean_token_accuracy": 0.8367215394973755, - "num_tokens": 1305219.0, - "step": 137 - }, - { - "epoch": 0.10486322188449848, - "grad_norm": 2.5778582096099854, - "learning_rate": 3.45959595959596e-06, - "loss": 0.5297672748565674, - "mean_token_accuracy": 0.8232187032699585, - "num_tokens": 1312482.0, - "step": 138 - }, - { - "epoch": 0.10562310030395136, - "grad_norm": 1.8961588144302368, - "learning_rate": 3.4848484848484854e-06, - "loss": 0.39954107999801636, - "mean_token_accuracy": 0.8605833053588867, - "num_tokens": 1323404.0, - "step": 139 - }, - { - "epoch": 0.10638297872340426, - "grad_norm": 1.9687960147857666, - "learning_rate": 3.51010101010101e-06, - "loss": 0.48791587352752686, - "mean_token_accuracy": 0.8200347423553467, - "num_tokens": 1333027.0, - "step": 140 - }, - { - "epoch": 0.10714285714285714, - "grad_norm": 2.520242691040039, - "learning_rate": 3.5353535353535356e-06, - "loss": 0.6106002330780029, - "mean_token_accuracy": 0.790692150592804, - "num_tokens": 1340999.0, - "step": 141 - }, - { - "epoch": 0.10790273556231003, - "grad_norm": 3.751617431640625, - "learning_rate": 3.560606060606061e-06, - "loss": 0.48141729831695557, - "mean_token_accuracy": 0.8421382904052734, - "num_tokens": 1344687.0, - "step": 142 - }, - { - "epoch": 0.10866261398176291, - "grad_norm": 2.7101709842681885, - "learning_rate": 3.585858585858586e-06, - "loss": 0.5375241637229919, - "mean_token_accuracy": 0.8061438202857971, - "num_tokens": 1350192.0, - "step": 143 - }, - { - "epoch": 0.1094224924012158, - "grad_norm": 2.583484411239624, - "learning_rate": 3.6111111111111115e-06, - "loss": 0.6492470502853394, - "mean_token_accuracy": 0.7863001823425293, - "num_tokens": 1358148.0, - "step": 144 - }, - { - "epoch": 0.11018237082066869, - "grad_norm": 1.792561650276184, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.48480600118637085, - "mean_token_accuracy": 0.8358709812164307, - "num_tokens": 1369519.0, - "step": 145 - }, - { - "epoch": 0.11094224924012158, - "grad_norm": 2.6480472087860107, - "learning_rate": 3.661616161616162e-06, - "loss": 0.5268933176994324, - "mean_token_accuracy": 0.8214013576507568, - "num_tokens": 1375862.0, - "step": 146 - }, - { - "epoch": 0.11170212765957446, - "grad_norm": 2.3174469470977783, - "learning_rate": 3.686868686868687e-06, - "loss": 0.42517897486686707, - "mean_token_accuracy": 0.8523461222648621, - "num_tokens": 1381546.0, - "step": 147 - }, - { - "epoch": 0.11246200607902736, - "grad_norm": 3.0090949535369873, - "learning_rate": 3.7121212121212124e-06, - "loss": 0.4042336940765381, - "mean_token_accuracy": 0.8670448064804077, - "num_tokens": 1385896.0, - "step": 148 - }, - { - "epoch": 0.11322188449848024, - "grad_norm": 2.4928104877471924, - "learning_rate": 3.737373737373738e-06, - "loss": 0.6498878598213196, - "mean_token_accuracy": 0.7967068552970886, - "num_tokens": 1394169.0, - "step": 149 - }, - { - "epoch": 0.11398176291793313, - "grad_norm": 1.5984913110733032, - "learning_rate": 3.7626262626262627e-06, - "loss": 0.546096920967102, - "mean_token_accuracy": 0.8035850524902344, - "num_tokens": 1408785.0, - "step": 150 - }, - { - "epoch": 0.11474164133738601, - "grad_norm": 2.3663532733917236, - "learning_rate": 3.7878787878787882e-06, - "loss": 0.6111721992492676, - "mean_token_accuracy": 0.8015355467796326, - "num_tokens": 1417510.0, - "step": 151 - }, - { - "epoch": 0.11550151975683891, - "grad_norm": 2.518932819366455, - "learning_rate": 3.8131313131313138e-06, - "loss": 0.5274964570999146, - "mean_token_accuracy": 0.8155480623245239, - "num_tokens": 1424186.0, - "step": 152 - }, - { - "epoch": 0.11626139817629179, - "grad_norm": 2.14353609085083, - "learning_rate": 3.8383838383838385e-06, - "loss": 0.5283297896385193, - "mean_token_accuracy": 0.8275758028030396, - "num_tokens": 1432630.0, - "step": 153 - }, - { - "epoch": 0.11702127659574468, - "grad_norm": 1.8243604898452759, - "learning_rate": 3.863636363636364e-06, - "loss": 0.41854870319366455, - "mean_token_accuracy": 0.8222295045852661, - "num_tokens": 1442691.0, - "step": 154 - }, - { - "epoch": 0.11778115501519756, - "grad_norm": 2.088212251663208, - "learning_rate": 3.88888888888889e-06, - "loss": 0.6062943339347839, - "mean_token_accuracy": 0.8009427785873413, - "num_tokens": 1456890.0, - "step": 155 - }, - { - "epoch": 0.11854103343465046, - "grad_norm": 1.3469511270523071, - "learning_rate": 3.914141414141415e-06, - "loss": 0.4390433728694916, - "mean_token_accuracy": 0.8436295986175537, - "num_tokens": 1475349.0, - "step": 156 - }, - { - "epoch": 0.11930091185410334, - "grad_norm": 3.247023105621338, - "learning_rate": 3.93939393939394e-06, - "loss": 0.6490433216094971, - "mean_token_accuracy": 0.8037861585617065, - "num_tokens": 1479952.0, - "step": 157 - }, - { - "epoch": 0.12006079027355623, - "grad_norm": 2.6610445976257324, - "learning_rate": 3.964646464646465e-06, - "loss": 0.6221826076507568, - "mean_token_accuracy": 0.7848749160766602, - "num_tokens": 1487306.0, - "step": 158 - }, - { - "epoch": 0.12082066869300911, - "grad_norm": 2.3060810565948486, - "learning_rate": 3.98989898989899e-06, - "loss": 0.5052388310432434, - "mean_token_accuracy": 0.8281195759773254, - "num_tokens": 1495367.0, - "step": 159 - }, - { - "epoch": 0.12158054711246201, - "grad_norm": 2.504448652267456, - "learning_rate": 4.015151515151515e-06, - "loss": 0.5005477666854858, - "mean_token_accuracy": 0.8408058881759644, - "num_tokens": 1502069.0, - "step": 160 - }, - { - "epoch": 0.12234042553191489, - "grad_norm": 3.993938446044922, - "learning_rate": 4.04040404040404e-06, - "loss": 0.5569638013839722, - "mean_token_accuracy": 0.8095242977142334, - "num_tokens": 1510224.0, - "step": 161 - }, - { - "epoch": 0.12310030395136778, - "grad_norm": 2.2287683486938477, - "learning_rate": 4.065656565656566e-06, - "loss": 0.524042546749115, - "mean_token_accuracy": 0.8102203607559204, - "num_tokens": 1518364.0, - "step": 162 - }, - { - "epoch": 0.12386018237082067, - "grad_norm": 1.9531738758087158, - "learning_rate": 4.0909090909090915e-06, - "loss": 0.45794573426246643, - "mean_token_accuracy": 0.8560376167297363, - "num_tokens": 1528097.0, - "step": 163 - }, - { - "epoch": 0.12462006079027356, - "grad_norm": 1.5841206312179565, - "learning_rate": 4.116161616161617e-06, - "loss": 0.5420972108840942, - "mean_token_accuracy": 0.8092726469039917, - "num_tokens": 1544119.0, - "step": 164 - }, - { - "epoch": 0.12537993920972645, - "grad_norm": 1.7536218166351318, - "learning_rate": 4.141414141414142e-06, - "loss": 0.554668664932251, - "mean_token_accuracy": 0.8193825483322144, - "num_tokens": 1559140.0, - "step": 165 - }, - { - "epoch": 0.12613981762917933, - "grad_norm": 3.545454740524292, - "learning_rate": 4.166666666666667e-06, - "loss": 0.580947995185852, - "mean_token_accuracy": 0.8286383152008057, - "num_tokens": 1563625.0, - "step": 166 - }, - { - "epoch": 0.12689969604863222, - "grad_norm": 1.6608915328979492, - "learning_rate": 4.191919191919192e-06, - "loss": 0.5523324012756348, - "mean_token_accuracy": 0.8155215978622437, - "num_tokens": 1574945.0, - "step": 167 - }, - { - "epoch": 0.1276595744680851, - "grad_norm": 1.4832708835601807, - "learning_rate": 4.217171717171717e-06, - "loss": 0.5133191347122192, - "mean_token_accuracy": 0.8367571830749512, - "num_tokens": 1595865.0, - "step": 168 - }, - { - "epoch": 0.128419452887538, - "grad_norm": 1.7807520627975464, - "learning_rate": 4.242424242424243e-06, - "loss": 0.5131410360336304, - "mean_token_accuracy": 0.8129367232322693, - "num_tokens": 1608723.0, - "step": 169 - }, - { - "epoch": 0.12917933130699089, - "grad_norm": 2.707569122314453, - "learning_rate": 4.267676767676767e-06, - "loss": 0.6129013299942017, - "mean_token_accuracy": 0.7926048040390015, - "num_tokens": 1616136.0, - "step": 170 - }, - { - "epoch": 0.12993920972644377, - "grad_norm": 2.5831644535064697, - "learning_rate": 4.292929292929293e-06, - "loss": 0.6264227628707886, - "mean_token_accuracy": 0.8074911236763, - "num_tokens": 1624228.0, - "step": 171 - }, - { - "epoch": 0.13069908814589665, - "grad_norm": 3.1124250888824463, - "learning_rate": 4.3181818181818185e-06, - "loss": 0.41763827204704285, - "mean_token_accuracy": 0.8565453290939331, - "num_tokens": 1628098.0, - "step": 172 - }, - { - "epoch": 0.13145896656534956, - "grad_norm": 2.3214211463928223, - "learning_rate": 4.343434343434344e-06, - "loss": 0.421974778175354, - "mean_token_accuracy": 0.8391546010971069, - "num_tokens": 1634950.0, - "step": 173 - }, - { - "epoch": 0.13221884498480244, - "grad_norm": 2.1010327339172363, - "learning_rate": 4.368686868686869e-06, - "loss": 0.5307331681251526, - "mean_token_accuracy": 0.8139588236808777, - "num_tokens": 1644132.0, - "step": 174 - }, - { - "epoch": 0.13297872340425532, - "grad_norm": 2.533612012863159, - "learning_rate": 4.393939393939394e-06, - "loss": 0.5626664161682129, - "mean_token_accuracy": 0.8029808402061462, - "num_tokens": 1651637.0, - "step": 175 - }, - { - "epoch": 0.1337386018237082, - "grad_norm": 1.669508457183838, - "learning_rate": 4.41919191919192e-06, - "loss": 0.5351508259773254, - "mean_token_accuracy": 0.8281655311584473, - "num_tokens": 1666776.0, - "step": 176 - }, - { - "epoch": 0.1344984802431611, - "grad_norm": 1.7579659223556519, - "learning_rate": 4.444444444444444e-06, - "loss": 0.5235031247138977, - "mean_token_accuracy": 0.8143284320831299, - "num_tokens": 1679241.0, - "step": 177 - }, - { - "epoch": 0.135258358662614, - "grad_norm": 3.123563528060913, - "learning_rate": 4.46969696969697e-06, - "loss": 0.43051332235336304, - "mean_token_accuracy": 0.8518186211585999, - "num_tokens": 1683317.0, - "step": 178 - }, - { - "epoch": 0.13601823708206687, - "grad_norm": 2.2411575317382812, - "learning_rate": 4.494949494949495e-06, - "loss": 0.5471380949020386, - "mean_token_accuracy": 0.8267596960067749, - "num_tokens": 1691366.0, - "step": 179 - }, - { - "epoch": 0.13677811550151975, - "grad_norm": 2.621973991394043, - "learning_rate": 4.520202020202021e-06, - "loss": 0.5685839653015137, - "mean_token_accuracy": 0.8260642290115356, - "num_tokens": 1698148.0, - "step": 180 - }, - { - "epoch": 0.13753799392097266, - "grad_norm": 2.1553852558135986, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.5703883171081543, - "mean_token_accuracy": 0.8219090700149536, - "num_tokens": 1707225.0, - "step": 181 - }, - { - "epoch": 0.13829787234042554, - "grad_norm": 5.1767897605896, - "learning_rate": 4.5707070707070715e-06, - "loss": 0.32704639434814453, - "mean_token_accuracy": 0.8754568099975586, - "num_tokens": 1712748.0, - "step": 182 - }, - { - "epoch": 0.13905775075987842, - "grad_norm": 2.609168291091919, - "learning_rate": 4.595959595959597e-06, - "loss": 0.5939987301826477, - "mean_token_accuracy": 0.8034975528717041, - "num_tokens": 1719932.0, - "step": 183 - }, - { - "epoch": 0.1398176291793313, - "grad_norm": 2.2059099674224854, - "learning_rate": 4.621212121212122e-06, - "loss": 0.5310720205307007, - "mean_token_accuracy": 0.8177368640899658, - "num_tokens": 1727640.0, - "step": 184 - }, - { - "epoch": 0.1405775075987842, - "grad_norm": 2.6367759704589844, - "learning_rate": 4.646464646464647e-06, - "loss": 0.522086501121521, - "mean_token_accuracy": 0.826233983039856, - "num_tokens": 1733609.0, - "step": 185 - }, - { - "epoch": 0.1413373860182371, - "grad_norm": 3.326732873916626, - "learning_rate": 4.671717171717172e-06, - "loss": 0.4127829074859619, - "mean_token_accuracy": 0.8551101684570312, - "num_tokens": 1737256.0, - "step": 186 - }, - { - "epoch": 0.14209726443768997, - "grad_norm": 1.828412413597107, - "learning_rate": 4.696969696969698e-06, - "loss": 0.5444269180297852, - "mean_token_accuracy": 0.8350818157196045, - "num_tokens": 1750196.0, - "step": 187 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 3.209203004837036, - "learning_rate": 4.722222222222222e-06, - "loss": 0.5087994933128357, - "mean_token_accuracy": 0.8349015712738037, - "num_tokens": 1754836.0, - "step": 188 - }, - { - "epoch": 0.14361702127659576, - "grad_norm": 1.7339166402816772, - "learning_rate": 4.747474747474748e-06, - "loss": 0.5151352286338806, - "mean_token_accuracy": 0.8321266174316406, - "num_tokens": 1766015.0, - "step": 189 - }, - { - "epoch": 0.14437689969604864, - "grad_norm": 2.699068069458008, - "learning_rate": 4.772727272727273e-06, - "loss": 0.4406203031539917, - "mean_token_accuracy": 0.8425000905990601, - "num_tokens": 1771684.0, - "step": 190 - }, - { - "epoch": 0.14513677811550152, - "grad_norm": 2.8117282390594482, - "learning_rate": 4.7979797979797985e-06, - "loss": 0.40428489446640015, - "mean_token_accuracy": 0.8654326796531677, - "num_tokens": 1776301.0, - "step": 191 - }, - { - "epoch": 0.1458966565349544, - "grad_norm": 2.9204647541046143, - "learning_rate": 4.823232323232324e-06, - "loss": 0.4191770553588867, - "mean_token_accuracy": 0.8574687242507935, - "num_tokens": 1781678.0, - "step": 192 - }, - { - "epoch": 0.1466565349544073, - "grad_norm": 2.1648988723754883, - "learning_rate": 4.848484848484849e-06, - "loss": 0.5839012861251831, - "mean_token_accuracy": 0.8053664565086365, - "num_tokens": 1792516.0, - "step": 193 - }, - { - "epoch": 0.1474164133738602, - "grad_norm": 2.3221631050109863, - "learning_rate": 4.873737373737374e-06, - "loss": 0.5037894248962402, - "mean_token_accuracy": 0.8427227139472961, - "num_tokens": 1800192.0, - "step": 194 - }, - { - "epoch": 0.14817629179331307, - "grad_norm": 2.4536430835723877, - "learning_rate": 4.898989898989899e-06, - "loss": 0.42326074838638306, - "mean_token_accuracy": 0.8510633111000061, - "num_tokens": 1806159.0, - "step": 195 - }, - { - "epoch": 0.14893617021276595, - "grad_norm": 2.4875805377960205, - "learning_rate": 4.924242424242425e-06, - "loss": 0.539531409740448, - "mean_token_accuracy": 0.8060250282287598, - "num_tokens": 1813392.0, - "step": 196 - }, - { - "epoch": 0.14969604863221886, - "grad_norm": 2.1664798259735107, - "learning_rate": 4.94949494949495e-06, - "loss": 0.42502015829086304, - "mean_token_accuracy": 0.8503251075744629, - "num_tokens": 1821424.0, - "step": 197 - }, - { - "epoch": 0.15045592705167174, - "grad_norm": 2.568808078765869, - "learning_rate": 4.974747474747475e-06, - "loss": 0.5025098323822021, - "mean_token_accuracy": 0.8182311058044434, - "num_tokens": 1827225.0, - "step": 198 - }, - { - "epoch": 0.15121580547112462, - "grad_norm": 1.9116802215576172, - "learning_rate": 5e-06, - "loss": 0.4907258450984955, - "mean_token_accuracy": 0.8310189843177795, - "num_tokens": 1836297.0, - "step": 199 - }, - { - "epoch": 0.1519756838905775, - "grad_norm": 3.150765895843506, - "learning_rate": 4.999999122701883e-06, - "loss": 0.390616774559021, - "mean_token_accuracy": 0.8626647591590881, - "num_tokens": 1839984.0, - "step": 200 - }, - { - "epoch": 0.15273556231003038, - "grad_norm": 3.2229044437408447, - "learning_rate": 4.999996490808146e-06, - "loss": 0.48009657859802246, - "mean_token_accuracy": 0.825214147567749, - "num_tokens": 1844610.0, - "step": 201 - }, - { - "epoch": 0.1534954407294833, - "grad_norm": 1.4473289251327515, - "learning_rate": 4.9999921043206356e-06, - "loss": 0.40135183930397034, - "mean_token_accuracy": 0.8537827730178833, - "num_tokens": 1859573.0, - "step": 202 - }, - { - "epoch": 0.15425531914893617, - "grad_norm": 4.072319507598877, - "learning_rate": 4.999985963242432e-06, - "loss": 0.6158689260482788, - "mean_token_accuracy": 0.8075432777404785, - "num_tokens": 1863147.0, - "step": 203 - }, - { - "epoch": 0.15501519756838905, - "grad_norm": 3.15741229057312, - "learning_rate": 4.999978067577844e-06, - "loss": 0.4603108763694763, - "mean_token_accuracy": 0.8418779373168945, - "num_tokens": 1867201.0, - "step": 204 - }, - { - "epoch": 0.15577507598784193, - "grad_norm": 2.1925418376922607, - "learning_rate": 4.999968417332415e-06, - "loss": 0.5552488565444946, - "mean_token_accuracy": 0.8216016292572021, - "num_tokens": 1874837.0, - "step": 205 - }, - { - "epoch": 0.15653495440729484, - "grad_norm": 2.2518117427825928, - "learning_rate": 4.999957012512916e-06, - "loss": 0.4912569522857666, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 1881842.0, - "step": 206 - }, - { - "epoch": 0.15729483282674772, - "grad_norm": 1.8223762512207031, - "learning_rate": 4.999943853127351e-06, - "loss": 0.47709137201309204, - "mean_token_accuracy": 0.8311659097671509, - "num_tokens": 1890805.0, - "step": 207 - }, - { - "epoch": 0.1580547112462006, - "grad_norm": 2.066499948501587, - "learning_rate": 4.999928939184958e-06, - "loss": 0.44794657826423645, - "mean_token_accuracy": 0.8513424396514893, - "num_tokens": 1898264.0, - "step": 208 - }, - { - "epoch": 0.15881458966565348, - "grad_norm": 3.53865909576416, - "learning_rate": 4.999912270696202e-06, - "loss": 0.5978270769119263, - "mean_token_accuracy": 0.8080137968063354, - "num_tokens": 1902435.0, - "step": 209 - }, - { - "epoch": 0.1595744680851064, - "grad_norm": 2.0760679244995117, - "learning_rate": 4.999893847672783e-06, - "loss": 0.5930601358413696, - "mean_token_accuracy": 0.8028650283813477, - "num_tokens": 1912252.0, - "step": 210 - }, - { - "epoch": 0.16033434650455927, - "grad_norm": 2.21551513671875, - "learning_rate": 4.99987367012763e-06, - "loss": 0.6336753964424133, - "mean_token_accuracy": 0.7902286648750305, - "num_tokens": 1922095.0, - "step": 211 - }, - { - "epoch": 0.16109422492401215, - "grad_norm": 1.7654480934143066, - "learning_rate": 4.999851738074904e-06, - "loss": 0.6373403668403625, - "mean_token_accuracy": 0.7802424430847168, - "num_tokens": 1938962.0, - "step": 212 - }, - { - "epoch": 0.16185410334346503, - "grad_norm": 2.852834701538086, - "learning_rate": 4.9998280515300006e-06, - "loss": 0.6418683528900146, - "mean_token_accuracy": 0.7895716428756714, - "num_tokens": 1944668.0, - "step": 213 - }, - { - "epoch": 0.16261398176291794, - "grad_norm": 3.4737212657928467, - "learning_rate": 4.999802610509541e-06, - "loss": 0.6323273181915283, - "mean_token_accuracy": 0.7982614636421204, - "num_tokens": 1949142.0, - "step": 214 - }, - { - "epoch": 0.16337386018237082, - "grad_norm": 3.0802664756774902, - "learning_rate": 4.999775415031381e-06, - "loss": 0.5929068326950073, - "mean_token_accuracy": 0.8112219572067261, - "num_tokens": 1954141.0, - "step": 215 - }, - { - "epoch": 0.1641337386018237, - "grad_norm": 2.9808855056762695, - "learning_rate": 4.999746465114609e-06, - "loss": 0.5556406378746033, - "mean_token_accuracy": 0.8117628693580627, - "num_tokens": 1959406.0, - "step": 216 - }, - { - "epoch": 0.16489361702127658, - "grad_norm": 1.7346166372299194, - "learning_rate": 4.999715760779541e-06, - "loss": 0.5122925043106079, - "mean_token_accuracy": 0.8040724992752075, - "num_tokens": 1971921.0, - "step": 217 - }, - { - "epoch": 0.1656534954407295, - "grad_norm": 1.4183907508850098, - "learning_rate": 4.999683302047729e-06, - "loss": 0.46471893787384033, - "mean_token_accuracy": 0.8381330966949463, - "num_tokens": 1988863.0, - "step": 218 - }, - { - "epoch": 0.16641337386018237, - "grad_norm": 1.6797802448272705, - "learning_rate": 4.999649088941951e-06, - "loss": 0.38348832726478577, - "mean_token_accuracy": 0.8344278931617737, - "num_tokens": 2000003.0, - "step": 219 - }, - { - "epoch": 0.16717325227963525, - "grad_norm": 3.036963939666748, - "learning_rate": 4.999613121486222e-06, - "loss": 0.6062780618667603, - "mean_token_accuracy": 0.8217900991439819, - "num_tokens": 2004813.0, - "step": 220 - }, - { - "epoch": 0.16793313069908813, - "grad_norm": 2.0343217849731445, - "learning_rate": 4.999575399705782e-06, - "loss": 0.5052450895309448, - "mean_token_accuracy": 0.8368623852729797, - "num_tokens": 2013565.0, - "step": 221 - }, - { - "epoch": 0.16869300911854104, - "grad_norm": 2.1162009239196777, - "learning_rate": 4.9995359236271094e-06, - "loss": 0.5169756412506104, - "mean_token_accuracy": 0.8339958190917969, - "num_tokens": 2025763.0, - "step": 222 - }, - { - "epoch": 0.16945288753799392, - "grad_norm": 2.055333375930786, - "learning_rate": 4.9994946932779076e-06, - "loss": 0.6327048540115356, - "mean_token_accuracy": 0.8078711032867432, - "num_tokens": 2037005.0, - "step": 223 - }, - { - "epoch": 0.1702127659574468, - "grad_norm": 3.334620475769043, - "learning_rate": 4.999451708687114e-06, - "loss": 0.5688358545303345, - "mean_token_accuracy": 0.8015589714050293, - "num_tokens": 2041473.0, - "step": 224 - }, - { - "epoch": 0.17097264437689969, - "grad_norm": 2.3734676837921143, - "learning_rate": 4.999406969884897e-06, - "loss": 0.5673821568489075, - "mean_token_accuracy": 0.8054057359695435, - "num_tokens": 2049397.0, - "step": 225 - }, - { - "epoch": 0.1717325227963526, - "grad_norm": 1.807358980178833, - "learning_rate": 4.999360476902656e-06, - "loss": 0.4376158118247986, - "mean_token_accuracy": 0.8456039428710938, - "num_tokens": 2058721.0, - "step": 226 - }, - { - "epoch": 0.17249240121580547, - "grad_norm": 3.231638193130493, - "learning_rate": 4.999312229773022e-06, - "loss": 0.5592809915542603, - "mean_token_accuracy": 0.8170154094696045, - "num_tokens": 2063455.0, - "step": 227 - }, - { - "epoch": 0.17325227963525835, - "grad_norm": 2.2717151641845703, - "learning_rate": 4.999262228529855e-06, - "loss": 0.6144396066665649, - "mean_token_accuracy": 0.7948470115661621, - "num_tokens": 2071686.0, - "step": 228 - }, - { - "epoch": 0.17401215805471124, - "grad_norm": 1.4171342849731445, - "learning_rate": 4.99921047320825e-06, - "loss": 0.43680912256240845, - "mean_token_accuracy": 0.84850013256073, - "num_tokens": 2086999.0, - "step": 229 - }, - { - "epoch": 0.17477203647416414, - "grad_norm": 3.162736654281616, - "learning_rate": 4.99915696384453e-06, - "loss": 0.6025407910346985, - "mean_token_accuracy": 0.8042335510253906, - "num_tokens": 2092001.0, - "step": 230 - }, - { - "epoch": 0.17553191489361702, - "grad_norm": 1.8672804832458496, - "learning_rate": 4.99910170047625e-06, - "loss": 0.5843087434768677, - "mean_token_accuracy": 0.8016980886459351, - "num_tokens": 2103372.0, - "step": 231 - }, - { - "epoch": 0.1762917933130699, - "grad_norm": 2.967587471008301, - "learning_rate": 4.999044683142196e-06, - "loss": 0.5123642086982727, - "mean_token_accuracy": 0.8216149806976318, - "num_tokens": 2108008.0, - "step": 232 - }, - { - "epoch": 0.1770516717325228, - "grad_norm": 1.9651981592178345, - "learning_rate": 4.998985911882383e-06, - "loss": 0.5868178606033325, - "mean_token_accuracy": 0.7904198169708252, - "num_tokens": 2119009.0, - "step": 233 - }, - { - "epoch": 0.1778115501519757, - "grad_norm": 2.7785449028015137, - "learning_rate": 4.998925386738063e-06, - "loss": 0.5075510144233704, - "mean_token_accuracy": 0.8280210494995117, - "num_tokens": 2124915.0, - "step": 234 - }, - { - "epoch": 0.17857142857142858, - "grad_norm": 2.957470417022705, - "learning_rate": 4.998863107751711e-06, - "loss": 0.5351958274841309, - "mean_token_accuracy": 0.846825122833252, - "num_tokens": 2129905.0, - "step": 235 - }, - { - "epoch": 0.17933130699088146, - "grad_norm": 3.207671880722046, - "learning_rate": 4.99879907496704e-06, - "loss": 0.6209091544151306, - "mean_token_accuracy": 0.789960503578186, - "num_tokens": 2135027.0, - "step": 236 - }, - { - "epoch": 0.18009118541033434, - "grad_norm": 2.018953800201416, - "learning_rate": 4.998733288428987e-06, - "loss": 0.601510763168335, - "mean_token_accuracy": 0.8136930465698242, - "num_tokens": 2147016.0, - "step": 237 - }, - { - "epoch": 0.18085106382978725, - "grad_norm": 2.437281847000122, - "learning_rate": 4.998665748183727e-06, - "loss": 0.5813639163970947, - "mean_token_accuracy": 0.8116716146469116, - "num_tokens": 2155386.0, - "step": 238 - }, - { - "epoch": 0.18161094224924013, - "grad_norm": 1.5708180665969849, - "learning_rate": 4.998596454278661e-06, - "loss": 0.5252395272254944, - "mean_token_accuracy": 0.8193864822387695, - "num_tokens": 2170295.0, - "step": 239 - }, - { - "epoch": 0.182370820668693, - "grad_norm": 1.9921495914459229, - "learning_rate": 4.998525406762422e-06, - "loss": 0.5335029363632202, - "mean_token_accuracy": 0.8120872974395752, - "num_tokens": 2180012.0, - "step": 240 - }, - { - "epoch": 0.1831306990881459, - "grad_norm": 2.6562681198120117, - "learning_rate": 4.998452605684874e-06, - "loss": 0.48021435737609863, - "mean_token_accuracy": 0.8388714790344238, - "num_tokens": 2185607.0, - "step": 241 - }, - { - "epoch": 0.1838905775075988, - "grad_norm": 2.2535853385925293, - "learning_rate": 4.998378051097111e-06, - "loss": 0.5747300386428833, - "mean_token_accuracy": 0.8004639148712158, - "num_tokens": 2194105.0, - "step": 242 - }, - { - "epoch": 0.18465045592705168, - "grad_norm": 1.6151788234710693, - "learning_rate": 4.998301743051459e-06, - "loss": 0.6190565824508667, - "mean_token_accuracy": 0.7816627621650696, - "num_tokens": 2210629.0, - "step": 243 - }, - { - "epoch": 0.18541033434650456, - "grad_norm": 2.1088173389434814, - "learning_rate": 4.9982236816014735e-06, - "loss": 0.4715560972690582, - "mean_token_accuracy": 0.8485721349716187, - "num_tokens": 2218958.0, - "step": 244 - }, - { - "epoch": 0.18617021276595744, - "grad_norm": 2.6168735027313232, - "learning_rate": 4.998143866801941e-06, - "loss": 0.6077103018760681, - "mean_token_accuracy": 0.8057924509048462, - "num_tokens": 2226368.0, - "step": 245 - }, - { - "epoch": 0.18693009118541035, - "grad_norm": 2.5988616943359375, - "learning_rate": 4.99806229870888e-06, - "loss": 0.5021637678146362, - "mean_token_accuracy": 0.8361666202545166, - "num_tokens": 2232485.0, - "step": 246 - }, - { - "epoch": 0.18768996960486323, - "grad_norm": 2.015887498855591, - "learning_rate": 4.9979789773795365e-06, - "loss": 0.4309737980365753, - "mean_token_accuracy": 0.8508044481277466, - "num_tokens": 2240819.0, - "step": 247 - }, - { - "epoch": 0.1884498480243161, - "grad_norm": 2.3115265369415283, - "learning_rate": 4.997893902872389e-06, - "loss": 0.5776500701904297, - "mean_token_accuracy": 0.8079549074172974, - "num_tokens": 2249460.0, - "step": 248 - }, - { - "epoch": 0.189209726443769, - "grad_norm": 1.7387021780014038, - "learning_rate": 4.997807075247147e-06, - "loss": 0.430944561958313, - "mean_token_accuracy": 0.8483544588088989, - "num_tokens": 2259124.0, - "step": 249 - }, - { - "epoch": 0.1899696048632219, - "grad_norm": 1.6378381252288818, - "learning_rate": 4.997718494564747e-06, - "loss": 0.4123363792896271, - "mean_token_accuracy": 0.8557409644126892, - "num_tokens": 2269899.0, - "step": 250 - }, - { - "epoch": 0.19072948328267478, - "grad_norm": 1.336282730102539, - "learning_rate": 4.997628160887361e-06, - "loss": 0.502329409122467, - "mean_token_accuracy": 0.8186938166618347, - "num_tokens": 2292821.0, - "step": 251 - }, - { - "epoch": 0.19148936170212766, - "grad_norm": 3.3335583209991455, - "learning_rate": 4.997536074278388e-06, - "loss": 0.584446907043457, - "mean_token_accuracy": 0.8062717318534851, - "num_tokens": 2297175.0, - "step": 252 - }, - { - "epoch": 0.19224924012158054, - "grad_norm": 2.246727228164673, - "learning_rate": 4.9974422348024565e-06, - "loss": 0.5683060884475708, - "mean_token_accuracy": 0.8193703293800354, - "num_tokens": 2305456.0, - "step": 253 - }, - { - "epoch": 0.19300911854103345, - "grad_norm": 2.3520865440368652, - "learning_rate": 4.997346642525429e-06, - "loss": 0.4724946618080139, - "mean_token_accuracy": 0.8426719307899475, - "num_tokens": 2312241.0, - "step": 254 - }, - { - "epoch": 0.19376899696048633, - "grad_norm": 2.7115702629089355, - "learning_rate": 4.9972492975143936e-06, - "loss": 0.5019032955169678, - "mean_token_accuracy": 0.8253573179244995, - "num_tokens": 2318094.0, - "step": 255 - }, - { - "epoch": 0.1945288753799392, - "grad_norm": 1.705528974533081, - "learning_rate": 4.997150199837671e-06, - "loss": 0.45588475465774536, - "mean_token_accuracy": 0.836666464805603, - "num_tokens": 2329025.0, - "step": 256 - }, - { - "epoch": 0.1952887537993921, - "grad_norm": 2.161400318145752, - "learning_rate": 4.997049349564814e-06, - "loss": 0.5170183777809143, - "mean_token_accuracy": 0.8287534117698669, - "num_tokens": 2337448.0, - "step": 257 - }, - { - "epoch": 0.196048632218845, - "grad_norm": 2.629669189453125, - "learning_rate": 4.996946746766602e-06, - "loss": 0.44650501012802124, - "mean_token_accuracy": 0.850114107131958, - "num_tokens": 2343207.0, - "step": 258 - }, - { - "epoch": 0.19680851063829788, - "grad_norm": 1.6735503673553467, - "learning_rate": 4.996842391515045e-06, - "loss": 0.5247820019721985, - "mean_token_accuracy": 0.8285071849822998, - "num_tokens": 2356801.0, - "step": 259 - }, - { - "epoch": 0.19756838905775076, - "grad_norm": 1.2753115892410278, - "learning_rate": 4.996736283883382e-06, - "loss": 0.41870927810668945, - "mean_token_accuracy": 0.8448047637939453, - "num_tokens": 2377306.0, - "step": 260 - }, - { - "epoch": 0.19832826747720364, - "grad_norm": 2.6947314739227295, - "learning_rate": 4.9966284239460875e-06, - "loss": 0.5059205889701843, - "mean_token_accuracy": 0.8430814743041992, - "num_tokens": 2383352.0, - "step": 261 - }, - { - "epoch": 0.19908814589665655, - "grad_norm": 2.0509963035583496, - "learning_rate": 4.996518811778858e-06, - "loss": 0.4565388560295105, - "mean_token_accuracy": 0.8453130722045898, - "num_tokens": 2391149.0, - "step": 262 - }, - { - "epoch": 0.19984802431610943, - "grad_norm": 2.1856348514556885, - "learning_rate": 4.996407447458626e-06, - "loss": 0.531380832195282, - "mean_token_accuracy": 0.8387004137039185, - "num_tokens": 2399875.0, - "step": 263 - }, - { - "epoch": 0.2006079027355623, - "grad_norm": 2.7348573207855225, - "learning_rate": 4.99629433106355e-06, - "loss": 0.5242817401885986, - "mean_token_accuracy": 0.8177423477172852, - "num_tokens": 2406586.0, - "step": 264 - }, - { - "epoch": 0.2013677811550152, - "grad_norm": 1.76587975025177, - "learning_rate": 4.99617946267302e-06, - "loss": 0.49298471212387085, - "mean_token_accuracy": 0.8271149396896362, - "num_tokens": 2418683.0, - "step": 265 - }, - { - "epoch": 0.20212765957446807, - "grad_norm": 2.8129730224609375, - "learning_rate": 4.996062842367655e-06, - "loss": 0.46420302987098694, - "mean_token_accuracy": 0.8453244566917419, - "num_tokens": 2422929.0, - "step": 266 - }, - { - "epoch": 0.20288753799392098, - "grad_norm": 2.575744152069092, - "learning_rate": 4.9959444702293025e-06, - "loss": 0.43208545446395874, - "mean_token_accuracy": 0.8494843244552612, - "num_tokens": 2429567.0, - "step": 267 - }, - { - "epoch": 0.20364741641337386, - "grad_norm": 2.7586750984191895, - "learning_rate": 4.995824346341041e-06, - "loss": 0.4390473961830139, - "mean_token_accuracy": 0.8348895311355591, - "num_tokens": 2434700.0, - "step": 268 - }, - { - "epoch": 0.20440729483282674, - "grad_norm": 1.972145438194275, - "learning_rate": 4.99570247078718e-06, - "loss": 0.6219544410705566, - "mean_token_accuracy": 0.7939999103546143, - "num_tokens": 2447007.0, - "step": 269 - }, - { - "epoch": 0.20516717325227962, - "grad_norm": 2.2963485717773438, - "learning_rate": 4.995578843653255e-06, - "loss": 0.5008970499038696, - "mean_token_accuracy": 0.8255308866500854, - "num_tokens": 2453936.0, - "step": 270 - }, - { - "epoch": 0.20592705167173253, - "grad_norm": 1.8897721767425537, - "learning_rate": 4.995453465026033e-06, - "loss": 0.5436089038848877, - "mean_token_accuracy": 0.819086492061615, - "num_tokens": 2464494.0, - "step": 271 - }, - { - "epoch": 0.2066869300911854, - "grad_norm": 2.319728374481201, - "learning_rate": 4.995326334993508e-06, - "loss": 0.5136368870735168, - "mean_token_accuracy": 0.820817232131958, - "num_tokens": 2470938.0, - "step": 272 - }, - { - "epoch": 0.2074468085106383, - "grad_norm": 2.230414390563965, - "learning_rate": 4.9951974536449055e-06, - "loss": 0.5272846817970276, - "mean_token_accuracy": 0.8203279972076416, - "num_tokens": 2478629.0, - "step": 273 - }, - { - "epoch": 0.20820668693009117, - "grad_norm": 3.401937484741211, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.4389592111110687, - "mean_token_accuracy": 0.8647899031639099, - "num_tokens": 2482193.0, - "step": 274 - }, - { - "epoch": 0.20896656534954408, - "grad_norm": 2.1278507709503174, - "learning_rate": 4.994934437362513e-06, - "loss": 0.598863422870636, - "mean_token_accuracy": 0.7945119738578796, - "num_tokens": 2492465.0, - "step": 275 - }, - { - "epoch": 0.20972644376899696, - "grad_norm": 1.9259960651397705, - "learning_rate": 4.994800302613318e-06, - "loss": 0.49520939588546753, - "mean_token_accuracy": 0.8371536135673523, - "num_tokens": 2500825.0, - "step": 276 - }, - { - "epoch": 0.21048632218844984, - "grad_norm": 2.346418857574463, - "learning_rate": 4.994664416917236e-06, - "loss": 0.5412614345550537, - "mean_token_accuracy": 0.810661792755127, - "num_tokens": 2509513.0, - "step": 277 - }, - { - "epoch": 0.21124620060790272, - "grad_norm": 1.3092039823532104, - "learning_rate": 4.994526780369636e-06, - "loss": 0.46305379271507263, - "mean_token_accuracy": 0.8358527421951294, - "num_tokens": 2531405.0, - "step": 278 - }, - { - "epoch": 0.21200607902735563, - "grad_norm": 2.924611806869507, - "learning_rate": 4.9943873930671175e-06, - "loss": 0.6134544610977173, - "mean_token_accuracy": 0.7947378754615784, - "num_tokens": 2536744.0, - "step": 279 - }, - { - "epoch": 0.2127659574468085, - "grad_norm": 2.8290598392486572, - "learning_rate": 4.994246255107506e-06, - "loss": 0.465520441532135, - "mean_token_accuracy": 0.8440108299255371, - "num_tokens": 2541184.0, - "step": 280 - }, - { - "epoch": 0.2135258358662614, - "grad_norm": 3.8081259727478027, - "learning_rate": 4.994103366589859e-06, - "loss": 0.43394139409065247, - "mean_token_accuracy": 0.8579148054122925, - "num_tokens": 2545395.0, - "step": 281 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 1.7994529008865356, - "learning_rate": 4.993958727614462e-06, - "loss": 0.5076484680175781, - "mean_token_accuracy": 0.8270803093910217, - "num_tokens": 2556541.0, - "step": 282 - }, - { - "epoch": 0.21504559270516718, - "grad_norm": 2.5582659244537354, - "learning_rate": 4.993812338282826e-06, - "loss": 0.4453684389591217, - "mean_token_accuracy": 0.8488293886184692, - "num_tokens": 2562949.0, - "step": 283 - }, - { - "epoch": 0.21580547112462006, - "grad_norm": 1.6448938846588135, - "learning_rate": 4.993664198697694e-06, - "loss": 0.461971640586853, - "mean_token_accuracy": 0.824763298034668, - "num_tokens": 2576407.0, - "step": 284 - }, - { - "epoch": 0.21656534954407294, - "grad_norm": 2.1264469623565674, - "learning_rate": 4.993514308963037e-06, - "loss": 0.6241602897644043, - "mean_token_accuracy": 0.7916014790534973, - "num_tokens": 2585695.0, - "step": 285 - }, - { - "epoch": 0.21732522796352582, - "grad_norm": 3.629991292953491, - "learning_rate": 4.993362669184051e-06, - "loss": 0.610355019569397, - "mean_token_accuracy": 0.7847568988800049, - "num_tokens": 2589778.0, - "step": 286 - }, - { - "epoch": 0.21808510638297873, - "grad_norm": 1.9070756435394287, - "learning_rate": 4.993209279467164e-06, - "loss": 0.5513623952865601, - "mean_token_accuracy": 0.7911607027053833, - "num_tokens": 2600920.0, - "step": 287 - }, - { - "epoch": 0.2188449848024316, - "grad_norm": 1.761062741279602, - "learning_rate": 4.993054139920031e-06, - "loss": 0.4579957127571106, - "mean_token_accuracy": 0.8189530372619629, - "num_tokens": 2611856.0, - "step": 288 - }, - { - "epoch": 0.2196048632218845, - "grad_norm": 1.7264713048934937, - "learning_rate": 4.992897250651535e-06, - "loss": 0.5871305465698242, - "mean_token_accuracy": 0.7918527126312256, - "num_tokens": 2624730.0, - "step": 289 - }, - { - "epoch": 0.22036474164133737, - "grad_norm": 1.7455977201461792, - "learning_rate": 4.992738611771787e-06, - "loss": 0.5475119948387146, - "mean_token_accuracy": 0.8226917386054993, - "num_tokens": 2635705.0, - "step": 290 - }, - { - "epoch": 0.22112462006079028, - "grad_norm": 2.095095157623291, - "learning_rate": 4.992578223392124e-06, - "loss": 0.5952225923538208, - "mean_token_accuracy": 0.8078469038009644, - "num_tokens": 2643954.0, - "step": 291 - }, - { - "epoch": 0.22188449848024316, - "grad_norm": 2.994664192199707, - "learning_rate": 4.992416085625115e-06, - "loss": 0.5432442426681519, - "mean_token_accuracy": 0.8329008221626282, - "num_tokens": 2648800.0, - "step": 292 - }, - { - "epoch": 0.22264437689969604, - "grad_norm": 2.796790361404419, - "learning_rate": 4.992252198584554e-06, - "loss": 0.5168961882591248, - "mean_token_accuracy": 0.8393474817276001, - "num_tokens": 2653546.0, - "step": 293 - }, - { - "epoch": 0.22340425531914893, - "grad_norm": 1.8610522747039795, - "learning_rate": 4.992086562385462e-06, - "loss": 0.5728024244308472, - "mean_token_accuracy": 0.797406792640686, - "num_tokens": 2667483.0, - "step": 294 - }, - { - "epoch": 0.22416413373860183, - "grad_norm": 1.695472002029419, - "learning_rate": 4.9919191771440905e-06, - "loss": 0.5460028648376465, - "mean_token_accuracy": 0.8123016357421875, - "num_tokens": 2683574.0, - "step": 295 - }, - { - "epoch": 0.22492401215805471, - "grad_norm": 2.8627376556396484, - "learning_rate": 4.9917500429779165e-06, - "loss": 0.5566985011100769, - "mean_token_accuracy": 0.815531313419342, - "num_tokens": 2688985.0, - "step": 296 - }, - { - "epoch": 0.2256838905775076, - "grad_norm": 2.73323655128479, - "learning_rate": 4.991579160005644e-06, - "loss": 0.48197102546691895, - "mean_token_accuracy": 0.8471829295158386, - "num_tokens": 2694799.0, - "step": 297 - }, - { - "epoch": 0.22644376899696048, - "grad_norm": 1.8436161279678345, - "learning_rate": 4.991406528347206e-06, - "loss": 0.4528339207172394, - "mean_token_accuracy": 0.8603188395500183, - "num_tokens": 2707321.0, - "step": 298 - }, - { - "epoch": 0.22720364741641338, - "grad_norm": 2.6231515407562256, - "learning_rate": 4.9912321481237616e-06, - "loss": 0.5916541814804077, - "mean_token_accuracy": 0.8050242066383362, - "num_tokens": 2714233.0, - "step": 299 - }, - { - "epoch": 0.22796352583586627, - "grad_norm": 3.08776593208313, - "learning_rate": 4.991056019457697e-06, - "loss": 0.4860580563545227, - "mean_token_accuracy": 0.8464088439941406, - "num_tokens": 2718443.0, - "step": 300 - }, - { - "epoch": 0.22872340425531915, - "grad_norm": 2.2537803649902344, - "learning_rate": 4.990878142472628e-06, - "loss": 0.5158311128616333, - "mean_token_accuracy": 0.824694812297821, - "num_tokens": 2726158.0, - "step": 301 - }, - { - "epoch": 0.22948328267477203, - "grad_norm": 2.1122705936431885, - "learning_rate": 4.990698517293394e-06, - "loss": 0.495265394449234, - "mean_token_accuracy": 0.8343238830566406, - "num_tokens": 2735022.0, - "step": 302 - }, - { - "epoch": 0.23024316109422494, - "grad_norm": 3.5503528118133545, - "learning_rate": 4.9905171440460645e-06, - "loss": 0.46063232421875, - "mean_token_accuracy": 0.8420047760009766, - "num_tokens": 2738550.0, - "step": 303 - }, - { - "epoch": 0.23100303951367782, - "grad_norm": 3.9858486652374268, - "learning_rate": 4.990334022857932e-06, - "loss": 0.5832710266113281, - "mean_token_accuracy": 0.8144199848175049, - "num_tokens": 2741720.0, - "step": 304 - }, - { - "epoch": 0.2317629179331307, - "grad_norm": 2.407231330871582, - "learning_rate": 4.990149153857519e-06, - "loss": 0.4692630171775818, - "mean_token_accuracy": 0.8429223299026489, - "num_tokens": 2748693.0, - "step": 305 - }, - { - "epoch": 0.23252279635258358, - "grad_norm": 1.6996397972106934, - "learning_rate": 4.989962537174573e-06, - "loss": 0.49143946170806885, - "mean_token_accuracy": 0.8340128064155579, - "num_tokens": 2761254.0, - "step": 306 - }, - { - "epoch": 0.23328267477203649, - "grad_norm": 3.746432065963745, - "learning_rate": 4.989774172940071e-06, - "loss": 0.6282026767730713, - "mean_token_accuracy": 0.775698184967041, - "num_tokens": 2765115.0, - "step": 307 - }, - { - "epoch": 0.23404255319148937, - "grad_norm": 2.212872266769409, - "learning_rate": 4.989584061286211e-06, - "loss": 0.5193763971328735, - "mean_token_accuracy": 0.8168246746063232, - "num_tokens": 2772345.0, - "step": 308 - }, - { - "epoch": 0.23480243161094225, - "grad_norm": 1.752297282218933, - "learning_rate": 4.989392202346423e-06, - "loss": 0.4437984824180603, - "mean_token_accuracy": 0.8451256155967712, - "num_tokens": 2783072.0, - "step": 309 - }, - { - "epoch": 0.23556231003039513, - "grad_norm": 2.386019706726074, - "learning_rate": 4.989198596255361e-06, - "loss": 0.4090752899646759, - "mean_token_accuracy": 0.8480085134506226, - "num_tokens": 2788757.0, - "step": 310 - }, - { - "epoch": 0.23632218844984804, - "grad_norm": 3.9981489181518555, - "learning_rate": 4.989003243148904e-06, - "loss": 0.5149132013320923, - "mean_token_accuracy": 0.8179056644439697, - "num_tokens": 2792096.0, - "step": 311 - }, - { - "epoch": 0.23708206686930092, - "grad_norm": 1.8723100423812866, - "learning_rate": 4.988806143164159e-06, - "loss": 0.4531487822532654, - "mean_token_accuracy": 0.8400167226791382, - "num_tokens": 2802210.0, - "step": 312 - }, - { - "epoch": 0.2378419452887538, - "grad_norm": 2.3415136337280273, - "learning_rate": 4.988607296439459e-06, - "loss": 0.5974439978599548, - "mean_token_accuracy": 0.8035976886749268, - "num_tokens": 2810088.0, - "step": 313 - }, - { - "epoch": 0.23860182370820668, - "grad_norm": 1.5317577123641968, - "learning_rate": 4.98840670311436e-06, - "loss": 0.49247145652770996, - "mean_token_accuracy": 0.8292540311813354, - "num_tokens": 2824005.0, - "step": 314 - }, - { - "epoch": 0.2393617021276596, - "grad_norm": 2.170772075653076, - "learning_rate": 4.988204363329648e-06, - "loss": 0.6359974145889282, - "mean_token_accuracy": 0.7785564661026001, - "num_tokens": 2834680.0, - "step": 315 - }, - { - "epoch": 0.24012158054711247, - "grad_norm": 3.2655932903289795, - "learning_rate": 4.988000277227334e-06, - "loss": 0.5080196857452393, - "mean_token_accuracy": 0.8295877575874329, - "num_tokens": 2838735.0, - "step": 316 - }, - { - "epoch": 0.24088145896656535, - "grad_norm": 3.406589984893799, - "learning_rate": 4.987794444950651e-06, - "loss": 0.3939085006713867, - "mean_token_accuracy": 0.8700719475746155, - "num_tokens": 2842127.0, - "step": 317 - }, - { - "epoch": 0.24164133738601823, - "grad_norm": 1.8211106061935425, - "learning_rate": 4.987586866644061e-06, - "loss": 0.5270540118217468, - "mean_token_accuracy": 0.826683521270752, - "num_tokens": 2853656.0, - "step": 318 - }, - { - "epoch": 0.24240121580547114, - "grad_norm": 1.8429969549179077, - "learning_rate": 4.9873775424532515e-06, - "loss": 0.4705049991607666, - "mean_token_accuracy": 0.8355701565742493, - "num_tokens": 2863513.0, - "step": 319 - }, - { - "epoch": 0.24316109422492402, - "grad_norm": 2.2425320148468018, - "learning_rate": 4.9871664725251314e-06, - "loss": 0.485736608505249, - "mean_token_accuracy": 0.835182785987854, - "num_tokens": 2871556.0, - "step": 320 - }, - { - "epoch": 0.2439209726443769, - "grad_norm": 1.6202056407928467, - "learning_rate": 4.986953657007841e-06, - "loss": 0.4437887370586395, - "mean_token_accuracy": 0.8282591700553894, - "num_tokens": 2884335.0, - "step": 321 - }, - { - "epoch": 0.24468085106382978, - "grad_norm": 1.1027268171310425, - "learning_rate": 4.98673909605074e-06, - "loss": 0.3770800828933716, - "mean_token_accuracy": 0.8325437307357788, - "num_tokens": 2904286.0, - "step": 322 - }, - { - "epoch": 0.2454407294832827, - "grad_norm": 2.3239076137542725, - "learning_rate": 4.986522789804417e-06, - "loss": 0.5387254953384399, - "mean_token_accuracy": 0.806242823600769, - "num_tokens": 2910975.0, - "step": 323 - }, - { - "epoch": 0.24620060790273557, - "grad_norm": 2.243482828140259, - "learning_rate": 4.986304738420684e-06, - "loss": 0.4396553039550781, - "mean_token_accuracy": 0.8561904430389404, - "num_tokens": 2917087.0, - "step": 324 - }, - { - "epoch": 0.24696048632218845, - "grad_norm": 2.537264347076416, - "learning_rate": 4.986084942052577e-06, - "loss": 0.395110160112381, - "mean_token_accuracy": 0.8636915683746338, - "num_tokens": 2921887.0, - "step": 325 - }, - { - "epoch": 0.24772036474164133, - "grad_norm": 2.319399118423462, - "learning_rate": 4.9858634008543574e-06, - "loss": 0.581517219543457, - "mean_token_accuracy": 0.8157487511634827, - "num_tokens": 2928996.0, - "step": 326 - }, - { - "epoch": 0.24848024316109424, - "grad_norm": 1.9787474870681763, - "learning_rate": 4.985640114981513e-06, - "loss": 0.5084106922149658, - "mean_token_accuracy": 0.835221529006958, - "num_tokens": 2940302.0, - "step": 327 - }, - { - "epoch": 0.24924012158054712, - "grad_norm": 2.4783265590667725, - "learning_rate": 4.985415084590752e-06, - "loss": 0.6062222719192505, - "mean_token_accuracy": 0.7885516285896301, - "num_tokens": 2946386.0, - "step": 328 - }, - { - "epoch": 0.25, - "grad_norm": 2.4081411361694336, - "learning_rate": 4.985188309840012e-06, - "loss": 0.5079880356788635, - "mean_token_accuracy": 0.8313904404640198, - "num_tokens": 2952323.0, - "step": 329 - }, - { - "epoch": 0.2507598784194529, - "grad_norm": 2.64993953704834, - "learning_rate": 4.984959790888451e-06, - "loss": 0.5461447834968567, - "mean_token_accuracy": 0.8125468492507935, - "num_tokens": 2958119.0, - "step": 330 - }, - { - "epoch": 0.25151975683890576, - "grad_norm": 2.549734115600586, - "learning_rate": 4.984729527896451e-06, - "loss": 0.5998573303222656, - "mean_token_accuracy": 0.8076666593551636, - "num_tokens": 2964947.0, - "step": 331 - }, - { - "epoch": 0.25227963525835867, - "grad_norm": 3.2185161113739014, - "learning_rate": 4.984497521025622e-06, - "loss": 0.4232945442199707, - "mean_token_accuracy": 0.8543803095817566, - "num_tokens": 2968598.0, - "step": 332 - }, - { - "epoch": 0.2530395136778115, - "grad_norm": 2.588994264602661, - "learning_rate": 4.984263770438793e-06, - "loss": 0.460967481136322, - "mean_token_accuracy": 0.8416207432746887, - "num_tokens": 2974510.0, - "step": 333 - }, - { - "epoch": 0.25379939209726443, - "grad_norm": 2.1373162269592285, - "learning_rate": 4.984028276300021e-06, - "loss": 0.49382102489471436, - "mean_token_accuracy": 0.8388048410415649, - "num_tokens": 2981632.0, - "step": 334 - }, - { - "epoch": 0.25455927051671734, - "grad_norm": 2.2524826526641846, - "learning_rate": 4.983791038774585e-06, - "loss": 0.4947671890258789, - "mean_token_accuracy": 0.8066365122795105, - "num_tokens": 2988736.0, - "step": 335 - }, - { - "epoch": 0.2553191489361702, - "grad_norm": 1.7244199514389038, - "learning_rate": 4.983552058028985e-06, - "loss": 0.48096776008605957, - "mean_token_accuracy": 0.830735445022583, - "num_tokens": 3003576.0, - "step": 336 - }, - { - "epoch": 0.2560790273556231, - "grad_norm": 3.0628933906555176, - "learning_rate": 4.9833113342309495e-06, - "loss": 0.6027032136917114, - "mean_token_accuracy": 0.8008694648742676, - "num_tokens": 3009549.0, - "step": 337 - }, - { - "epoch": 0.256838905775076, - "grad_norm": 2.438674211502075, - "learning_rate": 4.983068867549427e-06, - "loss": 0.517090916633606, - "mean_token_accuracy": 0.827893853187561, - "num_tokens": 3015236.0, - "step": 338 - }, - { - "epoch": 0.25759878419452886, - "grad_norm": 2.131535053253174, - "learning_rate": 4.982824658154589e-06, - "loss": 0.6656812429428101, - "mean_token_accuracy": 0.7772425413131714, - "num_tokens": 3028142.0, - "step": 339 - }, - { - "epoch": 0.25835866261398177, - "grad_norm": 2.3206584453582764, - "learning_rate": 4.9825787062178315e-06, - "loss": 0.5757625699043274, - "mean_token_accuracy": 0.8073873519897461, - "num_tokens": 3040996.0, - "step": 340 - }, - { - "epoch": 0.2591185410334346, - "grad_norm": 1.3905521631240845, - "learning_rate": 4.982331011911774e-06, - "loss": 0.4193805456161499, - "mean_token_accuracy": 0.8399466872215271, - "num_tokens": 3061931.0, - "step": 341 - }, - { - "epoch": 0.25987841945288753, - "grad_norm": 2.184173345565796, - "learning_rate": 4.982081575410256e-06, - "loss": 0.4751223921775818, - "mean_token_accuracy": 0.8409271240234375, - "num_tokens": 3069081.0, - "step": 342 - }, - { - "epoch": 0.26063829787234044, - "grad_norm": 3.538764238357544, - "learning_rate": 4.9818303968883445e-06, - "loss": 0.8119601011276245, - "mean_token_accuracy": 0.7442739009857178, - "num_tokens": 3073628.0, - "step": 343 - }, - { - "epoch": 0.2613981762917933, - "grad_norm": 1.8063762187957764, - "learning_rate": 4.981577476522323e-06, - "loss": 0.5615730881690979, - "mean_token_accuracy": 0.8207751512527466, - "num_tokens": 3086596.0, - "step": 344 - }, - { - "epoch": 0.2621580547112462, - "grad_norm": 2.4346961975097656, - "learning_rate": 4.981322814489703e-06, - "loss": 0.5266709327697754, - "mean_token_accuracy": 0.8211277723312378, - "num_tokens": 3092631.0, - "step": 345 - }, - { - "epoch": 0.2629179331306991, - "grad_norm": 1.91289484500885, - "learning_rate": 4.981066410969215e-06, - "loss": 0.5047177672386169, - "mean_token_accuracy": 0.8356877565383911, - "num_tokens": 3101102.0, - "step": 346 - }, - { - "epoch": 0.26367781155015196, - "grad_norm": 2.1495707035064697, - "learning_rate": 4.980808266140813e-06, - "loss": 0.47876280546188354, - "mean_token_accuracy": 0.8364313244819641, - "num_tokens": 3107998.0, - "step": 347 - }, - { - "epoch": 0.26443768996960487, - "grad_norm": 2.5961992740631104, - "learning_rate": 4.9805483801856744e-06, - "loss": 0.5512958765029907, - "mean_token_accuracy": 0.8181467652320862, - "num_tokens": 3113848.0, - "step": 348 - }, - { - "epoch": 0.2651975683890577, - "grad_norm": 3.2828900814056396, - "learning_rate": 4.980286753286196e-06, - "loss": 0.4217945635318756, - "mean_token_accuracy": 0.8617103099822998, - "num_tokens": 3117652.0, - "step": 349 - }, - { - "epoch": 0.26595744680851063, - "grad_norm": 1.425554871559143, - "learning_rate": 4.980023385625996e-06, - "loss": 0.4042487144470215, - "mean_token_accuracy": 0.8492785692214966, - "num_tokens": 3132336.0, - "step": 350 - }, - { - "epoch": 0.26671732522796354, - "grad_norm": 2.933504104614258, - "learning_rate": 4.979758277389919e-06, - "loss": 0.5406704545021057, - "mean_token_accuracy": 0.8035423755645752, - "num_tokens": 3137544.0, - "step": 351 - }, - { - "epoch": 0.2674772036474164, - "grad_norm": 1.9958966970443726, - "learning_rate": 4.9794914287640264e-06, - "loss": 0.5857555270195007, - "mean_token_accuracy": 0.7965140342712402, - "num_tokens": 3149705.0, - "step": 352 - }, - { - "epoch": 0.2682370820668693, - "grad_norm": 2.467694044113159, - "learning_rate": 4.979222839935602e-06, - "loss": 0.6404043436050415, - "mean_token_accuracy": 0.7823755741119385, - "num_tokens": 3158353.0, - "step": 353 - }, - { - "epoch": 0.2689969604863222, - "grad_norm": 2.0102720260620117, - "learning_rate": 4.9789525110931545e-06, - "loss": 0.5681496858596802, - "mean_token_accuracy": 0.8108169436454773, - "num_tokens": 3167121.0, - "step": 354 - }, - { - "epoch": 0.26975683890577506, - "grad_norm": 2.6017866134643555, - "learning_rate": 4.978680442426409e-06, - "loss": 0.6309828162193298, - "mean_token_accuracy": 0.7742617130279541, - "num_tokens": 3175012.0, - "step": 355 - }, - { - "epoch": 0.270516717325228, - "grad_norm": 1.8799268007278442, - "learning_rate": 4.978406634126315e-06, - "loss": 0.524029016494751, - "mean_token_accuracy": 0.8317689895629883, - "num_tokens": 3185331.0, - "step": 356 - }, - { - "epoch": 0.2712765957446808, - "grad_norm": 1.508332371711731, - "learning_rate": 4.978131086385041e-06, - "loss": 0.46656402945518494, - "mean_token_accuracy": 0.8339117765426636, - "num_tokens": 3198813.0, - "step": 357 - }, - { - "epoch": 0.27203647416413373, - "grad_norm": 3.595707654953003, - "learning_rate": 4.977853799395976e-06, - "loss": 0.5101234912872314, - "mean_token_accuracy": 0.8251723051071167, - "num_tokens": 3206557.0, - "step": 358 - }, - { - "epoch": 0.27279635258358664, - "grad_norm": 3.5317916870117188, - "learning_rate": 4.977574773353732e-06, - "loss": 0.5684665441513062, - "mean_token_accuracy": 0.8124493360519409, - "num_tokens": 3210912.0, - "step": 359 - }, - { - "epoch": 0.2735562310030395, - "grad_norm": 2.8606204986572266, - "learning_rate": 4.97729400845414e-06, - "loss": 0.4746384620666504, - "mean_token_accuracy": 0.8195606470108032, - "num_tokens": 3215365.0, - "step": 360 - }, - { - "epoch": 0.2743161094224924, - "grad_norm": 1.8214033842086792, - "learning_rate": 4.977011504894253e-06, - "loss": 0.4842769503593445, - "mean_token_accuracy": 0.82928866147995, - "num_tokens": 3224037.0, - "step": 361 - }, - { - "epoch": 0.2750759878419453, - "grad_norm": 1.628746509552002, - "learning_rate": 4.97672726287234e-06, - "loss": 0.4397493302822113, - "mean_token_accuracy": 0.8606528043746948, - "num_tokens": 3235589.0, - "step": 362 - }, - { - "epoch": 0.27583586626139817, - "grad_norm": 3.557973861694336, - "learning_rate": 4.976441282587894e-06, - "loss": 0.5732032060623169, - "mean_token_accuracy": 0.8041545748710632, - "num_tokens": 3239958.0, - "step": 363 - }, - { - "epoch": 0.2765957446808511, - "grad_norm": 1.3467901945114136, - "learning_rate": 4.9761535642416284e-06, - "loss": 0.4525323510169983, - "mean_token_accuracy": 0.8281061053276062, - "num_tokens": 3257703.0, - "step": 364 - }, - { - "epoch": 0.2773556231003039, - "grad_norm": 2.2649986743927, - "learning_rate": 4.9758641080354745e-06, - "loss": 0.5074734687805176, - "mean_token_accuracy": 0.8447474241256714, - "num_tokens": 3264334.0, - "step": 365 - }, - { - "epoch": 0.27811550151975684, - "grad_norm": 2.8667566776275635, - "learning_rate": 4.975572914172581e-06, - "loss": 0.5759559869766235, - "mean_token_accuracy": 0.7976793050765991, - "num_tokens": 3269314.0, - "step": 366 - }, - { - "epoch": 0.27887537993920974, - "grad_norm": 2.2514986991882324, - "learning_rate": 4.975279982857324e-06, - "loss": 0.5786465406417847, - "mean_token_accuracy": 0.8058781623840332, - "num_tokens": 3277324.0, - "step": 367 - }, - { - "epoch": 0.2796352583586626, - "grad_norm": 1.3826723098754883, - "learning_rate": 4.97498531429529e-06, - "loss": 0.40801727771759033, - "mean_token_accuracy": 0.8601310849189758, - "num_tokens": 3290530.0, - "step": 368 - }, - { - "epoch": 0.2803951367781155, - "grad_norm": 2.084092617034912, - "learning_rate": 4.97468890869329e-06, - "loss": 0.47076648473739624, - "mean_token_accuracy": 0.8310186862945557, - "num_tokens": 3298325.0, - "step": 369 - }, - { - "epoch": 0.2811550151975684, - "grad_norm": 1.3467998504638672, - "learning_rate": 4.974390766259353e-06, - "loss": 0.44668465852737427, - "mean_token_accuracy": 0.8275353908538818, - "num_tokens": 3314302.0, - "step": 370 - }, - { - "epoch": 0.28191489361702127, - "grad_norm": 2.5921075344085693, - "learning_rate": 4.974090887202726e-06, - "loss": 0.5343953967094421, - "mean_token_accuracy": 0.8110706806182861, - "num_tokens": 3320963.0, - "step": 371 - }, - { - "epoch": 0.2826747720364742, - "grad_norm": 2.042781352996826, - "learning_rate": 4.973789271733877e-06, - "loss": 0.6293343305587769, - "mean_token_accuracy": 0.7800243496894836, - "num_tokens": 3332742.0, - "step": 372 - }, - { - "epoch": 0.28343465045592703, - "grad_norm": 4.822193145751953, - "learning_rate": 4.973485920064491e-06, - "loss": 0.6256728768348694, - "mean_token_accuracy": 0.7962433099746704, - "num_tokens": 3335872.0, - "step": 373 - }, - { - "epoch": 0.28419452887537994, - "grad_norm": 1.260988473892212, - "learning_rate": 4.973180832407471e-06, - "loss": 0.38731223344802856, - "mean_token_accuracy": 0.8385066986083984, - "num_tokens": 3351884.0, - "step": 374 - }, - { - "epoch": 0.28495440729483285, - "grad_norm": 2.669966697692871, - "learning_rate": 4.97287400897694e-06, - "loss": 0.5594710111618042, - "mean_token_accuracy": 0.8097212314605713, - "num_tokens": 3358197.0, - "step": 375 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 3.0344486236572266, - "learning_rate": 4.972565449988238e-06, - "loss": 0.34449583292007446, - "mean_token_accuracy": 0.8813316822052002, - "num_tokens": 3362133.0, - "step": 376 - }, - { - "epoch": 0.2864741641337386, - "grad_norm": 2.562251091003418, - "learning_rate": 4.972255155657925e-06, - "loss": 0.5331522822380066, - "mean_token_accuracy": 0.8212941288948059, - "num_tokens": 3370346.0, - "step": 377 - }, - { - "epoch": 0.2872340425531915, - "grad_norm": 2.7083740234375, - "learning_rate": 4.9719431262037755e-06, - "loss": 0.5403046011924744, - "mean_token_accuracy": 0.8108335733413696, - "num_tokens": 3375588.0, - "step": 378 - }, - { - "epoch": 0.28799392097264437, - "grad_norm": 1.396430492401123, - "learning_rate": 4.971629361844785e-06, - "loss": 0.4041529893875122, - "mean_token_accuracy": 0.8588063716888428, - "num_tokens": 3390749.0, - "step": 379 - }, - { - "epoch": 0.2887537993920973, - "grad_norm": 1.9872784614562988, - "learning_rate": 4.971313862801166e-06, - "loss": 0.4336993098258972, - "mean_token_accuracy": 0.8511303663253784, - "num_tokens": 3399064.0, - "step": 380 - }, - { - "epoch": 0.28951367781155013, - "grad_norm": 1.9652575254440308, - "learning_rate": 4.9709966292943455e-06, - "loss": 0.4578358232975006, - "mean_token_accuracy": 0.8229440450668335, - "num_tokens": 3407229.0, - "step": 381 - }, - { - "epoch": 0.29027355623100304, - "grad_norm": 1.6626898050308228, - "learning_rate": 4.970677661546972e-06, - "loss": 0.5427594184875488, - "mean_token_accuracy": 0.815427303314209, - "num_tokens": 3422321.0, - "step": 382 - }, - { - "epoch": 0.29103343465045595, - "grad_norm": 3.5265562534332275, - "learning_rate": 4.970356959782909e-06, - "loss": 0.6661460995674133, - "mean_token_accuracy": 0.7856965065002441, - "num_tokens": 3427442.0, - "step": 383 - }, - { - "epoch": 0.2917933130699088, - "grad_norm": 1.667205572128296, - "learning_rate": 4.970034524227239e-06, - "loss": 0.36256325244903564, - "mean_token_accuracy": 0.8711205720901489, - "num_tokens": 3436662.0, - "step": 384 - }, - { - "epoch": 0.2925531914893617, - "grad_norm": 1.3389486074447632, - "learning_rate": 4.969710355106256e-06, - "loss": 0.4282698631286621, - "mean_token_accuracy": 0.838951587677002, - "num_tokens": 3450060.0, - "step": 385 - }, - { - "epoch": 0.2933130699088146, - "grad_norm": 2.5163397789001465, - "learning_rate": 4.969384452647477e-06, - "loss": 0.5176984071731567, - "mean_token_accuracy": 0.8235267996788025, - "num_tokens": 3456990.0, - "step": 386 - }, - { - "epoch": 0.29407294832826747, - "grad_norm": 1.7588495016098022, - "learning_rate": 4.969056817079633e-06, - "loss": 0.49710947275161743, - "mean_token_accuracy": 0.818520724773407, - "num_tokens": 3468098.0, - "step": 387 - }, - { - "epoch": 0.2948328267477204, - "grad_norm": 2.6381046772003174, - "learning_rate": 4.968727448632669e-06, - "loss": 0.4425308108329773, - "mean_token_accuracy": 0.8451643586158752, - "num_tokens": 3472899.0, - "step": 388 - }, - { - "epoch": 0.29559270516717323, - "grad_norm": 1.6345038414001465, - "learning_rate": 4.968396347537751e-06, - "loss": 0.4177059829235077, - "mean_token_accuracy": 0.8498886227607727, - "num_tokens": 3484826.0, - "step": 389 - }, - { - "epoch": 0.29635258358662614, - "grad_norm": 3.0466468334198, - "learning_rate": 4.968063514027258e-06, - "loss": 0.4274463951587677, - "mean_token_accuracy": 0.8387278318405151, - "num_tokens": 3488610.0, - "step": 390 - }, - { - "epoch": 0.29711246200607905, - "grad_norm": 2.6509406566619873, - "learning_rate": 4.967728948334784e-06, - "loss": 0.5401753783226013, - "mean_token_accuracy": 0.8252490162849426, - "num_tokens": 3493657.0, - "step": 391 - }, - { - "epoch": 0.2978723404255319, - "grad_norm": 1.6372219324111938, - "learning_rate": 4.967392650695141e-06, - "loss": 0.3862472176551819, - "mean_token_accuracy": 0.8555525541305542, - "num_tokens": 3505588.0, - "step": 392 - }, - { - "epoch": 0.2986322188449848, - "grad_norm": 2.1615452766418457, - "learning_rate": 4.967054621344356e-06, - "loss": 0.57850581407547, - "mean_token_accuracy": 0.8222678899765015, - "num_tokens": 3514396.0, - "step": 393 - }, - { - "epoch": 0.2993920972644377, - "grad_norm": 1.8610916137695312, - "learning_rate": 4.96671486051967e-06, - "loss": 0.5440595149993896, - "mean_token_accuracy": 0.8196715116500854, - "num_tokens": 3523604.0, - "step": 394 - }, - { - "epoch": 0.30015197568389057, - "grad_norm": 2.9585862159729004, - "learning_rate": 4.966373368459542e-06, - "loss": 0.6921588182449341, - "mean_token_accuracy": 0.7816659808158875, - "num_tokens": 3529849.0, - "step": 395 - }, - { - "epoch": 0.3009118541033435, - "grad_norm": 1.9374035596847534, - "learning_rate": 4.966030145403642e-06, - "loss": 0.5494055151939392, - "mean_token_accuracy": 0.8126792907714844, - "num_tokens": 3539529.0, - "step": 396 - }, - { - "epoch": 0.30167173252279633, - "grad_norm": 1.730530023574829, - "learning_rate": 4.965685191592859e-06, - "loss": 0.4271572232246399, - "mean_token_accuracy": 0.8383668661117554, - "num_tokens": 3550972.0, - "step": 397 - }, - { - "epoch": 0.30243161094224924, - "grad_norm": 3.9635560512542725, - "learning_rate": 4.9653385072692935e-06, - "loss": 0.5576210021972656, - "mean_token_accuracy": 0.799404501914978, - "num_tokens": 3554147.0, - "step": 398 - }, - { - "epoch": 0.30319148936170215, - "grad_norm": 2.5731968879699707, - "learning_rate": 4.964990092676263e-06, - "loss": 0.5478942394256592, - "mean_token_accuracy": 0.8220961093902588, - "num_tokens": 3559972.0, - "step": 399 - }, - { - "epoch": 0.303951367781155, - "grad_norm": 2.2096588611602783, - "learning_rate": 4.964639948058297e-06, - "loss": 0.35461270809173584, - "mean_token_accuracy": 0.8640927076339722, - "num_tokens": 3565770.0, - "step": 400 - }, - { - "epoch": 0.3047112462006079, - "grad_norm": 1.7874189615249634, - "learning_rate": 4.964288073661142e-06, - "loss": 0.38849619030952454, - "mean_token_accuracy": 0.8443037271499634, - "num_tokens": 3574514.0, - "step": 401 - }, - { - "epoch": 0.30547112462006076, - "grad_norm": 1.5583146810531616, - "learning_rate": 4.963934469731756e-06, - "loss": 0.48909449577331543, - "mean_token_accuracy": 0.8429768681526184, - "num_tokens": 3585877.0, - "step": 402 - }, - { - "epoch": 0.30623100303951367, - "grad_norm": 3.026599645614624, - "learning_rate": 4.963579136518312e-06, - "loss": 0.5138992071151733, - "mean_token_accuracy": 0.8283728361129761, - "num_tokens": 3590412.0, - "step": 403 - }, - { - "epoch": 0.3069908814589666, - "grad_norm": 2.777505874633789, - "learning_rate": 4.963222074270197e-06, - "loss": 0.6241534948348999, - "mean_token_accuracy": 0.8130464553833008, - "num_tokens": 3596246.0, - "step": 404 - }, - { - "epoch": 0.30775075987841943, - "grad_norm": 2.4772839546203613, - "learning_rate": 4.962863283238011e-06, - "loss": 0.5930814146995544, - "mean_token_accuracy": 0.8036394715309143, - "num_tokens": 3602878.0, - "step": 405 - }, - { - "epoch": 0.30851063829787234, - "grad_norm": 1.5049982070922852, - "learning_rate": 4.962502763673566e-06, - "loss": 0.4903082549571991, - "mean_token_accuracy": 0.8184912204742432, - "num_tokens": 3617018.0, - "step": 406 - }, - { - "epoch": 0.30927051671732525, - "grad_norm": 2.453155040740967, - "learning_rate": 4.96214051582989e-06, - "loss": 0.5138067603111267, - "mean_token_accuracy": 0.8336835503578186, - "num_tokens": 3624188.0, - "step": 407 - }, - { - "epoch": 0.3100303951367781, - "grad_norm": 2.4038336277008057, - "learning_rate": 4.961776539961222e-06, - "loss": 0.5752760171890259, - "mean_token_accuracy": 0.8054730892181396, - "num_tokens": 3634152.0, - "step": 408 - }, - { - "epoch": 0.310790273556231, - "grad_norm": 2.629068374633789, - "learning_rate": 4.961410836323014e-06, - "loss": 0.5580606460571289, - "mean_token_accuracy": 0.8121089935302734, - "num_tokens": 3639528.0, - "step": 409 - }, - { - "epoch": 0.31155015197568386, - "grad_norm": 1.4245928525924683, - "learning_rate": 4.961043405171931e-06, - "loss": 0.5399882793426514, - "mean_token_accuracy": 0.812280535697937, - "num_tokens": 3655744.0, - "step": 410 - }, - { - "epoch": 0.3123100303951368, - "grad_norm": 1.5236459970474243, - "learning_rate": 4.9606742467658505e-06, - "loss": 0.5234690308570862, - "mean_token_accuracy": 0.8188928365707397, - "num_tokens": 3675010.0, - "step": 411 - }, - { - "epoch": 0.3130699088145897, - "grad_norm": 2.27961802482605, - "learning_rate": 4.960303361363863e-06, - "loss": 0.5502505898475647, - "mean_token_accuracy": 0.8161963224411011, - "num_tokens": 3682328.0, - "step": 412 - }, - { - "epoch": 0.31382978723404253, - "grad_norm": 1.554518222808838, - "learning_rate": 4.959930749226269e-06, - "loss": 0.420867919921875, - "mean_token_accuracy": 0.8499157428741455, - "num_tokens": 3694980.0, - "step": 413 - }, - { - "epoch": 0.31458966565349544, - "grad_norm": 2.609218120574951, - "learning_rate": 4.9595564106145825e-06, - "loss": 0.4706704318523407, - "mean_token_accuracy": 0.8412490487098694, - "num_tokens": 3700033.0, - "step": 414 - }, - { - "epoch": 0.31534954407294835, - "grad_norm": 1.5303231477737427, - "learning_rate": 4.959180345791528e-06, - "loss": 0.4668654799461365, - "mean_token_accuracy": 0.8125015497207642, - "num_tokens": 3715012.0, - "step": 415 - }, - { - "epoch": 0.3161094224924012, - "grad_norm": 1.2774665355682373, - "learning_rate": 4.958802555021042e-06, - "loss": 0.4339369237422943, - "mean_token_accuracy": 0.8442851901054382, - "num_tokens": 3733928.0, - "step": 416 - }, - { - "epoch": 0.3168693009118541, - "grad_norm": 2.1240181922912598, - "learning_rate": 4.958423038568274e-06, - "loss": 0.4029104709625244, - "mean_token_accuracy": 0.8627674579620361, - "num_tokens": 3740202.0, - "step": 417 - }, - { - "epoch": 0.31762917933130697, - "grad_norm": 2.00538969039917, - "learning_rate": 4.958041796699583e-06, - "loss": 0.5229607820510864, - "mean_token_accuracy": 0.8282366394996643, - "num_tokens": 3749308.0, - "step": 418 - }, - { - "epoch": 0.3183890577507599, - "grad_norm": 2.6555092334747314, - "learning_rate": 4.957658829682539e-06, - "loss": 0.5344101190567017, - "mean_token_accuracy": 0.8183202743530273, - "num_tokens": 3754595.0, - "step": 419 - }, - { - "epoch": 0.3191489361702128, - "grad_norm": 1.7468839883804321, - "learning_rate": 4.9572741377859225e-06, - "loss": 0.5667245984077454, - "mean_token_accuracy": 0.8080123662948608, - "num_tokens": 3765761.0, - "step": 420 - }, - { - "epoch": 0.31990881458966564, - "grad_norm": 2.9612457752227783, - "learning_rate": 4.956887721279726e-06, - "loss": 0.5389559864997864, - "mean_token_accuracy": 0.8019476532936096, - "num_tokens": 3770844.0, - "step": 421 - }, - { - "epoch": 0.32066869300911854, - "grad_norm": 1.842403769493103, - "learning_rate": 4.95649958043515e-06, - "loss": 0.38279837369918823, - "mean_token_accuracy": 0.858866810798645, - "num_tokens": 3778094.0, - "step": 422 - }, - { - "epoch": 0.32142857142857145, - "grad_norm": 2.3108131885528564, - "learning_rate": 4.956109715524609e-06, - "loss": 0.5453893542289734, - "mean_token_accuracy": 0.8085013031959534, - "num_tokens": 3785015.0, - "step": 423 - }, - { - "epoch": 0.3221884498480243, - "grad_norm": 3.0326945781707764, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.5550523400306702, - "mean_token_accuracy": 0.8125876188278198, - "num_tokens": 3789830.0, - "step": 424 - }, - { - "epoch": 0.3229483282674772, - "grad_norm": 1.8851977586746216, - "learning_rate": 4.955324814601324e-06, - "loss": 0.4902324974536896, - "mean_token_accuracy": 0.8205406665802002, - "num_tokens": 3799862.0, - "step": 425 - }, - { - "epoch": 0.32370820668693007, - "grad_norm": 2.6018171310424805, - "learning_rate": 4.954929779139455e-06, - "loss": 0.5920133590698242, - "mean_token_accuracy": 0.8340690732002258, - "num_tokens": 3806617.0, - "step": 426 - }, - { - "epoch": 0.324468085106383, - "grad_norm": 2.4283878803253174, - "learning_rate": 4.954533020713367e-06, - "loss": 0.5305854082107544, - "mean_token_accuracy": 0.8137468099594116, - "num_tokens": 3813843.0, - "step": 427 - }, - { - "epoch": 0.3252279635258359, - "grad_norm": 2.667978525161743, - "learning_rate": 4.954134539601519e-06, - "loss": 0.5333638787269592, - "mean_token_accuracy": 0.8402629494667053, - "num_tokens": 3819450.0, - "step": 428 - }, - { - "epoch": 0.32598784194528874, - "grad_norm": 1.7302523851394653, - "learning_rate": 4.953734336083582e-06, - "loss": 0.422895610332489, - "mean_token_accuracy": 0.8709704875946045, - "num_tokens": 3831027.0, - "step": 429 - }, - { - "epoch": 0.32674772036474165, - "grad_norm": 2.427192211151123, - "learning_rate": 4.953332410440434e-06, - "loss": 0.6334598064422607, - "mean_token_accuracy": 0.7817479968070984, - "num_tokens": 3841776.0, - "step": 430 - }, - { - "epoch": 0.32750759878419455, - "grad_norm": 1.460949182510376, - "learning_rate": 4.952928762954161e-06, - "loss": 0.3654777705669403, - "mean_token_accuracy": 0.8780122995376587, - "num_tokens": 3852213.0, - "step": 431 - }, - { - "epoch": 0.3282674772036474, - "grad_norm": 1.9855005741119385, - "learning_rate": 4.952523393908059e-06, - "loss": 0.5117089748382568, - "mean_token_accuracy": 0.811911404132843, - "num_tokens": 3861176.0, - "step": 432 - }, - { - "epoch": 0.3290273556231003, - "grad_norm": 2.2653207778930664, - "learning_rate": 4.952116303586631e-06, - "loss": 0.42514950037002563, - "mean_token_accuracy": 0.8448518514633179, - "num_tokens": 3867164.0, - "step": 433 - }, - { - "epoch": 0.32978723404255317, - "grad_norm": 1.9780964851379395, - "learning_rate": 4.951707492275589e-06, - "loss": 0.5095293521881104, - "mean_token_accuracy": 0.8262748718261719, - "num_tokens": 3876406.0, - "step": 434 - }, - { - "epoch": 0.3305471124620061, - "grad_norm": 2.9480233192443848, - "learning_rate": 4.951296960261853e-06, - "loss": 0.3494448959827423, - "mean_token_accuracy": 0.8781307935714722, - "num_tokens": 3880298.0, - "step": 435 - }, - { - "epoch": 0.331306990881459, - "grad_norm": 2.335571527481079, - "learning_rate": 4.95088470783355e-06, - "loss": 0.5456914901733398, - "mean_token_accuracy": 0.816297173500061, - "num_tokens": 3886487.0, - "step": 436 - }, - { - "epoch": 0.33206686930091184, - "grad_norm": 2.3046419620513916, - "learning_rate": 4.950470735280013e-06, - "loss": 0.4835948944091797, - "mean_token_accuracy": 0.8539175391197205, - "num_tokens": 3892706.0, - "step": 437 - }, - { - "epoch": 0.33282674772036475, - "grad_norm": 2.44047474861145, - "learning_rate": 4.950055042891786e-06, - "loss": 0.5154092907905579, - "mean_token_accuracy": 0.8579919338226318, - "num_tokens": 3899532.0, - "step": 438 - }, - { - "epoch": 0.33358662613981765, - "grad_norm": 4.826764106750488, - "learning_rate": 4.949637630960618e-06, - "loss": 0.5270259976387024, - "mean_token_accuracy": 0.8172192573547363, - "num_tokens": 3902260.0, - "step": 439 - }, - { - "epoch": 0.3343465045592705, - "grad_norm": 2.001574754714966, - "learning_rate": 4.949218499779462e-06, - "loss": 0.5413002967834473, - "mean_token_accuracy": 0.8162837028503418, - "num_tokens": 3911706.0, - "step": 440 - }, - { - "epoch": 0.3351063829787234, - "grad_norm": 1.7998944520950317, - "learning_rate": 4.948797649642484e-06, - "loss": 0.5131614208221436, - "mean_token_accuracy": 0.8367440700531006, - "num_tokens": 3923490.0, - "step": 441 - }, - { - "epoch": 0.33586626139817627, - "grad_norm": 3.4566173553466797, - "learning_rate": 4.94837508084505e-06, - "loss": 0.7258909940719604, - "mean_token_accuracy": 0.771377444267273, - "num_tokens": 3928099.0, - "step": 442 - }, - { - "epoch": 0.3366261398176292, - "grad_norm": 2.0040442943573, - "learning_rate": 4.9479507936837364e-06, - "loss": 0.482135534286499, - "mean_token_accuracy": 0.8339327573776245, - "num_tokens": 3937328.0, - "step": 443 - }, - { - "epoch": 0.3373860182370821, - "grad_norm": 2.949502944946289, - "learning_rate": 4.947524788456325e-06, - "loss": 0.6474795341491699, - "mean_token_accuracy": 0.7951677441596985, - "num_tokens": 3942529.0, - "step": 444 - }, - { - "epoch": 0.33814589665653494, - "grad_norm": 1.5528364181518555, - "learning_rate": 4.947097065461801e-06, - "loss": 0.48791584372520447, - "mean_token_accuracy": 0.8425545692443848, - "num_tokens": 3955200.0, - "step": 445 - }, - { - "epoch": 0.33890577507598785, - "grad_norm": 1.8813284635543823, - "learning_rate": 4.946667625000358e-06, - "loss": 0.45922309160232544, - "mean_token_accuracy": 0.8206527233123779, - "num_tokens": 3962975.0, - "step": 446 - }, - { - "epoch": 0.33966565349544076, - "grad_norm": 1.7157847881317139, - "learning_rate": 4.946236467373392e-06, - "loss": 0.5454182028770447, - "mean_token_accuracy": 0.8049604892730713, - "num_tokens": 3973956.0, - "step": 447 - }, - { - "epoch": 0.3404255319148936, - "grad_norm": 2.008857250213623, - "learning_rate": 4.945803592883509e-06, - "loss": 0.5151860117912292, - "mean_token_accuracy": 0.8262045383453369, - "num_tokens": 3982853.0, - "step": 448 - }, - { - "epoch": 0.3411854103343465, - "grad_norm": 1.6632496118545532, - "learning_rate": 4.9453690018345144e-06, - "loss": 0.42710691690444946, - "mean_token_accuracy": 0.8521314859390259, - "num_tokens": 3993838.0, - "step": 449 - }, - { - "epoch": 0.34194528875379937, - "grad_norm": 1.365234375, - "learning_rate": 4.944932694531423e-06, - "loss": 0.5172526836395264, - "mean_token_accuracy": 0.8277045488357544, - "num_tokens": 4014179.0, - "step": 450 - }, - { - "epoch": 0.3427051671732523, - "grad_norm": 1.7610243558883667, - "learning_rate": 4.94449467128045e-06, - "loss": 0.42104798555374146, - "mean_token_accuracy": 0.8552065491676331, - "num_tokens": 4023663.0, - "step": 451 - }, - { - "epoch": 0.3434650455927052, - "grad_norm": 2.3732354640960693, - "learning_rate": 4.944054932389018e-06, - "loss": 0.5471175909042358, - "mean_token_accuracy": 0.8487317562103271, - "num_tokens": 4030100.0, - "step": 452 - }, - { - "epoch": 0.34422492401215804, - "grad_norm": 1.5973623991012573, - "learning_rate": 4.943613478165753e-06, - "loss": 0.419813871383667, - "mean_token_accuracy": 0.8484025001525879, - "num_tokens": 4041124.0, - "step": 453 - }, - { - "epoch": 0.34498480243161095, - "grad_norm": 2.966381549835205, - "learning_rate": 4.943170308920484e-06, - "loss": 0.5370652675628662, - "mean_token_accuracy": 0.8439491987228394, - "num_tokens": 4045675.0, - "step": 454 - }, - { - "epoch": 0.34574468085106386, - "grad_norm": 2.5097248554229736, - "learning_rate": 4.9427254249642445e-06, - "loss": 0.5776349306106567, - "mean_token_accuracy": 0.8060523867607117, - "num_tokens": 4053250.0, - "step": 455 - }, - { - "epoch": 0.3465045592705167, - "grad_norm": 1.6779125928878784, - "learning_rate": 4.942278826609272e-06, - "loss": 0.5245476961135864, - "mean_token_accuracy": 0.8168526887893677, - "num_tokens": 4064106.0, - "step": 456 - }, - { - "epoch": 0.3472644376899696, - "grad_norm": 1.5945546627044678, - "learning_rate": 4.9418305141690045e-06, - "loss": 0.4972047209739685, - "mean_token_accuracy": 0.8257735967636108, - "num_tokens": 4077687.0, - "step": 457 - }, - { - "epoch": 0.34802431610942247, - "grad_norm": 2.864778757095337, - "learning_rate": 4.9413804879580865e-06, - "loss": 0.5372499823570251, - "mean_token_accuracy": 0.8423776626586914, - "num_tokens": 4082632.0, - "step": 458 - }, - { - "epoch": 0.3487841945288754, - "grad_norm": 1.4797078371047974, - "learning_rate": 4.940928748292363e-06, - "loss": 0.5903409719467163, - "mean_token_accuracy": 0.8061295747756958, - "num_tokens": 4104218.0, - "step": 459 - }, - { - "epoch": 0.3495440729483283, - "grad_norm": 2.4376983642578125, - "learning_rate": 4.940475295488882e-06, - "loss": 0.4534894824028015, - "mean_token_accuracy": 0.8395825028419495, - "num_tokens": 4110530.0, - "step": 460 - }, - { - "epoch": 0.35030395136778114, - "grad_norm": 1.2955626249313354, - "learning_rate": 4.940020129865895e-06, - "loss": 0.47155818343162537, - "mean_token_accuracy": 0.8253582715988159, - "num_tokens": 4128398.0, - "step": 461 - }, - { - "epoch": 0.35106382978723405, - "grad_norm": 2.066575527191162, - "learning_rate": 4.9395632517428546e-06, - "loss": 0.5555641651153564, - "mean_token_accuracy": 0.814624547958374, - "num_tokens": 4137623.0, - "step": 462 - }, - { - "epoch": 0.3518237082066869, - "grad_norm": 1.6407525539398193, - "learning_rate": 4.939104661440415e-06, - "loss": 0.4361790418624878, - "mean_token_accuracy": 0.8544459342956543, - "num_tokens": 4152803.0, - "step": 463 - }, - { - "epoch": 0.3525835866261398, - "grad_norm": 2.1685116291046143, - "learning_rate": 4.938644359280433e-06, - "loss": 0.5347012877464294, - "mean_token_accuracy": 0.853853702545166, - "num_tokens": 4160778.0, - "step": 464 - }, - { - "epoch": 0.3533434650455927, - "grad_norm": 1.8824869394302368, - "learning_rate": 4.938182345585967e-06, - "loss": 0.5512481927871704, - "mean_token_accuracy": 0.7985891699790955, - "num_tokens": 4170380.0, - "step": 465 - }, - { - "epoch": 0.3541033434650456, - "grad_norm": 2.2229504585266113, - "learning_rate": 4.937718620681273e-06, - "loss": 0.516828179359436, - "mean_token_accuracy": 0.8265621066093445, - "num_tokens": 4178179.0, - "step": 466 - }, - { - "epoch": 0.3548632218844985, - "grad_norm": 1.955990195274353, - "learning_rate": 4.9372531848918145e-06, - "loss": 0.5586158037185669, - "mean_token_accuracy": 0.8367916345596313, - "num_tokens": 4188626.0, - "step": 467 - }, - { - "epoch": 0.3556231003039514, - "grad_norm": 1.9687023162841797, - "learning_rate": 4.936786038544251e-06, - "loss": 0.5517531633377075, - "mean_token_accuracy": 0.8134098052978516, - "num_tokens": 4198144.0, - "step": 468 - }, - { - "epoch": 0.35638297872340424, - "grad_norm": 1.405516505241394, - "learning_rate": 4.9363171819664434e-06, - "loss": 0.5305492877960205, - "mean_token_accuracy": 0.8014427423477173, - "num_tokens": 4222818.0, - "step": 469 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 2.6355695724487305, - "learning_rate": 4.9358466154874535e-06, - "loss": 0.5303391218185425, - "mean_token_accuracy": 0.8028861284255981, - "num_tokens": 4228318.0, - "step": 470 - }, - { - "epoch": 0.35790273556231, - "grad_norm": 1.5133824348449707, - "learning_rate": 4.935374339437543e-06, - "loss": 0.5329189300537109, - "mean_token_accuracy": 0.8479441404342651, - "num_tokens": 4244527.0, - "step": 471 - }, - { - "epoch": 0.3586626139817629, - "grad_norm": 3.4356725215911865, - "learning_rate": 4.934900354148173e-06, - "loss": 0.5431582927703857, - "mean_token_accuracy": 0.8328983783721924, - "num_tokens": 4248034.0, - "step": 472 - }, - { - "epoch": 0.3594224924012158, - "grad_norm": 2.5789499282836914, - "learning_rate": 4.934424659952006e-06, - "loss": 0.4141455292701721, - "mean_token_accuracy": 0.8658635020256042, - "num_tokens": 4252953.0, - "step": 473 - }, - { - "epoch": 0.3601823708206687, - "grad_norm": 1.145262598991394, - "learning_rate": 4.933947257182901e-06, - "loss": 0.40294092893600464, - "mean_token_accuracy": 0.8565847277641296, - "num_tokens": 4277813.0, - "step": 474 - }, - { - "epoch": 0.3609422492401216, - "grad_norm": 1.7242133617401123, - "learning_rate": 4.933468146175918e-06, - "loss": 0.6036738753318787, - "mean_token_accuracy": 0.8072597980499268, - "num_tokens": 4291088.0, - "step": 475 - }, - { - "epoch": 0.3617021276595745, - "grad_norm": 2.3490941524505615, - "learning_rate": 4.932987327267317e-06, - "loss": 0.49456146359443665, - "mean_token_accuracy": 0.8372673988342285, - "num_tokens": 4297376.0, - "step": 476 - }, - { - "epoch": 0.36246200607902734, - "grad_norm": 1.3605526685714722, - "learning_rate": 4.932504800794553e-06, - "loss": 0.43595948815345764, - "mean_token_accuracy": 0.8415953516960144, - "num_tokens": 4312054.0, - "step": 477 - }, - { - "epoch": 0.36322188449848025, - "grad_norm": 1.4525885581970215, - "learning_rate": 4.9320205670962815e-06, - "loss": 0.5390371680259705, - "mean_token_accuracy": 0.8101649284362793, - "num_tokens": 4328701.0, - "step": 478 - }, - { - "epoch": 0.3639817629179331, - "grad_norm": 1.9862419366836548, - "learning_rate": 4.931534626512359e-06, - "loss": 0.45436930656433105, - "mean_token_accuracy": 0.8352861404418945, - "num_tokens": 4338372.0, - "step": 479 - }, - { - "epoch": 0.364741641337386, - "grad_norm": 1.7804961204528809, - "learning_rate": 4.931046979383836e-06, - "loss": 0.4677754044532776, - "mean_token_accuracy": 0.840467095375061, - "num_tokens": 4347897.0, - "step": 480 - }, - { - "epoch": 0.3655015197568389, - "grad_norm": 2.066632032394409, - "learning_rate": 4.930557626052961e-06, - "loss": 0.42418140172958374, - "mean_token_accuracy": 0.8528275489807129, - "num_tokens": 4354061.0, - "step": 481 - }, - { - "epoch": 0.3662613981762918, - "grad_norm": 1.6155282258987427, - "learning_rate": 4.930066566863182e-06, - "loss": 0.5424284934997559, - "mean_token_accuracy": 0.825040876865387, - "num_tokens": 4370400.0, - "step": 482 - }, - { - "epoch": 0.3670212765957447, - "grad_norm": 2.1452953815460205, - "learning_rate": 4.929573802159143e-06, - "loss": 0.5105804204940796, - "mean_token_accuracy": 0.8284053802490234, - "num_tokens": 4377579.0, - "step": 483 - }, - { - "epoch": 0.3677811550151976, - "grad_norm": 1.8940945863723755, - "learning_rate": 4.929079332286685e-06, - "loss": 0.43478304147720337, - "mean_token_accuracy": 0.8505665063858032, - "num_tokens": 4385686.0, - "step": 484 - }, - { - "epoch": 0.36854103343465044, - "grad_norm": 1.6785860061645508, - "learning_rate": 4.928583157592846e-06, - "loss": 0.40227848291397095, - "mean_token_accuracy": 0.8623573780059814, - "num_tokens": 4396128.0, - "step": 485 - }, - { - "epoch": 0.36930091185410335, - "grad_norm": 1.6416733264923096, - "learning_rate": 4.928085278425862e-06, - "loss": 0.526267409324646, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 4407963.0, - "step": 486 - }, - { - "epoch": 0.3700607902735562, - "grad_norm": 1.8882389068603516, - "learning_rate": 4.927585695135162e-06, - "loss": 0.5555213093757629, - "mean_token_accuracy": 0.8115293979644775, - "num_tokens": 4418057.0, - "step": 487 - }, - { - "epoch": 0.3708206686930091, - "grad_norm": 2.300248384475708, - "learning_rate": 4.9270844080713735e-06, - "loss": 0.5812339186668396, - "mean_token_accuracy": 0.800270676612854, - "num_tokens": 4425358.0, - "step": 488 - }, - { - "epoch": 0.371580547112462, - "grad_norm": 1.6802922487258911, - "learning_rate": 4.926581417586319e-06, - "loss": 0.5134941935539246, - "mean_token_accuracy": 0.8247408866882324, - "num_tokens": 4437702.0, - "step": 489 - }, - { - "epoch": 0.3723404255319149, - "grad_norm": 1.7620291709899902, - "learning_rate": 4.926076724033016e-06, - "loss": 0.5233973264694214, - "mean_token_accuracy": 0.8102161884307861, - "num_tokens": 4448584.0, - "step": 490 - }, - { - "epoch": 0.3731003039513678, - "grad_norm": 1.6911998987197876, - "learning_rate": 4.925570327765678e-06, - "loss": 0.5337274074554443, - "mean_token_accuracy": 0.845306396484375, - "num_tokens": 4462651.0, - "step": 491 - }, - { - "epoch": 0.3738601823708207, - "grad_norm": 1.7991242408752441, - "learning_rate": 4.9250622291397144e-06, - "loss": 0.31018948554992676, - "mean_token_accuracy": 0.8857606053352356, - "num_tokens": 4469971.0, - "step": 492 - }, - { - "epoch": 0.37462006079027355, - "grad_norm": 4.9776835441589355, - "learning_rate": 4.924552428511727e-06, - "loss": 0.44114983081817627, - "mean_token_accuracy": 0.8429906368255615, - "num_tokens": 4478275.0, - "step": 493 - }, - { - "epoch": 0.37537993920972645, - "grad_norm": 1.8007272481918335, - "learning_rate": 4.924040926239515e-06, - "loss": 0.574328601360321, - "mean_token_accuracy": 0.7669196128845215, - "num_tokens": 4491551.0, - "step": 494 - }, - { - "epoch": 0.3761398176291793, - "grad_norm": 2.021300792694092, - "learning_rate": 4.92352772268207e-06, - "loss": 0.45636120438575745, - "mean_token_accuracy": 0.840438723564148, - "num_tokens": 4498658.0, - "step": 495 - }, - { - "epoch": 0.3768996960486322, - "grad_norm": 2.369748592376709, - "learning_rate": 4.923012818199576e-06, - "loss": 0.5206376910209656, - "mean_token_accuracy": 0.8521823287010193, - "num_tokens": 4504648.0, - "step": 496 - }, - { - "epoch": 0.3776595744680851, - "grad_norm": 2.733485221862793, - "learning_rate": 4.922496213153416e-06, - "loss": 0.5067723989486694, - "mean_token_accuracy": 0.8168281316757202, - "num_tokens": 4509990.0, - "step": 497 - }, - { - "epoch": 0.378419452887538, - "grad_norm": 2.3751676082611084, - "learning_rate": 4.921977907906161e-06, - "loss": 0.49757206439971924, - "mean_token_accuracy": 0.8325017690658569, - "num_tokens": 4518373.0, - "step": 498 - }, - { - "epoch": 0.3791793313069909, - "grad_norm": 2.1672775745391846, - "learning_rate": 4.921457902821578e-06, - "loss": 0.4237566590309143, - "mean_token_accuracy": 0.8404698371887207, - "num_tokens": 4524338.0, - "step": 499 - }, - { - "epoch": 0.3799392097264438, - "grad_norm": 1.8374360799789429, - "learning_rate": 4.9209361982646275e-06, - "loss": 0.4995468854904175, - "mean_token_accuracy": 0.8299649953842163, - "num_tokens": 4533396.0, - "step": 500 - }, - { - "epoch": 0.38069908814589665, - "grad_norm": 2.083967924118042, - "learning_rate": 4.920412794601461e-06, - "loss": 0.489935040473938, - "mean_token_accuracy": 0.8315291404724121, - "num_tokens": 4540941.0, - "step": 501 - }, - { - "epoch": 0.38145896656534956, - "grad_norm": 2.2075610160827637, - "learning_rate": 4.919887692199423e-06, - "loss": 0.5233147740364075, - "mean_token_accuracy": 0.804171085357666, - "num_tokens": 4548215.0, - "step": 502 - }, - { - "epoch": 0.3822188449848024, - "grad_norm": 2.076775312423706, - "learning_rate": 4.9193608914270515e-06, - "loss": 0.5785550475120544, - "mean_token_accuracy": 0.7993186116218567, - "num_tokens": 4558204.0, - "step": 503 - }, - { - "epoch": 0.3829787234042553, - "grad_norm": 2.238546133041382, - "learning_rate": 4.918832392654075e-06, - "loss": 0.5287384390830994, - "mean_token_accuracy": 0.8214945793151855, - "num_tokens": 4565407.0, - "step": 504 - }, - { - "epoch": 0.3837386018237082, - "grad_norm": 1.6783074140548706, - "learning_rate": 4.9183021962514145e-06, - "loss": 0.6063359379768372, - "mean_token_accuracy": 0.7914625406265259, - "num_tokens": 4580991.0, - "step": 505 - }, - { - "epoch": 0.3844984802431611, - "grad_norm": 1.6287449598312378, - "learning_rate": 4.917770302591183e-06, - "loss": 0.3598247766494751, - "mean_token_accuracy": 0.8706809878349304, - "num_tokens": 4590579.0, - "step": 506 - }, - { - "epoch": 0.385258358662614, - "grad_norm": 1.5432041883468628, - "learning_rate": 4.917236712046682e-06, - "loss": 0.5267890095710754, - "mean_token_accuracy": 0.8032117486000061, - "num_tokens": 4608380.0, - "step": 507 - }, - { - "epoch": 0.3860182370820669, - "grad_norm": 1.7664037942886353, - "learning_rate": 4.9167014249924075e-06, - "loss": 0.3552354574203491, - "mean_token_accuracy": 0.8569793701171875, - "num_tokens": 4616426.0, - "step": 508 - }, - { - "epoch": 0.38677811550151975, - "grad_norm": 2.1147472858428955, - "learning_rate": 4.916164441804044e-06, - "loss": 0.5212404727935791, - "mean_token_accuracy": 0.8196578025817871, - "num_tokens": 4623908.0, - "step": 509 - }, - { - "epoch": 0.38753799392097266, - "grad_norm": 2.1092333793640137, - "learning_rate": 4.915625762858467e-06, - "loss": 0.5197038650512695, - "mean_token_accuracy": 0.8245604634284973, - "num_tokens": 4630956.0, - "step": 510 - }, - { - "epoch": 0.3882978723404255, - "grad_norm": 1.23331880569458, - "learning_rate": 4.915085388533743e-06, - "loss": 0.4759839177131653, - "mean_token_accuracy": 0.8192248344421387, - "num_tokens": 4651269.0, - "step": 511 - }, - { - "epoch": 0.3890577507598784, - "grad_norm": 2.424199104309082, - "learning_rate": 4.914543319209126e-06, - "loss": 0.5576270818710327, - "mean_token_accuracy": 0.8203302621841431, - "num_tokens": 4657296.0, - "step": 512 - }, - { - "epoch": 0.3898176291793313, - "grad_norm": 2.725156307220459, - "learning_rate": 4.913999555265062e-06, - "loss": 0.4337949752807617, - "mean_token_accuracy": 0.8382406234741211, - "num_tokens": 4661850.0, - "step": 513 - }, - { - "epoch": 0.3905775075987842, - "grad_norm": 2.3120534420013428, - "learning_rate": 4.913454097083185e-06, - "loss": 0.4941597580909729, - "mean_token_accuracy": 0.8302834033966064, - "num_tokens": 4667769.0, - "step": 514 - }, - { - "epoch": 0.3913373860182371, - "grad_norm": 2.3111207485198975, - "learning_rate": 4.912906945046319e-06, - "loss": 0.5253715515136719, - "mean_token_accuracy": 0.84515380859375, - "num_tokens": 4674537.0, - "step": 515 - }, - { - "epoch": 0.39209726443769, - "grad_norm": 1.4117841720581055, - "learning_rate": 4.912358099538476e-06, - "loss": 0.4521017074584961, - "mean_token_accuracy": 0.8208256959915161, - "num_tokens": 4690605.0, - "step": 516 - }, - { - "epoch": 0.39285714285714285, - "grad_norm": 2.3742799758911133, - "learning_rate": 4.911807560944858e-06, - "loss": 0.41572901606559753, - "mean_token_accuracy": 0.8550551533699036, - "num_tokens": 4706437.0, - "step": 517 - }, - { - "epoch": 0.39361702127659576, - "grad_norm": 2.4052202701568604, - "learning_rate": 4.911255329651852e-06, - "loss": 0.6003736257553101, - "mean_token_accuracy": 0.8247885704040527, - "num_tokens": 4712746.0, - "step": 518 - }, - { - "epoch": 0.3943768996960486, - "grad_norm": 1.9335490465164185, - "learning_rate": 4.910701406047037e-06, - "loss": 0.5457713603973389, - "mean_token_accuracy": 0.787429690361023, - "num_tokens": 4731937.0, - "step": 519 - }, - { - "epoch": 0.3951367781155015, - "grad_norm": 2.257706880569458, - "learning_rate": 4.910145790519177e-06, - "loss": 0.5300652980804443, - "mean_token_accuracy": 0.8192912936210632, - "num_tokens": 4739422.0, - "step": 520 - }, - { - "epoch": 0.3958966565349544, - "grad_norm": 1.2099462747573853, - "learning_rate": 4.9095884834582256e-06, - "loss": 0.45872747898101807, - "mean_token_accuracy": 0.8362667560577393, - "num_tokens": 4757113.0, - "step": 521 - }, - { - "epoch": 0.3966565349544073, - "grad_norm": 2.7991135120391846, - "learning_rate": 4.909029485255321e-06, - "loss": 0.49039560556411743, - "mean_token_accuracy": 0.8260016441345215, - "num_tokens": 4761709.0, - "step": 522 - }, - { - "epoch": 0.3974164133738602, - "grad_norm": 2.2360129356384277, - "learning_rate": 4.90846879630279e-06, - "loss": 0.49556830525398254, - "mean_token_accuracy": 0.827864408493042, - "num_tokens": 4769048.0, - "step": 523 - }, - { - "epoch": 0.3981762917933131, - "grad_norm": 2.5953688621520996, - "learning_rate": 4.907906416994146e-06, - "loss": 0.387208491563797, - "mean_token_accuracy": 0.8467001914978027, - "num_tokens": 4774637.0, - "step": 524 - }, - { - "epoch": 0.39893617021276595, - "grad_norm": 2.1046814918518066, - "learning_rate": 4.907342347724088e-06, - "loss": 0.5477259755134583, - "mean_token_accuracy": 0.8060322999954224, - "num_tokens": 4782774.0, - "step": 525 - }, - { - "epoch": 0.39969604863221886, - "grad_norm": 2.5622646808624268, - "learning_rate": 4.906776588888502e-06, - "loss": 0.5684159398078918, - "mean_token_accuracy": 0.8095303177833557, - "num_tokens": 4788766.0, - "step": 526 - }, - { - "epoch": 0.4004559270516717, - "grad_norm": 1.9027913808822632, - "learning_rate": 4.906209140884459e-06, - "loss": 0.535524845123291, - "mean_token_accuracy": 0.815237820148468, - "num_tokens": 4798492.0, - "step": 527 - }, - { - "epoch": 0.4012158054711246, - "grad_norm": 2.1447622776031494, - "learning_rate": 4.905640004110216e-06, - "loss": 0.5628632307052612, - "mean_token_accuracy": 0.8085395097732544, - "num_tokens": 4805737.0, - "step": 528 - }, - { - "epoch": 0.40197568389057753, - "grad_norm": 1.6754741668701172, - "learning_rate": 4.905069178965215e-06, - "loss": 0.5046736598014832, - "mean_token_accuracy": 0.8247535228729248, - "num_tokens": 4816912.0, - "step": 529 - }, - { - "epoch": 0.4027355623100304, - "grad_norm": 2.271230459213257, - "learning_rate": 4.904496665850083e-06, - "loss": 0.6086187958717346, - "mean_token_accuracy": 0.7935276627540588, - "num_tokens": 4824577.0, - "step": 530 - }, - { - "epoch": 0.4034954407294833, - "grad_norm": 2.107595205307007, - "learning_rate": 4.903922465166633e-06, - "loss": 0.5431341528892517, - "mean_token_accuracy": 0.8129537105560303, - "num_tokens": 4831772.0, - "step": 531 - }, - { - "epoch": 0.40425531914893614, - "grad_norm": 1.3860732316970825, - "learning_rate": 4.903346577317859e-06, - "loss": 0.45816320180892944, - "mean_token_accuracy": 0.8328287601470947, - "num_tokens": 4850302.0, - "step": 532 - }, - { - "epoch": 0.40501519756838905, - "grad_norm": 1.9186837673187256, - "learning_rate": 4.902769002707942e-06, - "loss": 0.3294633626937866, - "mean_token_accuracy": 0.8853933811187744, - "num_tokens": 4856624.0, - "step": 533 - }, - { - "epoch": 0.40577507598784196, - "grad_norm": 1.516194462776184, - "learning_rate": 4.902189741742247e-06, - "loss": 0.45482105016708374, - "mean_token_accuracy": 0.8370342254638672, - "num_tokens": 4870395.0, - "step": 534 - }, - { - "epoch": 0.4065349544072948, - "grad_norm": 2.3235628604888916, - "learning_rate": 4.901608794827321e-06, - "loss": 0.40688639879226685, - "mean_token_accuracy": 0.8643521666526794, - "num_tokens": 4875645.0, - "step": 535 - }, - { - "epoch": 0.4072948328267477, - "grad_norm": 2.29286527633667, - "learning_rate": 4.9010261623708945e-06, - "loss": 0.45482826232910156, - "mean_token_accuracy": 0.8429383039474487, - "num_tokens": 4881772.0, - "step": 536 - }, - { - "epoch": 0.40805471124620063, - "grad_norm": 1.5907070636749268, - "learning_rate": 4.900441844781882e-06, - "loss": 0.5266948342323303, - "mean_token_accuracy": 0.8348641395568848, - "num_tokens": 4894289.0, - "step": 537 - }, - { - "epoch": 0.4088145896656535, - "grad_norm": 2.1816294193267822, - "learning_rate": 4.89985584247038e-06, - "loss": 0.4797617793083191, - "mean_token_accuracy": 0.8549500703811646, - "num_tokens": 4901106.0, - "step": 538 - }, - { - "epoch": 0.4095744680851064, - "grad_norm": 1.7347146272659302, - "learning_rate": 4.899268155847667e-06, - "loss": 0.4754739999771118, - "mean_token_accuracy": 0.8278418183326721, - "num_tokens": 4912131.0, - "step": 539 - }, - { - "epoch": 0.41033434650455924, - "grad_norm": 2.0694527626037598, - "learning_rate": 4.898678785326205e-06, - "loss": 0.5071008801460266, - "mean_token_accuracy": 0.8157946467399597, - "num_tokens": 4921141.0, - "step": 540 - }, - { - "epoch": 0.41109422492401215, - "grad_norm": 2.570047616958618, - "learning_rate": 4.898087731319637e-06, - "loss": 0.43639278411865234, - "mean_token_accuracy": 0.8682913780212402, - "num_tokens": 4926182.0, - "step": 541 - }, - { - "epoch": 0.41185410334346506, - "grad_norm": 4.064006805419922, - "learning_rate": 4.8974949942427854e-06, - "loss": 0.539260745048523, - "mean_token_accuracy": 0.8225528001785278, - "num_tokens": 4929449.0, - "step": 542 - }, - { - "epoch": 0.4126139817629179, - "grad_norm": 1.7644332647323608, - "learning_rate": 4.896900574511657e-06, - "loss": 0.472618043422699, - "mean_token_accuracy": 0.8332902193069458, - "num_tokens": 4939443.0, - "step": 543 - }, - { - "epoch": 0.4133738601823708, - "grad_norm": 2.879918336868286, - "learning_rate": 4.89630447254344e-06, - "loss": 0.6360667943954468, - "mean_token_accuracy": 0.8215296268463135, - "num_tokens": 4950838.0, - "step": 544 - }, - { - "epoch": 0.41413373860182373, - "grad_norm": 1.4575570821762085, - "learning_rate": 4.8957066887565005e-06, - "loss": 0.45617997646331787, - "mean_token_accuracy": 0.8373187184333801, - "num_tokens": 4965222.0, - "step": 545 - }, - { - "epoch": 0.4148936170212766, - "grad_norm": 2.4829535484313965, - "learning_rate": 4.895107223570386e-06, - "loss": 0.42285341024398804, - "mean_token_accuracy": 0.8686380386352539, - "num_tokens": 4970724.0, - "step": 546 - }, - { - "epoch": 0.4156534954407295, - "grad_norm": 2.639474630355835, - "learning_rate": 4.894506077405824e-06, - "loss": 0.5906289219856262, - "mean_token_accuracy": 0.8174435496330261, - "num_tokens": 4976766.0, - "step": 547 - }, - { - "epoch": 0.41641337386018235, - "grad_norm": 2.7960562705993652, - "learning_rate": 4.893903250684723e-06, - "loss": 0.4518949091434479, - "mean_token_accuracy": 0.8387585282325745, - "num_tokens": 4980991.0, - "step": 548 - }, - { - "epoch": 0.41717325227963525, - "grad_norm": 2.184176206588745, - "learning_rate": 4.893298743830168e-06, - "loss": 0.5223842859268188, - "mean_token_accuracy": 0.8170937299728394, - "num_tokens": 4987781.0, - "step": 549 - }, - { - "epoch": 0.41793313069908816, - "grad_norm": 2.2393438816070557, - "learning_rate": 4.892692557266429e-06, - "loss": 0.5238431692123413, - "mean_token_accuracy": 0.8217905759811401, - "num_tokens": 4994321.0, - "step": 550 - }, - { - "epoch": 0.418693009118541, - "grad_norm": 3.579047441482544, - "learning_rate": 4.8920846914189465e-06, - "loss": 0.5367584228515625, - "mean_token_accuracy": 0.8312011361122131, - "num_tokens": 4997951.0, - "step": 551 - }, - { - "epoch": 0.4194528875379939, - "grad_norm": 1.6330240964889526, - "learning_rate": 4.891475146714348e-06, - "loss": 0.6054705381393433, - "mean_token_accuracy": 0.7938206791877747, - "num_tokens": 5012726.0, - "step": 552 - }, - { - "epoch": 0.42021276595744683, - "grad_norm": 1.5775716304779053, - "learning_rate": 4.8908639235804324e-06, - "loss": 0.4774656891822815, - "mean_token_accuracy": 0.828762948513031, - "num_tokens": 5026751.0, - "step": 553 - }, - { - "epoch": 0.4209726443768997, - "grad_norm": 1.5719101428985596, - "learning_rate": 4.890251022446181e-06, - "loss": 0.549429178237915, - "mean_token_accuracy": 0.8110791444778442, - "num_tokens": 5041861.0, - "step": 554 - }, - { - "epoch": 0.4217325227963526, - "grad_norm": 1.8585275411605835, - "learning_rate": 4.889636443741752e-06, - "loss": 0.4448118805885315, - "mean_token_accuracy": 0.8462690711021423, - "num_tokens": 5052690.0, - "step": 555 - }, - { - "epoch": 0.42249240121580545, - "grad_norm": 2.189202070236206, - "learning_rate": 4.88902018789848e-06, - "loss": 0.4296762943267822, - "mean_token_accuracy": 0.8488791584968567, - "num_tokens": 5058964.0, - "step": 556 - }, - { - "epoch": 0.42325227963525835, - "grad_norm": 1.9328460693359375, - "learning_rate": 4.888402255348877e-06, - "loss": 0.5369474291801453, - "mean_token_accuracy": 0.8184729814529419, - "num_tokens": 5068465.0, - "step": 557 - }, - { - "epoch": 0.42401215805471126, - "grad_norm": 1.6233323812484741, - "learning_rate": 4.887782646526631e-06, - "loss": 0.5284391641616821, - "mean_token_accuracy": 0.8276044726371765, - "num_tokens": 5081052.0, - "step": 558 - }, - { - "epoch": 0.4247720364741641, - "grad_norm": 2.222813844680786, - "learning_rate": 4.887161361866608e-06, - "loss": 0.5679137706756592, - "mean_token_accuracy": 0.8012375831604004, - "num_tokens": 5090001.0, - "step": 559 - }, - { - "epoch": 0.425531914893617, - "grad_norm": 2.1062207221984863, - "learning_rate": 4.8865384018048494e-06, - "loss": 0.5554201602935791, - "mean_token_accuracy": 0.8128066062927246, - "num_tokens": 5097644.0, - "step": 560 - }, - { - "epoch": 0.42629179331306993, - "grad_norm": 1.5380984544754028, - "learning_rate": 4.8859137667785735e-06, - "loss": 0.4948265850543976, - "mean_token_accuracy": 0.8258291482925415, - "num_tokens": 5110069.0, - "step": 561 - }, - { - "epoch": 0.4270516717325228, - "grad_norm": 2.0290257930755615, - "learning_rate": 4.8852874572261715e-06, - "loss": 0.4969530403614044, - "mean_token_accuracy": 0.8297134637832642, - "num_tokens": 5117452.0, - "step": 562 - }, - { - "epoch": 0.4278115501519757, - "grad_norm": 1.5651452541351318, - "learning_rate": 4.884659473587213e-06, - "loss": 0.5353102087974548, - "mean_token_accuracy": 0.8161719441413879, - "num_tokens": 5133756.0, - "step": 563 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 2.2470998764038086, - "learning_rate": 4.884029816302441e-06, - "loss": 0.5104288458824158, - "mean_token_accuracy": 0.8081635236740112, - "num_tokens": 5140278.0, - "step": 564 - }, - { - "epoch": 0.42933130699088146, - "grad_norm": 1.726891279220581, - "learning_rate": 4.883398485813772e-06, - "loss": 0.4508771002292633, - "mean_token_accuracy": 0.8548800349235535, - "num_tokens": 5150115.0, - "step": 565 - }, - { - "epoch": 0.43009118541033436, - "grad_norm": 1.4779289960861206, - "learning_rate": 4.8827654825642984e-06, - "loss": 0.46861088275909424, - "mean_token_accuracy": 0.8209476470947266, - "num_tokens": 5163225.0, - "step": 566 - }, - { - "epoch": 0.4308510638297872, - "grad_norm": 1.2361034154891968, - "learning_rate": 4.882130806998287e-06, - "loss": 0.4591076672077179, - "mean_token_accuracy": 0.803041934967041, - "num_tokens": 5180342.0, - "step": 567 - }, - { - "epoch": 0.4316109422492401, - "grad_norm": 1.882467269897461, - "learning_rate": 4.881494459561177e-06, - "loss": 0.579258143901825, - "mean_token_accuracy": 0.8007112741470337, - "num_tokens": 5189595.0, - "step": 568 - }, - { - "epoch": 0.43237082066869303, - "grad_norm": 1.095462441444397, - "learning_rate": 4.880856440699582e-06, - "loss": 0.3806574046611786, - "mean_token_accuracy": 0.8650111556053162, - "num_tokens": 5211642.0, - "step": 569 - }, - { - "epoch": 0.4331306990881459, - "grad_norm": 1.6469846963882446, - "learning_rate": 4.880216750861288e-06, - "loss": 0.544589638710022, - "mean_token_accuracy": 0.8060122728347778, - "num_tokens": 5224137.0, - "step": 570 - }, - { - "epoch": 0.4338905775075988, - "grad_norm": 1.8561251163482666, - "learning_rate": 4.879575390495254e-06, - "loss": 0.4094924330711365, - "mean_token_accuracy": 0.8591406345367432, - "num_tokens": 5231588.0, - "step": 571 - }, - { - "epoch": 0.43465045592705165, - "grad_norm": 3.01326847076416, - "learning_rate": 4.878932360051611e-06, - "loss": 0.6139192581176758, - "mean_token_accuracy": 0.8108739852905273, - "num_tokens": 5236853.0, - "step": 572 - }, - { - "epoch": 0.43541033434650456, - "grad_norm": 2.1753034591674805, - "learning_rate": 4.878287659981663e-06, - "loss": 0.49082931876182556, - "mean_token_accuracy": 0.862828254699707, - "num_tokens": 5243264.0, - "step": 573 - }, - { - "epoch": 0.43617021276595747, - "grad_norm": 1.4437755346298218, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.5608728528022766, - "mean_token_accuracy": 0.8271626234054565, - "num_tokens": 5261757.0, - "step": 574 - }, - { - "epoch": 0.4369300911854103, - "grad_norm": 1.786683440208435, - "learning_rate": 4.876993252773923e-06, - "loss": 0.4377627968788147, - "mean_token_accuracy": 0.844936192035675, - "num_tokens": 5271038.0, - "step": 575 - }, - { - "epoch": 0.4376899696048632, - "grad_norm": 1.3425915241241455, - "learning_rate": 4.876343546544596e-06, - "loss": 0.44762521982192993, - "mean_token_accuracy": 0.8397793769836426, - "num_tokens": 5285555.0, - "step": 576 - }, - { - "epoch": 0.43844984802431614, - "grad_norm": 2.1549675464630127, - "learning_rate": 4.8756921725058935e-06, - "loss": 0.5332942008972168, - "mean_token_accuracy": 0.820149302482605, - "num_tokens": 5294595.0, - "step": 577 - }, - { - "epoch": 0.439209726443769, - "grad_norm": 1.5254042148590088, - "learning_rate": 4.875039131114975e-06, - "loss": 0.3646543622016907, - "mean_token_accuracy": 0.8442583084106445, - "num_tokens": 5304955.0, - "step": 578 - }, - { - "epoch": 0.4399696048632219, - "grad_norm": 1.5751557350158691, - "learning_rate": 4.8743844228301676e-06, - "loss": 0.4854734539985657, - "mean_token_accuracy": 0.8317523002624512, - "num_tokens": 5317351.0, - "step": 579 - }, - { - "epoch": 0.44072948328267475, - "grad_norm": 1.6950466632843018, - "learning_rate": 4.873728048110973e-06, - "loss": 0.5907570719718933, - "mean_token_accuracy": 0.7946986556053162, - "num_tokens": 5332542.0, - "step": 580 - }, - { - "epoch": 0.44148936170212766, - "grad_norm": 2.1180708408355713, - "learning_rate": 4.873070007418059e-06, - "loss": 0.5220296382904053, - "mean_token_accuracy": 0.8037363290786743, - "num_tokens": 5341722.0, - "step": 581 - }, - { - "epoch": 0.44224924012158057, - "grad_norm": 1.3643816709518433, - "learning_rate": 4.872410301213265e-06, - "loss": 0.4865502417087555, - "mean_token_accuracy": 0.8377852439880371, - "num_tokens": 5359359.0, - "step": 582 - }, - { - "epoch": 0.4430091185410334, - "grad_norm": 1.483280897140503, - "learning_rate": 4.871748929959598e-06, - "loss": 0.36856764554977417, - "mean_token_accuracy": 0.8709549903869629, - "num_tokens": 5369749.0, - "step": 583 - }, - { - "epoch": 0.44376899696048633, - "grad_norm": 1.6891541481018066, - "learning_rate": 4.871085894121234e-06, - "loss": 0.5768930912017822, - "mean_token_accuracy": 0.8030461668968201, - "num_tokens": 5383912.0, - "step": 584 - }, - { - "epoch": 0.44452887537993924, - "grad_norm": 2.1318740844726562, - "learning_rate": 4.870421194163515e-06, - "loss": 0.4337100386619568, - "mean_token_accuracy": 0.8562518358230591, - "num_tokens": 5389412.0, - "step": 585 - }, - { - "epoch": 0.4452887537993921, - "grad_norm": 2.540255546569824, - "learning_rate": 4.869754830552956e-06, - "loss": 0.4708256125450134, - "mean_token_accuracy": 0.8446552753448486, - "num_tokens": 5394762.0, - "step": 586 - }, - { - "epoch": 0.446048632218845, - "grad_norm": 2.048015594482422, - "learning_rate": 4.869086803757235e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8181137442588806, - "num_tokens": 5402379.0, - "step": 587 - }, - { - "epoch": 0.44680851063829785, - "grad_norm": 2.9821012020111084, - "learning_rate": 4.868417114245199e-06, - "loss": 0.6299797296524048, - "mean_token_accuracy": 0.8237329125404358, - "num_tokens": 5408229.0, - "step": 588 - }, - { - "epoch": 0.44756838905775076, - "grad_norm": 1.7807202339172363, - "learning_rate": 4.867745762486862e-06, - "loss": 0.5176759958267212, - "mean_token_accuracy": 0.8184244632720947, - "num_tokens": 5418383.0, - "step": 589 - }, - { - "epoch": 0.44832826747720367, - "grad_norm": 1.5466399192810059, - "learning_rate": 4.8670727489534035e-06, - "loss": 0.5137228965759277, - "mean_token_accuracy": 0.8365053534507751, - "num_tokens": 5432127.0, - "step": 590 - }, - { - "epoch": 0.4490881458966565, - "grad_norm": 2.9521141052246094, - "learning_rate": 4.866398074117173e-06, - "loss": 0.4056887924671173, - "mean_token_accuracy": 0.8561501502990723, - "num_tokens": 5436062.0, - "step": 591 - }, - { - "epoch": 0.44984802431610943, - "grad_norm": 2.058743953704834, - "learning_rate": 4.86572173845168e-06, - "loss": 0.6124799251556396, - "mean_token_accuracy": 0.8007957339286804, - "num_tokens": 5444989.0, - "step": 592 - }, - { - "epoch": 0.4506079027355623, - "grad_norm": 2.1243767738342285, - "learning_rate": 4.865043742431605e-06, - "loss": 0.5659694671630859, - "mean_token_accuracy": 0.8084750175476074, - "num_tokens": 5453865.0, - "step": 593 - }, - { - "epoch": 0.4513677811550152, - "grad_norm": 1.6732314825057983, - "learning_rate": 4.864364086532792e-06, - "loss": 0.47879064083099365, - "mean_token_accuracy": 0.8346436023712158, - "num_tokens": 5466398.0, - "step": 594 - }, - { - "epoch": 0.4521276595744681, - "grad_norm": 1.3793858289718628, - "learning_rate": 4.863682771232249e-06, - "loss": 0.45989373326301575, - "mean_token_accuracy": 0.8254791498184204, - "num_tokens": 5482121.0, - "step": 595 - }, - { - "epoch": 0.45288753799392095, - "grad_norm": 1.9812315702438354, - "learning_rate": 4.862999797008149e-06, - "loss": 0.5778874754905701, - "mean_token_accuracy": 0.8041508197784424, - "num_tokens": 5493000.0, - "step": 596 - }, - { - "epoch": 0.45364741641337386, - "grad_norm": 3.3065083026885986, - "learning_rate": 4.862315164339829e-06, - "loss": 0.4623975157737732, - "mean_token_accuracy": 0.8426318168640137, - "num_tokens": 5496723.0, - "step": 597 - }, - { - "epoch": 0.45440729483282677, - "grad_norm": 3.167119026184082, - "learning_rate": 4.861628873707792e-06, - "loss": 0.6984533667564392, - "mean_token_accuracy": 0.772136926651001, - "num_tokens": 5501161.0, - "step": 598 - }, - { - "epoch": 0.4551671732522796, - "grad_norm": 2.2130985260009766, - "learning_rate": 4.860940925593703e-06, - "loss": 0.4823192059993744, - "mean_token_accuracy": 0.8462972640991211, - "num_tokens": 5509544.0, - "step": 599 - }, - { - "epoch": 0.45592705167173253, - "grad_norm": 3.029191732406616, - "learning_rate": 4.86025132048039e-06, - "loss": 0.523664116859436, - "mean_token_accuracy": 0.8229140043258667, - "num_tokens": 5514586.0, - "step": 600 - }, - { - "epoch": 0.4566869300911854, - "grad_norm": 1.6983962059020996, - "learning_rate": 4.859560058851844e-06, - "loss": 0.4832698106765747, - "mean_token_accuracy": 0.8403248190879822, - "num_tokens": 5525773.0, - "step": 601 - }, - { - "epoch": 0.4574468085106383, - "grad_norm": 3.0504038333892822, - "learning_rate": 4.8588671411932195e-06, - "loss": 0.5158926248550415, - "mean_token_accuracy": 0.8098392486572266, - "num_tokens": 5529739.0, - "step": 602 - }, - { - "epoch": 0.4582066869300912, - "grad_norm": 2.584836483001709, - "learning_rate": 4.858172567990832e-06, - "loss": 0.5724587440490723, - "mean_token_accuracy": 0.8128519058227539, - "num_tokens": 5535763.0, - "step": 603 - }, - { - "epoch": 0.45896656534954405, - "grad_norm": 2.0514042377471924, - "learning_rate": 4.857476339732162e-06, - "loss": 0.4337679445743561, - "mean_token_accuracy": 0.8405929207801819, - "num_tokens": 5543075.0, - "step": 604 - }, - { - "epoch": 0.45972644376899696, - "grad_norm": 2.2949347496032715, - "learning_rate": 4.856778456905846e-06, - "loss": 0.46532145142555237, - "mean_token_accuracy": 0.8345137238502502, - "num_tokens": 5549035.0, - "step": 605 - }, - { - "epoch": 0.46048632218844987, - "grad_norm": 2.2067551612854004, - "learning_rate": 4.856078920001689e-06, - "loss": 0.5855136513710022, - "mean_token_accuracy": 0.8043795228004456, - "num_tokens": 5555545.0, - "step": 606 - }, - { - "epoch": 0.4612462006079027, - "grad_norm": 2.101945161819458, - "learning_rate": 4.855377729510648e-06, - "loss": 0.6071814298629761, - "mean_token_accuracy": 0.7973253130912781, - "num_tokens": 5563615.0, - "step": 607 - }, - { - "epoch": 0.46200607902735563, - "grad_norm": 2.5958821773529053, - "learning_rate": 4.8546748859248504e-06, - "loss": 0.6278061866760254, - "mean_token_accuracy": 0.7864972352981567, - "num_tokens": 5570078.0, - "step": 608 - }, - { - "epoch": 0.4627659574468085, - "grad_norm": 2.778101921081543, - "learning_rate": 4.853970389737576e-06, - "loss": 0.35521194338798523, - "mean_token_accuracy": 0.8752605319023132, - "num_tokens": 5573995.0, - "step": 609 - }, - { - "epoch": 0.4635258358662614, - "grad_norm": 2.600534677505493, - "learning_rate": 4.8532642414432675e-06, - "loss": 0.6541563868522644, - "mean_token_accuracy": 0.7843613028526306, - "num_tokens": 5580333.0, - "step": 610 - }, - { - "epoch": 0.4642857142857143, - "grad_norm": 1.778337836265564, - "learning_rate": 4.852556441537528e-06, - "loss": 0.3561405837535858, - "mean_token_accuracy": 0.8579353094100952, - "num_tokens": 5588430.0, - "step": 611 - }, - { - "epoch": 0.46504559270516715, - "grad_norm": 1.5653862953186035, - "learning_rate": 4.851846990517118e-06, - "loss": 0.6067906618118286, - "mean_token_accuracy": 0.7919317483901978, - "num_tokens": 5601700.0, - "step": 612 - }, - { - "epoch": 0.46580547112462006, - "grad_norm": 1.6097723245620728, - "learning_rate": 4.851135888879958e-06, - "loss": 0.446664422750473, - "mean_token_accuracy": 0.8441969156265259, - "num_tokens": 5612063.0, - "step": 613 - }, - { - "epoch": 0.46656534954407297, - "grad_norm": 1.961207389831543, - "learning_rate": 4.850423137125126e-06, - "loss": 0.5508605241775513, - "mean_token_accuracy": 0.8240450024604797, - "num_tokens": 5620245.0, - "step": 614 - }, - { - "epoch": 0.4673252279635258, - "grad_norm": 2.2189085483551025, - "learning_rate": 4.8497087357528585e-06, - "loss": 0.6805076599121094, - "mean_token_accuracy": 0.771978497505188, - "num_tokens": 5629590.0, - "step": 615 - }, - { - "epoch": 0.46808510638297873, - "grad_norm": 2.5176279544830322, - "learning_rate": 4.8489926852645505e-06, - "loss": 0.4512156844139099, - "mean_token_accuracy": 0.836459755897522, - "num_tokens": 5635259.0, - "step": 616 - }, - { - "epoch": 0.4688449848024316, - "grad_norm": 1.5327287912368774, - "learning_rate": 4.848274986162754e-06, - "loss": 0.4884302616119385, - "mean_token_accuracy": 0.8194037079811096, - "num_tokens": 5649993.0, - "step": 617 - }, - { - "epoch": 0.4696048632218845, - "grad_norm": 2.184554100036621, - "learning_rate": 4.847555638951177e-06, - "loss": 0.5141451358795166, - "mean_token_accuracy": 0.8245922327041626, - "num_tokens": 5657375.0, - "step": 618 - }, - { - "epoch": 0.4703647416413374, - "grad_norm": 1.6143407821655273, - "learning_rate": 4.846834644134686e-06, - "loss": 0.4276641607284546, - "mean_token_accuracy": 0.8481845855712891, - "num_tokens": 5667941.0, - "step": 619 - }, - { - "epoch": 0.47112462006079026, - "grad_norm": 2.3747270107269287, - "learning_rate": 4.846112002219301e-06, - "loss": 0.5608246922492981, - "mean_token_accuracy": 0.8073011040687561, - "num_tokens": 5675042.0, - "step": 620 - }, - { - "epoch": 0.47188449848024316, - "grad_norm": 2.390404224395752, - "learning_rate": 4.845387713712203e-06, - "loss": 0.46616724133491516, - "mean_token_accuracy": 0.8468319177627563, - "num_tokens": 5680207.0, - "step": 621 - }, - { - "epoch": 0.4726443768996961, - "grad_norm": 1.7245099544525146, - "learning_rate": 4.844661779121723e-06, - "loss": 0.5652435421943665, - "mean_token_accuracy": 0.8010749816894531, - "num_tokens": 5693759.0, - "step": 622 - }, - { - "epoch": 0.4734042553191489, - "grad_norm": 2.6923108100891113, - "learning_rate": 4.843934198957351e-06, - "loss": 0.6254661679267883, - "mean_token_accuracy": 0.8236024975776672, - "num_tokens": 5699916.0, - "step": 623 - }, - { - "epoch": 0.47416413373860183, - "grad_norm": 2.516901969909668, - "learning_rate": 4.84320497372973e-06, - "loss": 0.6334252953529358, - "mean_token_accuracy": 0.7803834676742554, - "num_tokens": 5706554.0, - "step": 624 - }, - { - "epoch": 0.4749240121580547, - "grad_norm": 2.3744447231292725, - "learning_rate": 4.842474103950658e-06, - "loss": 0.4221811890602112, - "mean_token_accuracy": 0.8639545440673828, - "num_tokens": 5711756.0, - "step": 625 - }, - { - "epoch": 0.4756838905775076, - "grad_norm": 3.2373476028442383, - "learning_rate": 4.841741590133089e-06, - "loss": 0.6637828946113586, - "mean_token_accuracy": 0.7968347072601318, - "num_tokens": 5716458.0, - "step": 626 - }, - { - "epoch": 0.4764437689969605, - "grad_norm": 2.153888463973999, - "learning_rate": 4.841007432791129e-06, - "loss": 0.4877486228942871, - "mean_token_accuracy": 0.8345249891281128, - "num_tokens": 5723155.0, - "step": 627 - }, - { - "epoch": 0.47720364741641336, - "grad_norm": 2.120497703552246, - "learning_rate": 4.8402716324400375e-06, - "loss": 0.37323033809661865, - "mean_token_accuracy": 0.8734050393104553, - "num_tokens": 5729171.0, - "step": 628 - }, - { - "epoch": 0.47796352583586627, - "grad_norm": 1.5294172763824463, - "learning_rate": 4.839534189596228e-06, - "loss": 0.4057067334651947, - "mean_token_accuracy": 0.8523319959640503, - "num_tokens": 5740112.0, - "step": 629 - }, - { - "epoch": 0.4787234042553192, - "grad_norm": 2.1913886070251465, - "learning_rate": 4.8387951047772656e-06, - "loss": 0.4835960865020752, - "mean_token_accuracy": 0.8438145518302917, - "num_tokens": 5746838.0, - "step": 630 - }, - { - "epoch": 0.479483282674772, - "grad_norm": 1.482897162437439, - "learning_rate": 4.838054378501868e-06, - "loss": 0.46967992186546326, - "mean_token_accuracy": 0.8315759897232056, - "num_tokens": 5760428.0, - "step": 631 - }, - { - "epoch": 0.48024316109422494, - "grad_norm": 1.38850998878479, - "learning_rate": 4.837312011289907e-06, - "loss": 0.41845446825027466, - "mean_token_accuracy": 0.8557186126708984, - "num_tokens": 5773437.0, - "step": 632 - }, - { - "epoch": 0.4810030395136778, - "grad_norm": 3.8337457180023193, - "learning_rate": 4.836568003662403e-06, - "loss": 0.5102912187576294, - "mean_token_accuracy": 0.830644965171814, - "num_tokens": 5776367.0, - "step": 633 - }, - { - "epoch": 0.4817629179331307, - "grad_norm": 1.2084007263183594, - "learning_rate": 4.8358223561415304e-06, - "loss": 0.3835333585739136, - "mean_token_accuracy": 0.8639016151428223, - "num_tokens": 5792246.0, - "step": 634 - }, - { - "epoch": 0.4825227963525836, - "grad_norm": 1.939408540725708, - "learning_rate": 4.835075069250613e-06, - "loss": 0.4044850468635559, - "mean_token_accuracy": 0.8488376140594482, - "num_tokens": 5799853.0, - "step": 635 - }, - { - "epoch": 0.48328267477203646, - "grad_norm": 1.345870852470398, - "learning_rate": 4.8343261435141245e-06, - "loss": 0.46660199761390686, - "mean_token_accuracy": 0.8371681571006775, - "num_tokens": 5817478.0, - "step": 636 - }, - { - "epoch": 0.48404255319148937, - "grad_norm": 1.6531339883804321, - "learning_rate": 4.833575579457691e-06, - "loss": 0.3886989951133728, - "mean_token_accuracy": 0.8763507008552551, - "num_tokens": 5825739.0, - "step": 637 - }, - { - "epoch": 0.4848024316109423, - "grad_norm": 1.6443969011306763, - "learning_rate": 4.832823377608088e-06, - "loss": 0.4070289731025696, - "mean_token_accuracy": 0.8586630821228027, - "num_tokens": 5837917.0, - "step": 638 - }, - { - "epoch": 0.48556231003039513, - "grad_norm": 2.005136013031006, - "learning_rate": 4.832069538493237e-06, - "loss": 0.40616685152053833, - "mean_token_accuracy": 0.8571510314941406, - "num_tokens": 5845250.0, - "step": 639 - }, - { - "epoch": 0.48632218844984804, - "grad_norm": 1.5244266986846924, - "learning_rate": 4.831314062642213e-06, - "loss": 0.49530288577079773, - "mean_token_accuracy": 0.8328841924667358, - "num_tokens": 5857407.0, - "step": 640 - }, - { - "epoch": 0.4870820668693009, - "grad_norm": 1.9876971244812012, - "learning_rate": 4.830556950585239e-06, - "loss": 0.4583776593208313, - "mean_token_accuracy": 0.8427221179008484, - "num_tokens": 5865391.0, - "step": 641 - }, - { - "epoch": 0.4878419452887538, - "grad_norm": 3.023336172103882, - "learning_rate": 4.829798202853683e-06, - "loss": 0.6134771108627319, - "mean_token_accuracy": 0.7981935739517212, - "num_tokens": 5870729.0, - "step": 642 - }, - { - "epoch": 0.4886018237082067, - "grad_norm": 1.8889515399932861, - "learning_rate": 4.829037819980065e-06, - "loss": 0.4420135021209717, - "mean_token_accuracy": 0.8480775356292725, - "num_tokens": 5878982.0, - "step": 643 - }, - { - "epoch": 0.48936170212765956, - "grad_norm": 2.2408435344696045, - "learning_rate": 4.828275802498051e-06, - "loss": 0.525706946849823, - "mean_token_accuracy": 0.8271557092666626, - "num_tokens": 5885097.0, - "step": 644 - }, - { - "epoch": 0.49012158054711247, - "grad_norm": 1.9734224081039429, - "learning_rate": 4.827512150942454e-06, - "loss": 0.44246578216552734, - "mean_token_accuracy": 0.8456668257713318, - "num_tokens": 5893941.0, - "step": 645 - }, - { - "epoch": 0.4908814589665654, - "grad_norm": 1.9618173837661743, - "learning_rate": 4.8267468658492335e-06, - "loss": 0.5119768381118774, - "mean_token_accuracy": 0.8355510830879211, - "num_tokens": 5902829.0, - "step": 646 - }, - { - "epoch": 0.49164133738601823, - "grad_norm": 1.7181587219238281, - "learning_rate": 4.825979947755496e-06, - "loss": 0.5666520595550537, - "mean_token_accuracy": 0.7951971888542175, - "num_tokens": 5915212.0, - "step": 647 - }, - { - "epoch": 0.49240121580547114, - "grad_norm": 3.0121164321899414, - "learning_rate": 4.8252113971994955e-06, - "loss": 0.628632128238678, - "mean_token_accuracy": 0.8041050434112549, - "num_tokens": 5921410.0, - "step": 648 - }, - { - "epoch": 0.493161094224924, - "grad_norm": 2.9980475902557373, - "learning_rate": 4.824441214720629e-06, - "loss": 0.4507424831390381, - "mean_token_accuracy": 0.8636263608932495, - "num_tokens": 5925179.0, - "step": 649 - }, - { - "epoch": 0.4939209726443769, - "grad_norm": 2.0096445083618164, - "learning_rate": 4.823669400859441e-06, - "loss": 0.602759838104248, - "mean_token_accuracy": 0.8104915618896484, - "num_tokens": 5934160.0, - "step": 650 - }, - { - "epoch": 0.4946808510638298, - "grad_norm": 1.1186442375183105, - "learning_rate": 4.8228959561576195e-06, - "loss": 0.41168469190597534, - "mean_token_accuracy": 0.8461419939994812, - "num_tokens": 5954163.0, - "step": 651 - }, - { - "epoch": 0.49544072948328266, - "grad_norm": 1.855465054512024, - "learning_rate": 4.822120881157998e-06, - "loss": 0.5049735307693481, - "mean_token_accuracy": 0.8225747346878052, - "num_tokens": 5963840.0, - "step": 652 - }, - { - "epoch": 0.49620060790273557, - "grad_norm": 3.550563335418701, - "learning_rate": 4.821344176404554e-06, - "loss": 0.49025264382362366, - "mean_token_accuracy": 0.8265978693962097, - "num_tokens": 5967358.0, - "step": 653 - }, - { - "epoch": 0.4969604863221885, - "grad_norm": 3.063910484313965, - "learning_rate": 4.820565842442408e-06, - "loss": 0.5652767419815063, - "mean_token_accuracy": 0.811700701713562, - "num_tokens": 5971858.0, - "step": 654 - }, - { - "epoch": 0.49772036474164133, - "grad_norm": 2.4613308906555176, - "learning_rate": 4.819785879817827e-06, - "loss": 0.5296125411987305, - "mean_token_accuracy": 0.8336488008499146, - "num_tokens": 5977442.0, - "step": 655 - }, - { - "epoch": 0.49848024316109424, - "grad_norm": 2.342519760131836, - "learning_rate": 4.819004289078217e-06, - "loss": 0.5753380060195923, - "mean_token_accuracy": 0.7922406792640686, - "num_tokens": 5984531.0, - "step": 656 - }, - { - "epoch": 0.4992401215805471, - "grad_norm": 2.0410680770874023, - "learning_rate": 4.818221070772129e-06, - "loss": 0.5433275699615479, - "mean_token_accuracy": 0.8043830990791321, - "num_tokens": 5992642.0, - "step": 657 - }, - { - "epoch": 0.5, - "grad_norm": 1.4999698400497437, - "learning_rate": 4.8174362254492555e-06, - "loss": 0.5248899459838867, - "mean_token_accuracy": 0.8107168674468994, - "num_tokens": 6005543.0, - "step": 658 - }, - { - "epoch": 0.5007598784194529, - "grad_norm": 1.9494401216506958, - "learning_rate": 4.816649753660431e-06, - "loss": 0.41291385889053345, - "mean_token_accuracy": 0.8650569915771484, - "num_tokens": 6012185.0, - "step": 659 - }, - { - "epoch": 0.5015197568389058, - "grad_norm": 2.7514095306396484, - "learning_rate": 4.815861655957632e-06, - "loss": 0.4244142770767212, - "mean_token_accuracy": 0.8485112190246582, - "num_tokens": 6016809.0, - "step": 660 - }, - { - "epoch": 0.5022796352583586, - "grad_norm": 1.4354928731918335, - "learning_rate": 4.815071932893976e-06, - "loss": 0.4332060217857361, - "mean_token_accuracy": 0.8386815786361694, - "num_tokens": 6034795.0, - "step": 661 - }, - { - "epoch": 0.5030395136778115, - "grad_norm": 1.3113417625427246, - "learning_rate": 4.81428058502372e-06, - "loss": 0.5415540933609009, - "mean_token_accuracy": 0.8115285038948059, - "num_tokens": 6053624.0, - "step": 662 - }, - { - "epoch": 0.5037993920972644, - "grad_norm": 1.820868730545044, - "learning_rate": 4.813487612902265e-06, - "loss": 0.5360245108604431, - "mean_token_accuracy": 0.8313555717468262, - "num_tokens": 6063399.0, - "step": 663 - }, - { - "epoch": 0.5045592705167173, - "grad_norm": 2.347001552581787, - "learning_rate": 4.812693017086145e-06, - "loss": 0.4926982820034027, - "mean_token_accuracy": 0.8137006759643555, - "num_tokens": 6070111.0, - "step": 664 - }, - { - "epoch": 0.5053191489361702, - "grad_norm": 1.8830888271331787, - "learning_rate": 4.811896798133042e-06, - "loss": 0.5419014692306519, - "mean_token_accuracy": 0.8027454614639282, - "num_tokens": 6081090.0, - "step": 665 - }, - { - "epoch": 0.506079027355623, - "grad_norm": 2.3258056640625, - "learning_rate": 4.811098956601772e-06, - "loss": 0.4629337787628174, - "mean_token_accuracy": 0.8416580557823181, - "num_tokens": 6087921.0, - "step": 666 - }, - { - "epoch": 0.506838905775076, - "grad_norm": 1.9578291177749634, - "learning_rate": 4.810299493052289e-06, - "loss": 0.40305402874946594, - "mean_token_accuracy": 0.8529061079025269, - "num_tokens": 6100034.0, - "step": 667 - }, - { - "epoch": 0.5075987841945289, - "grad_norm": 2.800635576248169, - "learning_rate": 4.809498408045691e-06, - "loss": 0.5087342262268066, - "mean_token_accuracy": 0.8214689493179321, - "num_tokens": 6104742.0, - "step": 668 - }, - { - "epoch": 0.5083586626139818, - "grad_norm": 1.5318149328231812, - "learning_rate": 4.808695702144206e-06, - "loss": 0.4733222723007202, - "mean_token_accuracy": 0.837577223777771, - "num_tokens": 6117242.0, - "step": 669 - }, - { - "epoch": 0.5091185410334347, - "grad_norm": 1.2368661165237427, - "learning_rate": 4.807891375911207e-06, - "loss": 0.3929097056388855, - "mean_token_accuracy": 0.8331400752067566, - "num_tokens": 6133509.0, - "step": 670 - }, - { - "epoch": 0.5098784194528876, - "grad_norm": 2.4711415767669678, - "learning_rate": 4.8070854299112e-06, - "loss": 0.6294851303100586, - "mean_token_accuracy": 0.7956781983375549, - "num_tokens": 6140294.0, - "step": 671 - }, - { - "epoch": 0.5106382978723404, - "grad_norm": 2.590961217880249, - "learning_rate": 4.806277864709828e-06, - "loss": 0.580160915851593, - "mean_token_accuracy": 0.809589684009552, - "num_tokens": 6145803.0, - "step": 672 - }, - { - "epoch": 0.5113981762917933, - "grad_norm": 2.4653842449188232, - "learning_rate": 4.805468680873874e-06, - "loss": 0.5262120366096497, - "mean_token_accuracy": 0.822458803653717, - "num_tokens": 6151236.0, - "step": 673 - }, - { - "epoch": 0.5121580547112462, - "grad_norm": 2.860720157623291, - "learning_rate": 4.804657878971252e-06, - "loss": 0.4007391035556793, - "mean_token_accuracy": 0.8637382984161377, - "num_tokens": 6155310.0, - "step": 674 - }, - { - "epoch": 0.5129179331306991, - "grad_norm": 2.520282030105591, - "learning_rate": 4.803845459571014e-06, - "loss": 0.45798182487487793, - "mean_token_accuracy": 0.8270114660263062, - "num_tokens": 6160326.0, - "step": 675 - }, - { - "epoch": 0.513677811550152, - "grad_norm": 2.7290921211242676, - "learning_rate": 4.803031423243349e-06, - "loss": 0.5745848417282104, - "mean_token_accuracy": 0.8401234745979309, - "num_tokens": 6165709.0, - "step": 676 - }, - { - "epoch": 0.5144376899696048, - "grad_norm": 1.6678650379180908, - "learning_rate": 4.802215770559578e-06, - "loss": 0.5257721543312073, - "mean_token_accuracy": 0.8241991996765137, - "num_tokens": 6177875.0, - "step": 677 - }, - { - "epoch": 0.5151975683890577, - "grad_norm": 2.1720468997955322, - "learning_rate": 4.801398502092156e-06, - "loss": 0.45342206954956055, - "mean_token_accuracy": 0.8463799953460693, - "num_tokens": 6185415.0, - "step": 678 - }, - { - "epoch": 0.5159574468085106, - "grad_norm": 2.282259702682495, - "learning_rate": 4.800579618414677e-06, - "loss": 0.4864169955253601, - "mean_token_accuracy": 0.8300632238388062, - "num_tokens": 6191832.0, - "step": 679 - }, - { - "epoch": 0.5167173252279635, - "grad_norm": 2.0092248916625977, - "learning_rate": 4.799759120101861e-06, - "loss": 0.5781463980674744, - "mean_token_accuracy": 0.8267031908035278, - "num_tokens": 6199440.0, - "step": 680 - }, - { - "epoch": 0.5174772036474165, - "grad_norm": 1.396580696105957, - "learning_rate": 4.798937007729568e-06, - "loss": 0.49689239263534546, - "mean_token_accuracy": 0.8257499933242798, - "num_tokens": 6213840.0, - "step": 681 - }, - { - "epoch": 0.5182370820668692, - "grad_norm": 1.9060769081115723, - "learning_rate": 4.798113281874788e-06, - "loss": 0.48969539999961853, - "mean_token_accuracy": 0.8171790838241577, - "num_tokens": 6223006.0, - "step": 682 - }, - { - "epoch": 0.5189969604863222, - "grad_norm": 1.6255282163619995, - "learning_rate": 4.797287943115642e-06, - "loss": 0.5532330870628357, - "mean_token_accuracy": 0.8173393607139587, - "num_tokens": 6234857.0, - "step": 683 - }, - { - "epoch": 0.5197568389057751, - "grad_norm": 1.6923905611038208, - "learning_rate": 4.796460992031386e-06, - "loss": 0.4880887269973755, - "mean_token_accuracy": 0.834983229637146, - "num_tokens": 6245252.0, - "step": 684 - }, - { - "epoch": 0.520516717325228, - "grad_norm": 2.13161301612854, - "learning_rate": 4.7956324292024045e-06, - "loss": 0.5687593817710876, - "mean_token_accuracy": 0.7996571063995361, - "num_tokens": 6253726.0, - "step": 685 - }, - { - "epoch": 0.5212765957446809, - "grad_norm": 2.509375810623169, - "learning_rate": 4.794802255210217e-06, - "loss": 0.5396929979324341, - "mean_token_accuracy": 0.8007107973098755, - "num_tokens": 6259238.0, - "step": 686 - }, - { - "epoch": 0.5220364741641338, - "grad_norm": 2.393710136413574, - "learning_rate": 4.793970470637469e-06, - "loss": 0.6165191531181335, - "mean_token_accuracy": 0.7891418933868408, - "num_tokens": 6266325.0, - "step": 687 - }, - { - "epoch": 0.5227963525835866, - "grad_norm": 1.511647343635559, - "learning_rate": 4.7931370760679415e-06, - "loss": 0.4773876965045929, - "mean_token_accuracy": 0.8381044864654541, - "num_tokens": 6277447.0, - "step": 688 - }, - { - "epoch": 0.5235562310030395, - "grad_norm": 2.206587314605713, - "learning_rate": 4.792302072086542e-06, - "loss": 0.5482058525085449, - "mean_token_accuracy": 0.8239108920097351, - "num_tokens": 6285163.0, - "step": 689 - }, - { - "epoch": 0.5243161094224924, - "grad_norm": 3.018146514892578, - "learning_rate": 4.7914654592793065e-06, - "loss": 0.4880615472793579, - "mean_token_accuracy": 0.8361308574676514, - "num_tokens": 6289386.0, - "step": 690 - }, - { - "epoch": 0.5250759878419453, - "grad_norm": 1.6469231843948364, - "learning_rate": 4.790627238233405e-06, - "loss": 0.4164774715900421, - "mean_token_accuracy": 0.8496290445327759, - "num_tokens": 6298915.0, - "step": 691 - }, - { - "epoch": 0.5258358662613982, - "grad_norm": 2.352505922317505, - "learning_rate": 4.789787409537131e-06, - "loss": 0.5366303324699402, - "mean_token_accuracy": 0.8350417613983154, - "num_tokens": 6306130.0, - "step": 692 - }, - { - "epoch": 0.526595744680851, - "grad_norm": 1.7463021278381348, - "learning_rate": 4.7889459737799105e-06, - "loss": 0.4389137923717499, - "mean_token_accuracy": 0.8463300466537476, - "num_tokens": 6315503.0, - "step": 693 - }, - { - "epoch": 0.5273556231003039, - "grad_norm": 2.257706642150879, - "learning_rate": 4.788102931552294e-06, - "loss": 0.5309344530105591, - "mean_token_accuracy": 0.8164352178573608, - "num_tokens": 6321852.0, - "step": 694 - }, - { - "epoch": 0.5281155015197568, - "grad_norm": 2.392732620239258, - "learning_rate": 4.787258283445962e-06, - "loss": 0.3956204056739807, - "mean_token_accuracy": 0.8671456575393677, - "num_tokens": 6327380.0, - "step": 695 - }, - { - "epoch": 0.5288753799392097, - "grad_norm": 2.210514545440674, - "learning_rate": 4.786412030053721e-06, - "loss": 0.4842875003814697, - "mean_token_accuracy": 0.8508446216583252, - "num_tokens": 6334898.0, - "step": 696 - }, - { - "epoch": 0.5296352583586627, - "grad_norm": 1.8678946495056152, - "learning_rate": 4.785564171969503e-06, - "loss": 0.47399595379829407, - "mean_token_accuracy": 0.8514996767044067, - "num_tokens": 6346374.0, - "step": 697 - }, - { - "epoch": 0.5303951367781155, - "grad_norm": 2.604079484939575, - "learning_rate": 4.784714709788368e-06, - "loss": 0.5950228571891785, - "mean_token_accuracy": 0.7983481884002686, - "num_tokens": 6351648.0, - "step": 698 - }, - { - "epoch": 0.5311550151975684, - "grad_norm": 1.662381649017334, - "learning_rate": 4.783863644106502e-06, - "loss": 0.41616758704185486, - "mean_token_accuracy": 0.8554803133010864, - "num_tokens": 6360506.0, - "step": 699 - }, - { - "epoch": 0.5319148936170213, - "grad_norm": 1.6300342082977295, - "learning_rate": 4.783010975521216e-06, - "loss": 0.43029269576072693, - "mean_token_accuracy": 0.8443028926849365, - "num_tokens": 6370675.0, - "step": 700 - }, - { - "epoch": 0.5326747720364742, - "grad_norm": 1.731873869895935, - "learning_rate": 4.782156704630944e-06, - "loss": 0.4383814334869385, - "mean_token_accuracy": 0.8443183898925781, - "num_tokens": 6381803.0, - "step": 701 - }, - { - "epoch": 0.5334346504559271, - "grad_norm": 3.1788413524627686, - "learning_rate": 4.7813008320352475e-06, - "loss": 0.32194480299949646, - "mean_token_accuracy": 0.8870962858200073, - "num_tokens": 6389263.0, - "step": 702 - }, - { - "epoch": 0.53419452887538, - "grad_norm": 2.099513530731201, - "learning_rate": 4.78044335833481e-06, - "loss": 0.36962923407554626, - "mean_token_accuracy": 0.8661133646965027, - "num_tokens": 6395589.0, - "step": 703 - }, - { - "epoch": 0.5349544072948328, - "grad_norm": 1.4859435558319092, - "learning_rate": 4.77958428413144e-06, - "loss": 0.4619954824447632, - "mean_token_accuracy": 0.8438555002212524, - "num_tokens": 6407470.0, - "step": 704 - }, - { - "epoch": 0.5357142857142857, - "grad_norm": 1.2561073303222656, - "learning_rate": 4.7787236100280685e-06, - "loss": 0.3770977258682251, - "mean_token_accuracy": 0.8515733480453491, - "num_tokens": 6422888.0, - "step": 705 - }, - { - "epoch": 0.5364741641337386, - "grad_norm": 1.4455817937850952, - "learning_rate": 4.777861336628751e-06, - "loss": 0.46481069922447205, - "mean_token_accuracy": 0.8502002954483032, - "num_tokens": 6441266.0, - "step": 706 - }, - { - "epoch": 0.5372340425531915, - "grad_norm": 1.1387295722961426, - "learning_rate": 4.7769974645386616e-06, - "loss": 0.36964765191078186, - "mean_token_accuracy": 0.8719524145126343, - "num_tokens": 6463686.0, - "step": 707 - }, - { - "epoch": 0.5379939209726444, - "grad_norm": 1.7179663181304932, - "learning_rate": 4.776131994364102e-06, - "loss": 0.4231719970703125, - "mean_token_accuracy": 0.8416585922241211, - "num_tokens": 6472956.0, - "step": 708 - }, - { - "epoch": 0.5387537993920972, - "grad_norm": 1.6328502893447876, - "learning_rate": 4.775264926712489e-06, - "loss": 0.5836569666862488, - "mean_token_accuracy": 0.8039724230766296, - "num_tokens": 6485773.0, - "step": 709 - }, - { - "epoch": 0.5395136778115501, - "grad_norm": 1.8515360355377197, - "learning_rate": 4.774396262192368e-06, - "loss": 0.5477553009986877, - "mean_token_accuracy": 0.8136521577835083, - "num_tokens": 6496379.0, - "step": 710 - }, - { - "epoch": 0.540273556231003, - "grad_norm": 1.741858959197998, - "learning_rate": 4.7735260014133986e-06, - "loss": 0.4663267731666565, - "mean_token_accuracy": 0.8473691940307617, - "num_tokens": 6507652.0, - "step": 711 - }, - { - "epoch": 0.541033434650456, - "grad_norm": 1.7516659498214722, - "learning_rate": 4.772654144986364e-06, - "loss": 0.374914288520813, - "mean_token_accuracy": 0.8600220680236816, - "num_tokens": 6519030.0, - "step": 712 - }, - { - "epoch": 0.5417933130699089, - "grad_norm": 2.662343978881836, - "learning_rate": 4.7717806935231665e-06, - "loss": 0.4206875264644623, - "mean_token_accuracy": 0.8544126749038696, - "num_tokens": 6523669.0, - "step": 713 - }, - { - "epoch": 0.5425531914893617, - "grad_norm": 1.4088834524154663, - "learning_rate": 4.770905647636828e-06, - "loss": 0.5824331045150757, - "mean_token_accuracy": 0.7857901453971863, - "num_tokens": 6540560.0, - "step": 714 - }, - { - "epoch": 0.5433130699088146, - "grad_norm": 2.173656940460205, - "learning_rate": 4.77002900794149e-06, - "loss": 0.555023729801178, - "mean_token_accuracy": 0.8067290782928467, - "num_tokens": 6548946.0, - "step": 715 - }, - { - "epoch": 0.5440729483282675, - "grad_norm": 2.121018648147583, - "learning_rate": 4.769150775052411e-06, - "loss": 0.559730052947998, - "mean_token_accuracy": 0.8166372776031494, - "num_tokens": 6556065.0, - "step": 716 - }, - { - "epoch": 0.5448328267477204, - "grad_norm": 3.335866928100586, - "learning_rate": 4.768270949585968e-06, - "loss": 0.6442267894744873, - "mean_token_accuracy": 0.7858607769012451, - "num_tokens": 6560615.0, - "step": 717 - }, - { - "epoch": 0.5455927051671733, - "grad_norm": 2.3813695907592773, - "learning_rate": 4.767389532159659e-06, - "loss": 0.4027421474456787, - "mean_token_accuracy": 0.8635619282722473, - "num_tokens": 6565841.0, - "step": 718 - }, - { - "epoch": 0.5463525835866262, - "grad_norm": 2.0657708644866943, - "learning_rate": 4.766506523392095e-06, - "loss": 0.38899827003479004, - "mean_token_accuracy": 0.8660480380058289, - "num_tokens": 6572362.0, - "step": 719 - }, - { - "epoch": 0.547112462006079, - "grad_norm": 1.093705415725708, - "learning_rate": 4.765621923903005e-06, - "loss": 0.45967352390289307, - "mean_token_accuracy": 0.8338102102279663, - "num_tokens": 6595998.0, - "step": 720 - }, - { - "epoch": 0.5478723404255319, - "grad_norm": 2.942065954208374, - "learning_rate": 4.764735734313236e-06, - "loss": 0.42910510301589966, - "mean_token_accuracy": 0.8406122922897339, - "num_tokens": 6601075.0, - "step": 721 - }, - { - "epoch": 0.5486322188449848, - "grad_norm": 2.049011707305908, - "learning_rate": 4.763847955244749e-06, - "loss": 0.5584231615066528, - "mean_token_accuracy": 0.8171684741973877, - "num_tokens": 6609310.0, - "step": 722 - }, - { - "epoch": 0.5493920972644377, - "grad_norm": 2.485543966293335, - "learning_rate": 4.762958587320623e-06, - "loss": 0.5396170020103455, - "mean_token_accuracy": 0.8158525824546814, - "num_tokens": 6616185.0, - "step": 723 - }, - { - "epoch": 0.5501519756838906, - "grad_norm": 1.87015962600708, - "learning_rate": 4.762067631165049e-06, - "loss": 0.49739527702331543, - "mean_token_accuracy": 0.8303765654563904, - "num_tokens": 6625629.0, - "step": 724 - }, - { - "epoch": 0.5509118541033434, - "grad_norm": 4.239654541015625, - "learning_rate": 4.761175087403336e-06, - "loss": 0.6029239296913147, - "mean_token_accuracy": 0.8123486042022705, - "num_tokens": 6629194.0, - "step": 725 - }, - { - "epoch": 0.5516717325227963, - "grad_norm": 2.0134730339050293, - "learning_rate": 4.760280956661904e-06, - "loss": 0.4777873754501343, - "mean_token_accuracy": 0.8283513784408569, - "num_tokens": 6636929.0, - "step": 726 - }, - { - "epoch": 0.5524316109422492, - "grad_norm": 1.991780400276184, - "learning_rate": 4.75938523956829e-06, - "loss": 0.4631248116493225, - "mean_token_accuracy": 0.8275107741355896, - "num_tokens": 6645135.0, - "step": 727 - }, - { - "epoch": 0.5531914893617021, - "grad_norm": 1.423792839050293, - "learning_rate": 4.75848793675114e-06, - "loss": 0.49630722403526306, - "mean_token_accuracy": 0.8388000130653381, - "num_tokens": 6662690.0, - "step": 728 - }, - { - "epoch": 0.5539513677811551, - "grad_norm": 2.345294952392578, - "learning_rate": 4.757589048840219e-06, - "loss": 0.37830638885498047, - "mean_token_accuracy": 0.8782080411911011, - "num_tokens": 6667285.0, - "step": 729 - }, - { - "epoch": 0.5547112462006079, - "grad_norm": 2.7452144622802734, - "learning_rate": 4.756688576466398e-06, - "loss": 0.51595538854599, - "mean_token_accuracy": 0.8441770672798157, - "num_tokens": 6672324.0, - "step": 730 - }, - { - "epoch": 0.5554711246200608, - "grad_norm": 1.5247859954833984, - "learning_rate": 4.755786520261666e-06, - "loss": 0.48365193605422974, - "mean_token_accuracy": 0.8276445269584656, - "num_tokens": 6685296.0, - "step": 731 - }, - { - "epoch": 0.5562310030395137, - "grad_norm": 1.4018276929855347, - "learning_rate": 4.75488288085912e-06, - "loss": 0.3876481354236603, - "mean_token_accuracy": 0.8612343072891235, - "num_tokens": 6697515.0, - "step": 732 - }, - { - "epoch": 0.5569908814589666, - "grad_norm": 2.9570324420928955, - "learning_rate": 4.753977658892967e-06, - "loss": 0.5468149185180664, - "mean_token_accuracy": 0.8054271340370178, - "num_tokens": 6702194.0, - "step": 733 - }, - { - "epoch": 0.5577507598784195, - "grad_norm": 1.9282715320587158, - "learning_rate": 4.753070854998529e-06, - "loss": 0.4758574962615967, - "mean_token_accuracy": 0.8379775285720825, - "num_tokens": 6709938.0, - "step": 734 - }, - { - "epoch": 0.5585106382978723, - "grad_norm": 1.981264591217041, - "learning_rate": 4.752162469812234e-06, - "loss": 0.48461222648620605, - "mean_token_accuracy": 0.833509087562561, - "num_tokens": 6718125.0, - "step": 735 - }, - { - "epoch": 0.5592705167173252, - "grad_norm": 1.1643427610397339, - "learning_rate": 4.751252503971624e-06, - "loss": 0.410121887922287, - "mean_token_accuracy": 0.8221402764320374, - "num_tokens": 6735125.0, - "step": 736 - }, - { - "epoch": 0.5600303951367781, - "grad_norm": 1.786566972732544, - "learning_rate": 4.750340958115346e-06, - "loss": 0.5964341163635254, - "mean_token_accuracy": 0.8038164377212524, - "num_tokens": 6747369.0, - "step": 737 - }, - { - "epoch": 0.560790273556231, - "grad_norm": 1.7256991863250732, - "learning_rate": 4.749427832883158e-06, - "loss": 0.48737066984176636, - "mean_token_accuracy": 0.830894947052002, - "num_tokens": 6758115.0, - "step": 738 - }, - { - "epoch": 0.5615501519756839, - "grad_norm": 1.997747540473938, - "learning_rate": 4.748513128915928e-06, - "loss": 0.5238886475563049, - "mean_token_accuracy": 0.8066858053207397, - "num_tokens": 6766111.0, - "step": 739 - }, - { - "epoch": 0.5623100303951368, - "grad_norm": 2.127016305923462, - "learning_rate": 4.747596846855629e-06, - "loss": 0.5045586228370667, - "mean_token_accuracy": 0.821424126625061, - "num_tokens": 6772893.0, - "step": 740 - }, - { - "epoch": 0.5630699088145896, - "grad_norm": 1.7664796113967896, - "learning_rate": 4.7466789873453446e-06, - "loss": 0.42954835295677185, - "mean_token_accuracy": 0.8533384799957275, - "num_tokens": 6785133.0, - "step": 741 - }, - { - "epoch": 0.5638297872340425, - "grad_norm": 1.4987404346466064, - "learning_rate": 4.7457595510292615e-06, - "loss": 0.5378558039665222, - "mean_token_accuracy": 0.8184819221496582, - "num_tokens": 6799563.0, - "step": 742 - }, - { - "epoch": 0.5645896656534954, - "grad_norm": 1.4444655179977417, - "learning_rate": 4.744838538552678e-06, - "loss": 0.42193782329559326, - "mean_token_accuracy": 0.837514340877533, - "num_tokens": 6812470.0, - "step": 743 - }, - { - "epoch": 0.5653495440729484, - "grad_norm": 3.867751121520996, - "learning_rate": 4.7439159505619946e-06, - "loss": 0.4457814693450928, - "mean_token_accuracy": 0.8630104660987854, - "num_tokens": 6815652.0, - "step": 744 - }, - { - "epoch": 0.5661094224924013, - "grad_norm": 2.1250710487365723, - "learning_rate": 4.74299178770472e-06, - "loss": 0.5638922452926636, - "mean_token_accuracy": 0.7969781160354614, - "num_tokens": 6824566.0, - "step": 745 - }, - { - "epoch": 0.5668693009118541, - "grad_norm": 2.547072410583496, - "learning_rate": 4.742066050629465e-06, - "loss": 0.5516207814216614, - "mean_token_accuracy": 0.8160669803619385, - "num_tokens": 6830589.0, - "step": 746 - }, - { - "epoch": 0.567629179331307, - "grad_norm": 1.2975233793258667, - "learning_rate": 4.741138739985951e-06, - "loss": 0.3823344111442566, - "mean_token_accuracy": 0.8668368458747864, - "num_tokens": 6842707.0, - "step": 747 - }, - { - "epoch": 0.5683890577507599, - "grad_norm": 1.3410450220108032, - "learning_rate": 4.740209856424998e-06, - "loss": 0.5148671269416809, - "mean_token_accuracy": 0.8188045024871826, - "num_tokens": 6857624.0, - "step": 748 - }, - { - "epoch": 0.5691489361702128, - "grad_norm": 1.219467282295227, - "learning_rate": 4.7392794005985324e-06, - "loss": 0.3998957872390747, - "mean_token_accuracy": 0.855175256729126, - "num_tokens": 6875064.0, - "step": 749 - }, - { - "epoch": 0.5699088145896657, - "grad_norm": 1.3530343770980835, - "learning_rate": 4.738347373159585e-06, - "loss": 0.5359633564949036, - "mean_token_accuracy": 0.8178457021713257, - "num_tokens": 6890911.0, - "step": 750 - }, - { - "epoch": 0.5706686930091185, - "grad_norm": 2.146988868713379, - "learning_rate": 4.737413774762287e-06, - "loss": 0.4460008144378662, - "mean_token_accuracy": 0.8172903060913086, - "num_tokens": 6896959.0, - "step": 751 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 1.456023097038269, - "learning_rate": 4.736478606061876e-06, - "loss": 0.43616920709609985, - "mean_token_accuracy": 0.8465108871459961, - "num_tokens": 6908904.0, - "step": 752 - }, - { - "epoch": 0.5721884498480243, - "grad_norm": 2.9696967601776123, - "learning_rate": 4.735541867714687e-06, - "loss": 0.43464532494544983, - "mean_token_accuracy": 0.8608652353286743, - "num_tokens": 6913026.0, - "step": 753 - }, - { - "epoch": 0.5729483282674772, - "grad_norm": 2.2990667819976807, - "learning_rate": 4.73460356037816e-06, - "loss": 0.6619116067886353, - "mean_token_accuracy": 0.7821142673492432, - "num_tokens": 6920588.0, - "step": 754 - }, - { - "epoch": 0.5737082066869301, - "grad_norm": 2.054746389389038, - "learning_rate": 4.733663684710835e-06, - "loss": 0.5304250717163086, - "mean_token_accuracy": 0.8265531063079834, - "num_tokens": 6928910.0, - "step": 755 - }, - { - "epoch": 0.574468085106383, - "grad_norm": 2.0050594806671143, - "learning_rate": 4.732722241372354e-06, - "loss": 0.6393026113510132, - "mean_token_accuracy": 0.796819806098938, - "num_tokens": 6940217.0, - "step": 756 - }, - { - "epoch": 0.5752279635258358, - "grad_norm": 1.4285320043563843, - "learning_rate": 4.731779231023456e-06, - "loss": 0.5432837009429932, - "mean_token_accuracy": 0.8104778528213501, - "num_tokens": 6959101.0, - "step": 757 - }, - { - "epoch": 0.5759878419452887, - "grad_norm": 2.3941943645477295, - "learning_rate": 4.730834654325984e-06, - "loss": 0.46550673246383667, - "mean_token_accuracy": 0.8444503545761108, - "num_tokens": 6965036.0, - "step": 758 - }, - { - "epoch": 0.5767477203647416, - "grad_norm": 2.3850574493408203, - "learning_rate": 4.729888511942877e-06, - "loss": 0.4916389584541321, - "mean_token_accuracy": 0.8228527307510376, - "num_tokens": 6971184.0, - "step": 759 - }, - { - "epoch": 0.5775075987841946, - "grad_norm": 1.627480149269104, - "learning_rate": 4.728940804538176e-06, - "loss": 0.5863215923309326, - "mean_token_accuracy": 0.7995302677154541, - "num_tokens": 6982569.0, - "step": 760 - }, - { - "epoch": 0.5782674772036475, - "grad_norm": 1.1723195314407349, - "learning_rate": 4.727991532777016e-06, - "loss": 0.36908864974975586, - "mean_token_accuracy": 0.8355655670166016, - "num_tokens": 6998659.0, - "step": 761 - }, - { - "epoch": 0.5790273556231003, - "grad_norm": 1.5324925184249878, - "learning_rate": 4.727040697325634e-06, - "loss": 0.557658851146698, - "mean_token_accuracy": 0.8141458034515381, - "num_tokens": 7012969.0, - "step": 762 - }, - { - "epoch": 0.5797872340425532, - "grad_norm": 2.4106390476226807, - "learning_rate": 4.726088298851362e-06, - "loss": 0.5004243850708008, - "mean_token_accuracy": 0.8376860618591309, - "num_tokens": 7018301.0, - "step": 763 - }, - { - "epoch": 0.5805471124620061, - "grad_norm": 2.2594921588897705, - "learning_rate": 4.725134338022631e-06, - "loss": 0.6067016124725342, - "mean_token_accuracy": 0.8100241422653198, - "num_tokens": 7025201.0, - "step": 764 - }, - { - "epoch": 0.581306990881459, - "grad_norm": 1.4649826288223267, - "learning_rate": 4.724178815508967e-06, - "loss": 0.36200693249702454, - "mean_token_accuracy": 0.8621826171875, - "num_tokens": 7035112.0, - "step": 765 - }, - { - "epoch": 0.5820668693009119, - "grad_norm": 2.3634560108184814, - "learning_rate": 4.723221731980993e-06, - "loss": 0.41862213611602783, - "mean_token_accuracy": 0.8541463613510132, - "num_tokens": 7040339.0, - "step": 766 - }, - { - "epoch": 0.5828267477203647, - "grad_norm": 2.7798104286193848, - "learning_rate": 4.722263088110426e-06, - "loss": 0.4647108018398285, - "mean_token_accuracy": 0.8505672216415405, - "num_tokens": 7044880.0, - "step": 767 - }, - { - "epoch": 0.5835866261398176, - "grad_norm": 2.070528507232666, - "learning_rate": 4.721302884570079e-06, - "loss": 0.5147565007209778, - "mean_token_accuracy": 0.8113877773284912, - "num_tokens": 7052433.0, - "step": 768 - }, - { - "epoch": 0.5843465045592705, - "grad_norm": 2.1953284740448, - "learning_rate": 4.720341122033862e-06, - "loss": 0.5075466632843018, - "mean_token_accuracy": 0.8474211096763611, - "num_tokens": 7058686.0, - "step": 769 - }, - { - "epoch": 0.5851063829787234, - "grad_norm": 1.9287755489349365, - "learning_rate": 4.719377801176774e-06, - "loss": 0.5382202863693237, - "mean_token_accuracy": 0.8148090243339539, - "num_tokens": 7067538.0, - "step": 770 - }, - { - "epoch": 0.5858662613981763, - "grad_norm": 1.5574456453323364, - "learning_rate": 4.718412922674913e-06, - "loss": 0.43406790494918823, - "mean_token_accuracy": 0.8477081060409546, - "num_tokens": 7077853.0, - "step": 771 - }, - { - "epoch": 0.5866261398176292, - "grad_norm": 1.5490336418151855, - "learning_rate": 4.717446487205466e-06, - "loss": 0.43164271116256714, - "mean_token_accuracy": 0.8504570126533508, - "num_tokens": 7091728.0, - "step": 772 - }, - { - "epoch": 0.587386018237082, - "grad_norm": 1.6945984363555908, - "learning_rate": 4.716478495446717e-06, - "loss": 0.5153743624687195, - "mean_token_accuracy": 0.8213579058647156, - "num_tokens": 7108680.0, - "step": 773 - }, - { - "epoch": 0.5881458966565349, - "grad_norm": 2.2633883953094482, - "learning_rate": 4.715508948078037e-06, - "loss": 0.45254790782928467, - "mean_token_accuracy": 0.8392219543457031, - "num_tokens": 7115546.0, - "step": 774 - }, - { - "epoch": 0.5889057750759878, - "grad_norm": 1.5731090307235718, - "learning_rate": 4.714537845779894e-06, - "loss": 0.38678881525993347, - "mean_token_accuracy": 0.8800252676010132, - "num_tokens": 7126360.0, - "step": 775 - }, - { - "epoch": 0.5896656534954408, - "grad_norm": 2.4873392581939697, - "learning_rate": 4.7135651892338445e-06, - "loss": 0.5190927386283875, - "mean_token_accuracy": 0.8145407438278198, - "num_tokens": 7135705.0, - "step": 776 - }, - { - "epoch": 0.5904255319148937, - "grad_norm": 1.2931004762649536, - "learning_rate": 4.712590979122534e-06, - "loss": 0.3686544895172119, - "mean_token_accuracy": 0.8720537424087524, - "num_tokens": 7150688.0, - "step": 777 - }, - { - "epoch": 0.5911854103343465, - "grad_norm": 1.6353671550750732, - "learning_rate": 4.7116152161297045e-06, - "loss": 0.49065062403678894, - "mean_token_accuracy": 0.8203760385513306, - "num_tokens": 7161040.0, - "step": 778 - }, - { - "epoch": 0.5919452887537994, - "grad_norm": 1.2345483303070068, - "learning_rate": 4.710637900940181e-06, - "loss": 0.4004976451396942, - "mean_token_accuracy": 0.8302007913589478, - "num_tokens": 7178074.0, - "step": 779 - }, - { - "epoch": 0.5927051671732523, - "grad_norm": 2.2506837844848633, - "learning_rate": 4.7096590342398825e-06, - "loss": 0.45142874121665955, - "mean_token_accuracy": 0.8481036424636841, - "num_tokens": 7184153.0, - "step": 780 - }, - { - "epoch": 0.5934650455927052, - "grad_norm": 1.420479416847229, - "learning_rate": 4.708678616715815e-06, - "loss": 0.4802100360393524, - "mean_token_accuracy": 0.8586992025375366, - "num_tokens": 7202810.0, - "step": 781 - }, - { - "epoch": 0.5942249240121581, - "grad_norm": 3.457632303237915, - "learning_rate": 4.707696649056073e-06, - "loss": 0.5265094041824341, - "mean_token_accuracy": 0.8260114192962646, - "num_tokens": 7206396.0, - "step": 782 - }, - { - "epoch": 0.5949848024316109, - "grad_norm": 1.1592093706130981, - "learning_rate": 4.706713131949839e-06, - "loss": 0.3708173632621765, - "mean_token_accuracy": 0.8476542234420776, - "num_tokens": 7225034.0, - "step": 783 - }, - { - "epoch": 0.5957446808510638, - "grad_norm": 1.6761400699615479, - "learning_rate": 4.705728066087384e-06, - "loss": 0.4137252867221832, - "mean_token_accuracy": 0.8462049961090088, - "num_tokens": 7237101.0, - "step": 784 - }, - { - "epoch": 0.5965045592705167, - "grad_norm": 2.320185422897339, - "learning_rate": 4.704741452160064e-06, - "loss": 0.5157154202461243, - "mean_token_accuracy": 0.8391785621643066, - "num_tokens": 7243826.0, - "step": 785 - }, - { - "epoch": 0.5972644376899696, - "grad_norm": 2.079423427581787, - "learning_rate": 4.703753290860323e-06, - "loss": 0.4734993278980255, - "mean_token_accuracy": 0.8353281021118164, - "num_tokens": 7250175.0, - "step": 786 - }, - { - "epoch": 0.5980243161094225, - "grad_norm": 1.8215159177780151, - "learning_rate": 4.702763582881692e-06, - "loss": 0.520193338394165, - "mean_token_accuracy": 0.844062864780426, - "num_tokens": 7258868.0, - "step": 787 - }, - { - "epoch": 0.5987841945288754, - "grad_norm": 1.3823071718215942, - "learning_rate": 4.701772328918784e-06, - "loss": 0.4177844822406769, - "mean_token_accuracy": 0.8363165259361267, - "num_tokens": 7271744.0, - "step": 788 - }, - { - "epoch": 0.5995440729483282, - "grad_norm": 2.4749298095703125, - "learning_rate": 4.700779529667301e-06, - "loss": 0.5115069150924683, - "mean_token_accuracy": 0.8473520278930664, - "num_tokens": 7277040.0, - "step": 789 - }, - { - "epoch": 0.6003039513677811, - "grad_norm": 1.7072296142578125, - "learning_rate": 4.699785185824026e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8161447048187256, - "num_tokens": 7288288.0, - "step": 790 - }, - { - "epoch": 0.601063829787234, - "grad_norm": 1.6479384899139404, - "learning_rate": 4.69878929808683e-06, - "loss": 0.4445168972015381, - "mean_token_accuracy": 0.8381255865097046, - "num_tokens": 7298640.0, - "step": 791 - }, - { - "epoch": 0.601823708206687, - "grad_norm": 1.9095896482467651, - "learning_rate": 4.6977918671546635e-06, - "loss": 0.5841238498687744, - "mean_token_accuracy": 0.7971454858779907, - "num_tokens": 7307220.0, - "step": 792 - }, - { - "epoch": 0.6025835866261399, - "grad_norm": 1.9614146947860718, - "learning_rate": 4.696792893727562e-06, - "loss": 0.34684082865715027, - "mean_token_accuracy": 0.8739526271820068, - "num_tokens": 7313875.0, - "step": 793 - }, - { - "epoch": 0.6033434650455927, - "grad_norm": 2.015570640563965, - "learning_rate": 4.695792378506645e-06, - "loss": 0.42779117822647095, - "mean_token_accuracy": 0.8625012636184692, - "num_tokens": 7321439.0, - "step": 794 - }, - { - "epoch": 0.6041033434650456, - "grad_norm": 2.8581228256225586, - "learning_rate": 4.694790322194111e-06, - "loss": 0.6519991159439087, - "mean_token_accuracy": 0.7629562616348267, - "num_tokens": 7326916.0, - "step": 795 - }, - { - "epoch": 0.6048632218844985, - "grad_norm": 2.482715368270874, - "learning_rate": 4.693786725493242e-06, - "loss": 0.532963216304779, - "mean_token_accuracy": 0.832184910774231, - "num_tokens": 7333311.0, - "step": 796 - }, - { - "epoch": 0.6056231003039514, - "grad_norm": 1.6076741218566895, - "learning_rate": 4.692781589108402e-06, - "loss": 0.43381205201148987, - "mean_token_accuracy": 0.8402494192123413, - "num_tokens": 7343731.0, - "step": 797 - }, - { - "epoch": 0.6063829787234043, - "grad_norm": 2.2133216857910156, - "learning_rate": 4.691774913745033e-06, - "loss": 0.4380851089954376, - "mean_token_accuracy": 0.8600908517837524, - "num_tokens": 7350224.0, - "step": 798 - }, - { - "epoch": 0.6071428571428571, - "grad_norm": 2.046280860900879, - "learning_rate": 4.690766700109659e-06, - "loss": 0.3821919560432434, - "mean_token_accuracy": 0.8691814541816711, - "num_tokens": 7356717.0, - "step": 799 - }, - { - "epoch": 0.60790273556231, - "grad_norm": 1.8482693433761597, - "learning_rate": 4.689756948909884e-06, - "loss": 0.5217651128768921, - "mean_token_accuracy": 0.803473711013794, - "num_tokens": 7365806.0, - "step": 800 - }, - { - "epoch": 0.6086626139817629, - "grad_norm": 2.192134141921997, - "learning_rate": 4.688745660854388e-06, - "loss": 0.573980987071991, - "mean_token_accuracy": 0.8198676109313965, - "num_tokens": 7380281.0, - "step": 801 - }, - { - "epoch": 0.6094224924012158, - "grad_norm": 2.363626718521118, - "learning_rate": 4.687732836652935e-06, - "loss": 0.5204599499702454, - "mean_token_accuracy": 0.8373252153396606, - "num_tokens": 7386938.0, - "step": 802 - }, - { - "epoch": 0.6101823708206687, - "grad_norm": 1.9320523738861084, - "learning_rate": 4.686718477016361e-06, - "loss": 0.47316622734069824, - "mean_token_accuracy": 0.830596923828125, - "num_tokens": 7395069.0, - "step": 803 - }, - { - "epoch": 0.6109422492401215, - "grad_norm": 2.6573057174682617, - "learning_rate": 4.6857025826565845e-06, - "loss": 0.5495861768722534, - "mean_token_accuracy": 0.8187421560287476, - "num_tokens": 7400563.0, - "step": 804 - }, - { - "epoch": 0.6117021276595744, - "grad_norm": 2.0893123149871826, - "learning_rate": 4.684685154286599e-06, - "loss": 0.5362675786018372, - "mean_token_accuracy": 0.8394701480865479, - "num_tokens": 7406973.0, - "step": 805 - }, - { - "epoch": 0.6124620060790273, - "grad_norm": 2.455130100250244, - "learning_rate": 4.683666192620474e-06, - "loss": 0.5405995845794678, - "mean_token_accuracy": 0.8079100847244263, - "num_tokens": 7412931.0, - "step": 806 - }, - { - "epoch": 0.6132218844984803, - "grad_norm": 2.311915636062622, - "learning_rate": 4.682645698373357e-06, - "loss": 0.5395106077194214, - "mean_token_accuracy": 0.8156260251998901, - "num_tokens": 7419699.0, - "step": 807 - }, - { - "epoch": 0.6139817629179332, - "grad_norm": 1.686838984489441, - "learning_rate": 4.6816236722614694e-06, - "loss": 0.6034521460533142, - "mean_token_accuracy": 0.7855954170227051, - "num_tokens": 7431899.0, - "step": 808 - }, - { - "epoch": 0.6147416413373861, - "grad_norm": 1.682759165763855, - "learning_rate": 4.680600115002109e-06, - "loss": 0.48593831062316895, - "mean_token_accuracy": 0.8229435682296753, - "num_tokens": 7443187.0, - "step": 809 - }, - { - "epoch": 0.6155015197568389, - "grad_norm": 2.064589738845825, - "learning_rate": 4.679575027313649e-06, - "loss": 0.5098468661308289, - "mean_token_accuracy": 0.8234638571739197, - "num_tokens": 7450868.0, - "step": 810 - }, - { - "epoch": 0.6162613981762918, - "grad_norm": 2.2063486576080322, - "learning_rate": 4.6785484099155324e-06, - "loss": 0.5138497352600098, - "mean_token_accuracy": 0.8152111172676086, - "num_tokens": 7457176.0, - "step": 811 - }, - { - "epoch": 0.6170212765957447, - "grad_norm": 1.6258726119995117, - "learning_rate": 4.67752026352828e-06, - "loss": 0.4064181447029114, - "mean_token_accuracy": 0.8720619678497314, - "num_tokens": 7466557.0, - "step": 812 - }, - { - "epoch": 0.6177811550151976, - "grad_norm": 2.3309383392333984, - "learning_rate": 4.676490588873486e-06, - "loss": 0.5180112719535828, - "mean_token_accuracy": 0.8233879804611206, - "num_tokens": 7472650.0, - "step": 813 - }, - { - "epoch": 0.6185410334346505, - "grad_norm": 1.4545246362686157, - "learning_rate": 4.675459386673815e-06, - "loss": 0.37917959690093994, - "mean_token_accuracy": 0.8598103523254395, - "num_tokens": 7485171.0, - "step": 814 - }, - { - "epoch": 0.6193009118541033, - "grad_norm": 2.654231071472168, - "learning_rate": 4.674426657653003e-06, - "loss": 0.554074227809906, - "mean_token_accuracy": 0.8026446104049683, - "num_tokens": 7490787.0, - "step": 815 - }, - { - "epoch": 0.6200607902735562, - "grad_norm": 1.5543994903564453, - "learning_rate": 4.67339240253586e-06, - "loss": 0.6335440278053284, - "mean_token_accuracy": 0.783241868019104, - "num_tokens": 7505975.0, - "step": 816 - }, - { - "epoch": 0.6208206686930091, - "grad_norm": 2.079998016357422, - "learning_rate": 4.672356622048266e-06, - "loss": 0.5169394016265869, - "mean_token_accuracy": 0.8088761568069458, - "num_tokens": 7513470.0, - "step": 817 - }, - { - "epoch": 0.621580547112462, - "grad_norm": 1.5971896648406982, - "learning_rate": 4.671319316917172e-06, - "loss": 0.44588586688041687, - "mean_token_accuracy": 0.8518649339675903, - "num_tokens": 7524352.0, - "step": 818 - }, - { - "epoch": 0.6223404255319149, - "grad_norm": 2.477579116821289, - "learning_rate": 4.670280487870599e-06, - "loss": 0.5713893175125122, - "mean_token_accuracy": 0.8116940259933472, - "num_tokens": 7530359.0, - "step": 819 - }, - { - "epoch": 0.6231003039513677, - "grad_norm": 2.066211700439453, - "learning_rate": 4.669240135637635e-06, - "loss": 0.5295331478118896, - "mean_token_accuracy": 0.819536566734314, - "num_tokens": 7536963.0, - "step": 820 - }, - { - "epoch": 0.6238601823708206, - "grad_norm": 2.1217997074127197, - "learning_rate": 4.668198260948442e-06, - "loss": 0.6146406531333923, - "mean_token_accuracy": 0.7932635545730591, - "num_tokens": 7545800.0, - "step": 821 - }, - { - "epoch": 0.6246200607902735, - "grad_norm": 2.0173542499542236, - "learning_rate": 4.667154864534245e-06, - "loss": 0.6240535974502563, - "mean_token_accuracy": 0.7883644104003906, - "num_tokens": 7556165.0, - "step": 822 - }, - { - "epoch": 0.6253799392097265, - "grad_norm": 2.014526128768921, - "learning_rate": 4.666109947127343e-06, - "loss": 0.40367332100868225, - "mean_token_accuracy": 0.8653522729873657, - "num_tokens": 7562665.0, - "step": 823 - }, - { - "epoch": 0.6261398176291794, - "grad_norm": 2.5078861713409424, - "learning_rate": 4.665063509461098e-06, - "loss": 0.5903617739677429, - "mean_token_accuracy": 0.7902897596359253, - "num_tokens": 7568922.0, - "step": 824 - }, - { - "epoch": 0.6268996960486323, - "grad_norm": 2.454622745513916, - "learning_rate": 4.664015552269938e-06, - "loss": 0.5238361358642578, - "mean_token_accuracy": 0.838546872138977, - "num_tokens": 7575965.0, - "step": 825 - }, - { - "epoch": 0.6276595744680851, - "grad_norm": 2.920919418334961, - "learning_rate": 4.662966076289363e-06, - "loss": 0.5028782486915588, - "mean_token_accuracy": 0.8311152458190918, - "num_tokens": 7580193.0, - "step": 826 - }, - { - "epoch": 0.628419452887538, - "grad_norm": 1.545382022857666, - "learning_rate": 4.661915082255932e-06, - "loss": 0.4817378520965576, - "mean_token_accuracy": 0.8373227119445801, - "num_tokens": 7593024.0, - "step": 827 - }, - { - "epoch": 0.6291793313069909, - "grad_norm": 1.5152469873428345, - "learning_rate": 4.6608625709072766e-06, - "loss": 0.4693033695220947, - "mean_token_accuracy": 0.8150848150253296, - "num_tokens": 7606459.0, - "step": 828 - }, - { - "epoch": 0.6299392097264438, - "grad_norm": 2.1310224533081055, - "learning_rate": 4.659808542982089e-06, - "loss": 0.4653395414352417, - "mean_token_accuracy": 0.8286294341087341, - "num_tokens": 7613036.0, - "step": 829 - }, - { - "epoch": 0.6306990881458967, - "grad_norm": 2.1949679851531982, - "learning_rate": 4.658752999220125e-06, - "loss": 0.3698633909225464, - "mean_token_accuracy": 0.871590793132782, - "num_tokens": 7618527.0, - "step": 830 - }, - { - "epoch": 0.6314589665653495, - "grad_norm": 2.2770416736602783, - "learning_rate": 4.657695940362207e-06, - "loss": 0.5202419757843018, - "mean_token_accuracy": 0.817577600479126, - "num_tokens": 7624459.0, - "step": 831 - }, - { - "epoch": 0.6322188449848024, - "grad_norm": 1.402042269706726, - "learning_rate": 4.65663736715022e-06, - "loss": 0.51531583070755, - "mean_token_accuracy": 0.8228116631507874, - "num_tokens": 7639371.0, - "step": 832 - }, - { - "epoch": 0.6329787234042553, - "grad_norm": 3.3554883003234863, - "learning_rate": 4.65557728032711e-06, - "loss": 0.6771188378334045, - "mean_token_accuracy": 0.7880028486251831, - "num_tokens": 7643924.0, - "step": 833 - }, - { - "epoch": 0.6337386018237082, - "grad_norm": 2.081040143966675, - "learning_rate": 4.654515680636888e-06, - "loss": 0.5712796449661255, - "mean_token_accuracy": 0.8177868127822876, - "num_tokens": 7651881.0, - "step": 834 - }, - { - "epoch": 0.6344984802431611, - "grad_norm": 0.9128716588020325, - "learning_rate": 4.653452568824625e-06, - "loss": 0.3423936069011688, - "mean_token_accuracy": 0.8782886266708374, - "num_tokens": 7677829.0, - "step": 835 - }, - { - "epoch": 0.6352583586626139, - "grad_norm": 3.49015736579895, - "learning_rate": 4.652387945636454e-06, - "loss": 0.34657734632492065, - "mean_token_accuracy": 0.8770567178726196, - "num_tokens": 7680796.0, - "step": 836 - }, - { - "epoch": 0.6360182370820668, - "grad_norm": 2.026247501373291, - "learning_rate": 4.651321811819568e-06, - "loss": 0.5098431706428528, - "mean_token_accuracy": 0.8216961622238159, - "num_tokens": 7688746.0, - "step": 837 - }, - { - "epoch": 0.6367781155015197, - "grad_norm": 2.444343090057373, - "learning_rate": 4.650254168122222e-06, - "loss": 0.5490090250968933, - "mean_token_accuracy": 0.8092857599258423, - "num_tokens": 7695220.0, - "step": 838 - }, - { - "epoch": 0.6375379939209727, - "grad_norm": 2.0171122550964355, - "learning_rate": 4.649185015293728e-06, - "loss": 0.47221142053604126, - "mean_token_accuracy": 0.8514408469200134, - "num_tokens": 7702759.0, - "step": 839 - }, - { - "epoch": 0.6382978723404256, - "grad_norm": 1.9800984859466553, - "learning_rate": 4.64811435408446e-06, - "loss": 0.5238803625106812, - "mean_token_accuracy": 0.8479194641113281, - "num_tokens": 7714017.0, - "step": 840 - }, - { - "epoch": 0.6390577507598785, - "grad_norm": 3.0674357414245605, - "learning_rate": 4.647042185245848e-06, - "loss": 0.4668245315551758, - "mean_token_accuracy": 0.8381714820861816, - "num_tokens": 7717801.0, - "step": 841 - }, - { - "epoch": 0.6398176291793313, - "grad_norm": 1.5672820806503296, - "learning_rate": 4.645968509530381e-06, - "loss": 0.4428741931915283, - "mean_token_accuracy": 0.8416479825973511, - "num_tokens": 7728342.0, - "step": 842 - }, - { - "epoch": 0.6405775075987842, - "grad_norm": 2.3042354583740234, - "learning_rate": 4.644893327691608e-06, - "loss": 0.49937760829925537, - "mean_token_accuracy": 0.827070951461792, - "num_tokens": 7734576.0, - "step": 843 - }, - { - "epoch": 0.6413373860182371, - "grad_norm": 2.057772159576416, - "learning_rate": 4.6438166404841316e-06, - "loss": 0.5912986993789673, - "mean_token_accuracy": 0.805509090423584, - "num_tokens": 7742481.0, - "step": 844 - }, - { - "epoch": 0.64209726443769, - "grad_norm": 1.9688186645507812, - "learning_rate": 4.6427384486636115e-06, - "loss": 0.482401967048645, - "mean_token_accuracy": 0.8358086347579956, - "num_tokens": 7750002.0, - "step": 845 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 2.6852948665618896, - "learning_rate": 4.6416587529867665e-06, - "loss": 0.5479315519332886, - "mean_token_accuracy": 0.8091106414794922, - "num_tokens": 7755578.0, - "step": 846 - }, - { - "epoch": 0.6436170212765957, - "grad_norm": 2.0547337532043457, - "learning_rate": 4.640577554211366e-06, - "loss": 0.5327274203300476, - "mean_token_accuracy": 0.8280376195907593, - "num_tokens": 7763513.0, - "step": 847 - }, - { - "epoch": 0.6443768996960486, - "grad_norm": 2.0328633785247803, - "learning_rate": 4.63949485309624e-06, - "loss": 0.4814409613609314, - "mean_token_accuracy": 0.8527672290802002, - "num_tokens": 7771131.0, - "step": 848 - }, - { - "epoch": 0.6451367781155015, - "grad_norm": 1.5892863273620605, - "learning_rate": 4.638410650401267e-06, - "loss": 0.4492785334587097, - "mean_token_accuracy": 0.846997857093811, - "num_tokens": 7781572.0, - "step": 849 - }, - { - "epoch": 0.6458966565349544, - "grad_norm": 1.8295910358428955, - "learning_rate": 4.637324946887384e-06, - "loss": 0.37088239192962646, - "mean_token_accuracy": 0.8616628646850586, - "num_tokens": 7788604.0, - "step": 850 - }, - { - "epoch": 0.6466565349544073, - "grad_norm": 3.380040168762207, - "learning_rate": 4.636237743316578e-06, - "loss": 0.4737280607223511, - "mean_token_accuracy": 0.855940580368042, - "num_tokens": 7792504.0, - "step": 851 - }, - { - "epoch": 0.6474164133738601, - "grad_norm": 2.8790009021759033, - "learning_rate": 4.635149040451891e-06, - "loss": 0.39790448546409607, - "mean_token_accuracy": 0.8710698485374451, - "num_tokens": 7796333.0, - "step": 852 - }, - { - "epoch": 0.648176291793313, - "grad_norm": 1.914914608001709, - "learning_rate": 4.634058839057417e-06, - "loss": 0.2954312562942505, - "mean_token_accuracy": 0.8880234956741333, - "num_tokens": 7802456.0, - "step": 853 - }, - { - "epoch": 0.648936170212766, - "grad_norm": 1.3709120750427246, - "learning_rate": 4.632967139898301e-06, - "loss": 0.43224576115608215, - "mean_token_accuracy": 0.8446190357208252, - "num_tokens": 7816770.0, - "step": 854 - }, - { - "epoch": 0.6496960486322189, - "grad_norm": 1.6579312086105347, - "learning_rate": 4.63187394374074e-06, - "loss": 0.3535553514957428, - "mean_token_accuracy": 0.8738704919815063, - "num_tokens": 7824963.0, - "step": 855 - }, - { - "epoch": 0.6504559270516718, - "grad_norm": 2.4055678844451904, - "learning_rate": 4.63077925135198e-06, - "loss": 0.5078744292259216, - "mean_token_accuracy": 0.8430874347686768, - "num_tokens": 7830962.0, - "step": 856 - }, - { - "epoch": 0.6512158054711246, - "grad_norm": 2.5171499252319336, - "learning_rate": 4.629683063500319e-06, - "loss": 0.5172419548034668, - "mean_token_accuracy": 0.8087141513824463, - "num_tokens": 7836638.0, - "step": 857 - }, - { - "epoch": 0.6519756838905775, - "grad_norm": 1.7588486671447754, - "learning_rate": 4.628585380955104e-06, - "loss": 0.5759496092796326, - "mean_token_accuracy": 0.8043236136436462, - "num_tokens": 7844654.0, - "step": 858 - }, - { - "epoch": 0.6527355623100304, - "grad_norm": 1.5887070894241333, - "learning_rate": 4.62748620448673e-06, - "loss": 0.41849038004875183, - "mean_token_accuracy": 0.8556643724441528, - "num_tokens": 7855642.0, - "step": 859 - }, - { - "epoch": 0.6534954407294833, - "grad_norm": 3.227942705154419, - "learning_rate": 4.626385534866642e-06, - "loss": 0.5279449224472046, - "mean_token_accuracy": 0.8250958323478699, - "num_tokens": 7859890.0, - "step": 860 - }, - { - "epoch": 0.6542553191489362, - "grad_norm": 2.440467119216919, - "learning_rate": 4.625283372867333e-06, - "loss": 0.5294933319091797, - "mean_token_accuracy": 0.8235013484954834, - "num_tokens": 7866766.0, - "step": 861 - }, - { - "epoch": 0.6550151975683891, - "grad_norm": 2.4106903076171875, - "learning_rate": 4.624179719262342e-06, - "loss": 0.5662813186645508, - "mean_token_accuracy": 0.8061668872833252, - "num_tokens": 7872809.0, - "step": 862 - }, - { - "epoch": 0.6557750759878419, - "grad_norm": 3.5151145458221436, - "learning_rate": 4.623074574826254e-06, - "loss": 0.5471097230911255, - "mean_token_accuracy": 0.8220691084861755, - "num_tokens": 7876136.0, - "step": 863 - }, - { - "epoch": 0.6565349544072948, - "grad_norm": 1.5319840908050537, - "learning_rate": 4.621967940334705e-06, - "loss": 0.4178982377052307, - "mean_token_accuracy": 0.8517135977745056, - "num_tokens": 7886113.0, - "step": 864 - }, - { - "epoch": 0.6572948328267477, - "grad_norm": 1.63701331615448, - "learning_rate": 4.620859816564371e-06, - "loss": 0.4666512608528137, - "mean_token_accuracy": 0.8223508596420288, - "num_tokens": 7897982.0, - "step": 865 - }, - { - "epoch": 0.6580547112462006, - "grad_norm": 2.1515414714813232, - "learning_rate": 4.619750204292978e-06, - "loss": 0.5359305143356323, - "mean_token_accuracy": 0.8192868232727051, - "num_tokens": 7904947.0, - "step": 866 - }, - { - "epoch": 0.6588145896656535, - "grad_norm": 2.2140955924987793, - "learning_rate": 4.618639104299294e-06, - "loss": 0.5275633931159973, - "mean_token_accuracy": 0.8120715618133545, - "num_tokens": 7913913.0, - "step": 867 - }, - { - "epoch": 0.6595744680851063, - "grad_norm": 1.3956893682479858, - "learning_rate": 4.6175265173631304e-06, - "loss": 0.4378768503665924, - "mean_token_accuracy": 0.8479125499725342, - "num_tokens": 7927979.0, - "step": 868 - }, - { - "epoch": 0.6603343465045592, - "grad_norm": 2.98103928565979, - "learning_rate": 4.616412444265344e-06, - "loss": 0.42614591121673584, - "mean_token_accuracy": 0.8595094680786133, - "num_tokens": 7934293.0, - "step": 869 - }, - { - "epoch": 0.6610942249240122, - "grad_norm": 2.554845094680786, - "learning_rate": 4.6152968857878365e-06, - "loss": 0.3698030412197113, - "mean_token_accuracy": 0.8717041015625, - "num_tokens": 7938547.0, - "step": 870 - }, - { - "epoch": 0.6618541033434651, - "grad_norm": 3.0901825428009033, - "learning_rate": 4.6141798427135475e-06, - "loss": 0.5037497282028198, - "mean_token_accuracy": 0.8354041576385498, - "num_tokens": 7942829.0, - "step": 871 - }, - { - "epoch": 0.662613981762918, - "grad_norm": 2.8692073822021484, - "learning_rate": 4.6130613158264605e-06, - "loss": 0.5418164134025574, - "mean_token_accuracy": 0.8298909664154053, - "num_tokens": 7949303.0, - "step": 872 - }, - { - "epoch": 0.6633738601823708, - "grad_norm": 3.960404396057129, - "learning_rate": 4.611941305911602e-06, - "loss": 0.6284480094909668, - "mean_token_accuracy": 0.837495744228363, - "num_tokens": 7952486.0, - "step": 873 - }, - { - "epoch": 0.6641337386018237, - "grad_norm": 2.6690115928649902, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5214360952377319, - "mean_token_accuracy": 0.8213508129119873, - "num_tokens": 7957559.0, - "step": 874 - }, - { - "epoch": 0.6648936170212766, - "grad_norm": 2.3376171588897705, - "learning_rate": 4.609696840143875e-06, - "loss": 0.46887528896331787, - "mean_token_accuracy": 0.8438819646835327, - "num_tokens": 7962826.0, - "step": 875 - }, - { - "epoch": 0.6656534954407295, - "grad_norm": 2.2222683429718018, - "learning_rate": 4.6085723858662575e-06, - "loss": 0.5607719421386719, - "mean_token_accuracy": 0.8128405809402466, - "num_tokens": 7970131.0, - "step": 876 - }, - { - "epoch": 0.6664133738601824, - "grad_norm": 2.069091558456421, - "learning_rate": 4.607446451711372e-06, - "loss": 0.506301760673523, - "mean_token_accuracy": 0.8256827592849731, - "num_tokens": 7977524.0, - "step": 877 - }, - { - "epoch": 0.6671732522796353, - "grad_norm": 1.3724967241287231, - "learning_rate": 4.606319038469443e-06, - "loss": 0.43285101652145386, - "mean_token_accuracy": 0.8525032997131348, - "num_tokens": 7989174.0, - "step": 878 - }, - { - "epoch": 0.6679331306990881, - "grad_norm": 2.278205156326294, - "learning_rate": 4.605190146931731e-06, - "loss": 0.4845905303955078, - "mean_token_accuracy": 0.8284652829170227, - "num_tokens": 7998524.0, - "step": 879 - }, - { - "epoch": 0.668693009118541, - "grad_norm": 1.3871766328811646, - "learning_rate": 4.604059777890537e-06, - "loss": 0.5736679434776306, - "mean_token_accuracy": 0.8223285675048828, - "num_tokens": 8015776.0, - "step": 880 - }, - { - "epoch": 0.6694528875379939, - "grad_norm": 1.926164984703064, - "learning_rate": 4.602927932139197e-06, - "loss": 0.4133230447769165, - "mean_token_accuracy": 0.8653768301010132, - "num_tokens": 8022979.0, - "step": 881 - }, - { - "epoch": 0.6702127659574468, - "grad_norm": 2.109272003173828, - "learning_rate": 4.601794610472083e-06, - "loss": 0.7005600929260254, - "mean_token_accuracy": 0.7777010202407837, - "num_tokens": 8032618.0, - "step": 882 - }, - { - "epoch": 0.6709726443768997, - "grad_norm": 2.077977418899536, - "learning_rate": 4.6006598136846056e-06, - "loss": 0.5278208255767822, - "mean_token_accuracy": 0.8230358958244324, - "num_tokens": 8040534.0, - "step": 883 - }, - { - "epoch": 0.6717325227963525, - "grad_norm": 1.678581714630127, - "learning_rate": 4.599523542573207e-06, - "loss": 0.4955351650714874, - "mean_token_accuracy": 0.8270003795623779, - "num_tokens": 8052249.0, - "step": 884 - }, - { - "epoch": 0.6724924012158054, - "grad_norm": 2.0751662254333496, - "learning_rate": 4.598385797935368e-06, - "loss": 0.5266247987747192, - "mean_token_accuracy": 0.8263581991195679, - "num_tokens": 8060600.0, - "step": 885 - }, - { - "epoch": 0.6732522796352584, - "grad_norm": 2.418405771255493, - "learning_rate": 4.5972465805696e-06, - "loss": 0.4481425881385803, - "mean_token_accuracy": 0.846164345741272, - "num_tokens": 8066025.0, - "step": 886 - }, - { - "epoch": 0.6740121580547113, - "grad_norm": 2.3936474323272705, - "learning_rate": 4.596105891275449e-06, - "loss": 0.4553404450416565, - "mean_token_accuracy": 0.8412896394729614, - "num_tokens": 8071544.0, - "step": 887 - }, - { - "epoch": 0.6747720364741642, - "grad_norm": 2.2024407386779785, - "learning_rate": 4.594963730853497e-06, - "loss": 0.6218541860580444, - "mean_token_accuracy": 0.7890232801437378, - "num_tokens": 8079061.0, - "step": 888 - }, - { - "epoch": 0.675531914893617, - "grad_norm": 2.51015567779541, - "learning_rate": 4.593820100105355e-06, - "loss": 0.5149124264717102, - "mean_token_accuracy": 0.8241918087005615, - "num_tokens": 8084293.0, - "step": 889 - }, - { - "epoch": 0.6762917933130699, - "grad_norm": 1.8748939037322998, - "learning_rate": 4.5926749998336665e-06, - "loss": 0.50836181640625, - "mean_token_accuracy": 0.8067223429679871, - "num_tokens": 8092511.0, - "step": 890 - }, - { - "epoch": 0.6770516717325228, - "grad_norm": 1.801193118095398, - "learning_rate": 4.5915284308421075e-06, - "loss": 0.4372861683368683, - "mean_token_accuracy": 0.8510604500770569, - "num_tokens": 8101174.0, - "step": 891 - }, - { - "epoch": 0.6778115501519757, - "grad_norm": 2.6476457118988037, - "learning_rate": 4.590380393935383e-06, - "loss": 0.38700711727142334, - "mean_token_accuracy": 0.8659796714782715, - "num_tokens": 8105398.0, - "step": 892 - }, - { - "epoch": 0.6785714285714286, - "grad_norm": 1.1147183179855347, - "learning_rate": 4.589230889919232e-06, - "loss": 0.38546115159988403, - "mean_token_accuracy": 0.8570581674575806, - "num_tokens": 8127394.0, - "step": 893 - }, - { - "epoch": 0.6793313069908815, - "grad_norm": 2.908905506134033, - "learning_rate": 4.588079919600419e-06, - "loss": 0.5108504295349121, - "mean_token_accuracy": 0.8121406435966492, - "num_tokens": 8131801.0, - "step": 894 - }, - { - "epoch": 0.6800911854103343, - "grad_norm": 3.1522326469421387, - "learning_rate": 4.586927483786739e-06, - "loss": 0.44059112668037415, - "mean_token_accuracy": 0.8448011875152588, - "num_tokens": 8154416.0, - "step": 895 - }, - { - "epoch": 0.6808510638297872, - "grad_norm": 1.5142440795898438, - "learning_rate": 4.585773583287017e-06, - "loss": 0.513217568397522, - "mean_token_accuracy": 0.8386049270629883, - "num_tokens": 8171156.0, - "step": 896 - }, - { - "epoch": 0.6816109422492401, - "grad_norm": 2.597881317138672, - "learning_rate": 4.584618218911104e-06, - "loss": 0.4937712550163269, - "mean_token_accuracy": 0.8223681449890137, - "num_tokens": 8176124.0, - "step": 897 - }, - { - "epoch": 0.682370820668693, - "grad_norm": 1.8185619115829468, - "learning_rate": 4.583461391469879e-06, - "loss": 0.519811749458313, - "mean_token_accuracy": 0.8169777393341064, - "num_tokens": 8185136.0, - "step": 898 - }, - { - "epoch": 0.6831306990881459, - "grad_norm": 3.2061994075775146, - "learning_rate": 4.582303101775249e-06, - "loss": 0.4655115008354187, - "mean_token_accuracy": 0.8425977230072021, - "num_tokens": 8188864.0, - "step": 899 - }, - { - "epoch": 0.6838905775075987, - "grad_norm": 1.3485229015350342, - "learning_rate": 4.581143350640146e-06, - "loss": 0.5014470815658569, - "mean_token_accuracy": 0.8273109197616577, - "num_tokens": 8203460.0, - "step": 900 - }, - { - "epoch": 0.6846504559270516, - "grad_norm": 1.3264713287353516, - "learning_rate": 4.579982138878527e-06, - "loss": 0.5073703527450562, - "mean_token_accuracy": 0.8259357213973999, - "num_tokens": 8219348.0, - "step": 901 - }, - { - "epoch": 0.6854103343465046, - "grad_norm": 2.4436347484588623, - "learning_rate": 4.578819467305375e-06, - "loss": 0.47020310163497925, - "mean_token_accuracy": 0.8567265272140503, - "num_tokens": 8224427.0, - "step": 902 - }, - { - "epoch": 0.6861702127659575, - "grad_norm": 1.921749234199524, - "learning_rate": 4.5776553367367e-06, - "loss": 0.622514009475708, - "mean_token_accuracy": 0.7863982319831848, - "num_tokens": 8233151.0, - "step": 903 - }, - { - "epoch": 0.6869300911854104, - "grad_norm": 1.8815616369247437, - "learning_rate": 4.576489747989532e-06, - "loss": 0.4910545349121094, - "mean_token_accuracy": 0.8147122859954834, - "num_tokens": 8240762.0, - "step": 904 - }, - { - "epoch": 0.6876899696048632, - "grad_norm": 1.2366989850997925, - "learning_rate": 4.575322701881926e-06, - "loss": 0.3947566747665405, - "mean_token_accuracy": 0.873993992805481, - "num_tokens": 8259381.0, - "step": 905 - }, - { - "epoch": 0.6884498480243161, - "grad_norm": 1.5767735242843628, - "learning_rate": 4.57415419923296e-06, - "loss": 0.57136070728302, - "mean_token_accuracy": 0.8028088808059692, - "num_tokens": 8273296.0, - "step": 906 - }, - { - "epoch": 0.689209726443769, - "grad_norm": 2.378675699234009, - "learning_rate": 4.572984240862733e-06, - "loss": 0.5894849896430969, - "mean_token_accuracy": 0.7977708578109741, - "num_tokens": 8280083.0, - "step": 907 - }, - { - "epoch": 0.6899696048632219, - "grad_norm": 2.0401132106781006, - "learning_rate": 4.57181282759237e-06, - "loss": 0.5524613261222839, - "mean_token_accuracy": 0.8138598203659058, - "num_tokens": 8288236.0, - "step": 908 - }, - { - "epoch": 0.6907294832826748, - "grad_norm": 2.293701648712158, - "learning_rate": 4.570639960244011e-06, - "loss": 0.5154546499252319, - "mean_token_accuracy": 0.8234660625457764, - "num_tokens": 8294493.0, - "step": 909 - }, - { - "epoch": 0.6914893617021277, - "grad_norm": 1.9286527633666992, - "learning_rate": 4.56946563964082e-06, - "loss": 0.5364264845848083, - "mean_token_accuracy": 0.8147368431091309, - "num_tokens": 8303441.0, - "step": 910 - }, - { - "epoch": 0.6922492401215805, - "grad_norm": 1.2571251392364502, - "learning_rate": 4.5682898666069815e-06, - "loss": 0.43535223603248596, - "mean_token_accuracy": 0.859239935874939, - "num_tokens": 8321548.0, - "step": 911 - }, - { - "epoch": 0.6930091185410334, - "grad_norm": 1.2224860191345215, - "learning_rate": 4.567112641967697e-06, - "loss": 0.40205076336860657, - "mean_token_accuracy": 0.8724711537361145, - "num_tokens": 8335205.0, - "step": 912 - }, - { - "epoch": 0.6937689969604863, - "grad_norm": 1.2064491510391235, - "learning_rate": 4.5659339665491894e-06, - "loss": 0.37790587544441223, - "mean_token_accuracy": 0.8464339971542358, - "num_tokens": 8350926.0, - "step": 913 - }, - { - "epoch": 0.6945288753799392, - "grad_norm": 2.1755270957946777, - "learning_rate": 4.5647538411786965e-06, - "loss": 0.42034298181533813, - "mean_token_accuracy": 0.84148108959198, - "num_tokens": 8356739.0, - "step": 914 - }, - { - "epoch": 0.6952887537993921, - "grad_norm": 1.234864592552185, - "learning_rate": 4.563572266684478e-06, - "loss": 0.5062938332557678, - "mean_token_accuracy": 0.8132052421569824, - "num_tokens": 8373660.0, - "step": 915 - }, - { - "epoch": 0.6960486322188449, - "grad_norm": 2.4250621795654297, - "learning_rate": 4.562389243895807e-06, - "loss": 0.4907791018486023, - "mean_token_accuracy": 0.8337979912757874, - "num_tokens": 8378661.0, - "step": 916 - }, - { - "epoch": 0.6968085106382979, - "grad_norm": 1.5018314123153687, - "learning_rate": 4.561204773642974e-06, - "loss": 0.41041281819343567, - "mean_token_accuracy": 0.8569784164428711, - "num_tokens": 8390322.0, - "step": 917 - }, - { - "epoch": 0.6975683890577508, - "grad_norm": 2.797269344329834, - "learning_rate": 4.5600188567572874e-06, - "loss": 0.3146931529045105, - "mean_token_accuracy": 0.8913302421569824, - "num_tokens": 8393567.0, - "step": 918 - }, - { - "epoch": 0.6983282674772037, - "grad_norm": 1.4002827405929565, - "learning_rate": 4.558831494071069e-06, - "loss": 0.4275597333908081, - "mean_token_accuracy": 0.8504893779754639, - "num_tokens": 8407119.0, - "step": 919 - }, - { - "epoch": 0.6990881458966566, - "grad_norm": 1.7045831680297852, - "learning_rate": 4.557642686417654e-06, - "loss": 0.49593430757522583, - "mean_token_accuracy": 0.8185091018676758, - "num_tokens": 8417408.0, - "step": 920 - }, - { - "epoch": 0.6998480243161094, - "grad_norm": 2.8818066120147705, - "learning_rate": 4.556452434631396e-06, - "loss": 0.637908935546875, - "mean_token_accuracy": 0.7883946895599365, - "num_tokens": 8422319.0, - "step": 921 - }, - { - "epoch": 0.7006079027355623, - "grad_norm": 2.3587265014648438, - "learning_rate": 4.555260739547657e-06, - "loss": 0.38749319314956665, - "mean_token_accuracy": 0.8774704933166504, - "num_tokens": 8427315.0, - "step": 922 - }, - { - "epoch": 0.7013677811550152, - "grad_norm": 1.6648749113082886, - "learning_rate": 4.554067602002815e-06, - "loss": 0.4044865369796753, - "mean_token_accuracy": 0.8524141311645508, - "num_tokens": 8438662.0, - "step": 923 - }, - { - "epoch": 0.7021276595744681, - "grad_norm": 3.467787742614746, - "learning_rate": 4.55287302283426e-06, - "loss": 0.591016411781311, - "mean_token_accuracy": 0.81184983253479, - "num_tokens": 8442237.0, - "step": 924 - }, - { - "epoch": 0.702887537993921, - "grad_norm": 2.1458635330200195, - "learning_rate": 4.551677002880395e-06, - "loss": 0.5017476677894592, - "mean_token_accuracy": 0.822914183139801, - "num_tokens": 8449494.0, - "step": 925 - }, - { - "epoch": 0.7036474164133738, - "grad_norm": 2.521714448928833, - "learning_rate": 4.550479542980632e-06, - "loss": 0.531912088394165, - "mean_token_accuracy": 0.8225687742233276, - "num_tokens": 8454983.0, - "step": 926 - }, - { - "epoch": 0.7044072948328267, - "grad_norm": 3.5248100757598877, - "learning_rate": 4.549280643975394e-06, - "loss": 0.4631815254688263, - "mean_token_accuracy": 0.8443771600723267, - "num_tokens": 8458504.0, - "step": 927 - }, - { - "epoch": 0.7051671732522796, - "grad_norm": 2.5105819702148438, - "learning_rate": 4.548080306706114e-06, - "loss": 0.30487123131752014, - "mean_token_accuracy": 0.9018767476081848, - "num_tokens": 8462589.0, - "step": 928 - }, - { - "epoch": 0.7059270516717325, - "grad_norm": 1.3367713689804077, - "learning_rate": 4.5468785320152365e-06, - "loss": 0.4355026185512543, - "mean_token_accuracy": 0.8323584794998169, - "num_tokens": 8478450.0, - "step": 929 - }, - { - "epoch": 0.7066869300911854, - "grad_norm": 2.2506282329559326, - "learning_rate": 4.545675320746212e-06, - "loss": 0.5082957744598389, - "mean_token_accuracy": 0.823430597782135, - "num_tokens": 8485991.0, - "step": 930 - }, - { - "epoch": 0.7074468085106383, - "grad_norm": 1.7164632081985474, - "learning_rate": 4.544470673743502e-06, - "loss": 0.3960164785385132, - "mean_token_accuracy": 0.8592486381530762, - "num_tokens": 8495217.0, - "step": 931 - }, - { - "epoch": 0.7082066869300911, - "grad_norm": 1.5864969491958618, - "learning_rate": 4.543264591852572e-06, - "loss": 0.49114471673965454, - "mean_token_accuracy": 0.8330780267715454, - "num_tokens": 8508904.0, - "step": 932 - }, - { - "epoch": 0.708966565349544, - "grad_norm": 2.1707003116607666, - "learning_rate": 4.542057075919898e-06, - "loss": 0.49895772337913513, - "mean_token_accuracy": 0.8327431082725525, - "num_tokens": 8515792.0, - "step": 933 - }, - { - "epoch": 0.709726443768997, - "grad_norm": 1.9002083539962769, - "learning_rate": 4.54084812679296e-06, - "loss": 0.4548531472682953, - "mean_token_accuracy": 0.834532618522644, - "num_tokens": 8524006.0, - "step": 934 - }, - { - "epoch": 0.7104863221884499, - "grad_norm": 1.8505141735076904, - "learning_rate": 4.539637745320247e-06, - "loss": 0.35716521739959717, - "mean_token_accuracy": 0.872222900390625, - "num_tokens": 8533647.0, - "step": 935 - }, - { - "epoch": 0.7112462006079028, - "grad_norm": 2.092620849609375, - "learning_rate": 4.53842593235125e-06, - "loss": 0.4673694372177124, - "mean_token_accuracy": 0.8460999131202698, - "num_tokens": 8540734.0, - "step": 936 - }, - { - "epoch": 0.7120060790273556, - "grad_norm": 2.689514636993408, - "learning_rate": 4.537212688736466e-06, - "loss": 0.45461273193359375, - "mean_token_accuracy": 0.8450704216957092, - "num_tokens": 8544948.0, - "step": 937 - }, - { - "epoch": 0.7127659574468085, - "grad_norm": 2.4507734775543213, - "learning_rate": 4.535998015327396e-06, - "loss": 0.4571906626224518, - "mean_token_accuracy": 0.8429360389709473, - "num_tokens": 8550445.0, - "step": 938 - }, - { - "epoch": 0.7135258358662614, - "grad_norm": 1.8960013389587402, - "learning_rate": 4.534781912976546e-06, - "loss": 0.4461391568183899, - "mean_token_accuracy": 0.8487973213195801, - "num_tokens": 8557630.0, - "step": 939 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 1.602611780166626, - "learning_rate": 4.533564382537421e-06, - "loss": 0.5277102589607239, - "mean_token_accuracy": 0.8330916166305542, - "num_tokens": 8570397.0, - "step": 940 - }, - { - "epoch": 0.7150455927051672, - "grad_norm": 1.8936395645141602, - "learning_rate": 4.532345424864533e-06, - "loss": 0.38619571924209595, - "mean_token_accuracy": 0.8514572381973267, - "num_tokens": 8582673.0, - "step": 941 - }, - { - "epoch": 0.71580547112462, - "grad_norm": 1.3898619413375854, - "learning_rate": 4.531125040813392e-06, - "loss": 0.4825032949447632, - "mean_token_accuracy": 0.833012580871582, - "num_tokens": 8597239.0, - "step": 942 - }, - { - "epoch": 0.7165653495440729, - "grad_norm": 2.128230571746826, - "learning_rate": 4.529903231240511e-06, - "loss": 0.4862118065357208, - "mean_token_accuracy": 0.8210917711257935, - "num_tokens": 8605877.0, - "step": 943 - }, - { - "epoch": 0.7173252279635258, - "grad_norm": 1.6552259922027588, - "learning_rate": 4.528679997003403e-06, - "loss": 0.5092059373855591, - "mean_token_accuracy": 0.8247389793395996, - "num_tokens": 8617060.0, - "step": 944 - }, - { - "epoch": 0.7180851063829787, - "grad_norm": 2.1174771785736084, - "learning_rate": 4.52745533896058e-06, - "loss": 0.39110174775123596, - "mean_token_accuracy": 0.8672944903373718, - "num_tokens": 8623306.0, - "step": 945 - }, - { - "epoch": 0.7188449848024316, - "grad_norm": 2.8648383617401123, - "learning_rate": 4.526229257971556e-06, - "loss": 0.49864327907562256, - "mean_token_accuracy": 0.8305130004882812, - "num_tokens": 8627466.0, - "step": 946 - }, - { - "epoch": 0.7196048632218845, - "grad_norm": 2.155514717102051, - "learning_rate": 4.52500175489684e-06, - "loss": 0.5070191025733948, - "mean_token_accuracy": 0.8311188817024231, - "num_tokens": 8634759.0, - "step": 947 - }, - { - "epoch": 0.7203647416413373, - "grad_norm": 1.8432683944702148, - "learning_rate": 4.523772830597942e-06, - "loss": 0.5569252371788025, - "mean_token_accuracy": 0.8070821762084961, - "num_tokens": 8644160.0, - "step": 948 - }, - { - "epoch": 0.7211246200607903, - "grad_norm": 2.8912241458892822, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4799427390098572, - "mean_token_accuracy": 0.8443552851676941, - "num_tokens": 8648377.0, - "step": 949 - }, - { - "epoch": 0.7218844984802432, - "grad_norm": 3.3449625968933105, - "learning_rate": 4.521310721778622e-06, - "loss": 0.44043463468551636, - "mean_token_accuracy": 0.8521315455436707, - "num_tokens": 8651846.0, - "step": 950 - }, - { - "epoch": 0.7226443768996961, - "grad_norm": 1.4127917289733887, - "learning_rate": 4.520077538986203e-06, - "loss": 0.4700999855995178, - "mean_token_accuracy": 0.8377952575683594, - "num_tokens": 8665199.0, - "step": 951 - }, - { - "epoch": 0.723404255319149, - "grad_norm": 2.1607301235198975, - "learning_rate": 4.518842938425606e-06, - "loss": 0.4374256730079651, - "mean_token_accuracy": 0.8448896408081055, - "num_tokens": 8672158.0, - "step": 952 - }, - { - "epoch": 0.7241641337386018, - "grad_norm": 1.3442779779434204, - "learning_rate": 4.51760692096332e-06, - "loss": 0.38948923349380493, - "mean_token_accuracy": 0.8598923683166504, - "num_tokens": 8684532.0, - "step": 953 - }, - { - "epoch": 0.7249240121580547, - "grad_norm": 2.0003178119659424, - "learning_rate": 4.516369487466832e-06, - "loss": 0.3797217011451721, - "mean_token_accuracy": 0.8652102947235107, - "num_tokens": 8691460.0, - "step": 954 - }, - { - "epoch": 0.7256838905775076, - "grad_norm": 1.8196535110473633, - "learning_rate": 4.5151306388046175e-06, - "loss": 0.5676811933517456, - "mean_token_accuracy": 0.818500816822052, - "num_tokens": 8701624.0, - "step": 955 - }, - { - "epoch": 0.7264437689969605, - "grad_norm": 2.1962296962738037, - "learning_rate": 4.513890375846152e-06, - "loss": 0.45399484038352966, - "mean_token_accuracy": 0.8463879227638245, - "num_tokens": 8707410.0, - "step": 956 - }, - { - "epoch": 0.7272036474164134, - "grad_norm": 1.8798872232437134, - "learning_rate": 4.512648699461897e-06, - "loss": 0.5679811239242554, - "mean_token_accuracy": 0.8089900016784668, - "num_tokens": 8715630.0, - "step": 957 - }, - { - "epoch": 0.7279635258358662, - "grad_norm": 2.3540258407592773, - "learning_rate": 4.511405610523309e-06, - "loss": 0.5282865762710571, - "mean_token_accuracy": 0.8196114301681519, - "num_tokens": 8721934.0, - "step": 958 - }, - { - "epoch": 0.7287234042553191, - "grad_norm": 2.5630908012390137, - "learning_rate": 4.510161109902837e-06, - "loss": 0.39442378282546997, - "mean_token_accuracy": 0.8400980830192566, - "num_tokens": 8726511.0, - "step": 959 - }, - { - "epoch": 0.729483282674772, - "grad_norm": 1.9829226732254028, - "learning_rate": 4.508915198473919e-06, - "loss": 0.4611976742744446, - "mean_token_accuracy": 0.8439624309539795, - "num_tokens": 8733460.0, - "step": 960 - }, - { - "epoch": 0.7302431610942249, - "grad_norm": 3.0291950702667236, - "learning_rate": 4.507667877110982e-06, - "loss": 0.5158340930938721, - "mean_token_accuracy": 0.8300060033798218, - "num_tokens": 8737629.0, - "step": 961 - }, - { - "epoch": 0.7310030395136778, - "grad_norm": 1.9208252429962158, - "learning_rate": 4.506419146689445e-06, - "loss": 0.3807099163532257, - "mean_token_accuracy": 0.871469259262085, - "num_tokens": 8744615.0, - "step": 962 - }, - { - "epoch": 0.7317629179331308, - "grad_norm": 3.051565408706665, - "learning_rate": 4.505169008085717e-06, - "loss": 0.38461726903915405, - "mean_token_accuracy": 0.874465823173523, - "num_tokens": 8748154.0, - "step": 963 - }, - { - "epoch": 0.7325227963525835, - "grad_norm": 1.375466227531433, - "learning_rate": 4.503917462177192e-06, - "loss": 0.42490679025650024, - "mean_token_accuracy": 0.8457326889038086, - "num_tokens": 8760965.0, - "step": 964 - }, - { - "epoch": 0.7332826747720365, - "grad_norm": 2.216681957244873, - "learning_rate": 4.5026645098422515e-06, - "loss": 0.43149900436401367, - "mean_token_accuracy": 0.8527278900146484, - "num_tokens": 8766996.0, - "step": 965 - }, - { - "epoch": 0.7340425531914894, - "grad_norm": 1.9422595500946045, - "learning_rate": 4.5014101519602684e-06, - "loss": 0.4964504539966583, - "mean_token_accuracy": 0.8137556314468384, - "num_tokens": 8774411.0, - "step": 966 - }, - { - "epoch": 0.7348024316109423, - "grad_norm": 2.058887004852295, - "learning_rate": 4.500154389411598e-06, - "loss": 0.4977570176124573, - "mean_token_accuracy": 0.8254626989364624, - "num_tokens": 8782220.0, - "step": 967 - }, - { - "epoch": 0.7355623100303952, - "grad_norm": 2.9977786540985107, - "learning_rate": 4.498897223077582e-06, - "loss": 0.4061415195465088, - "mean_token_accuracy": 0.8752427101135254, - "num_tokens": 8786120.0, - "step": 968 - }, - { - "epoch": 0.736322188449848, - "grad_norm": 2.2636303901672363, - "learning_rate": 4.49763865384055e-06, - "loss": 0.5062161087989807, - "mean_token_accuracy": 0.8171653747558594, - "num_tokens": 8792459.0, - "step": 969 - }, - { - "epoch": 0.7370820668693009, - "grad_norm": 1.8850842714309692, - "learning_rate": 4.496378682583813e-06, - "loss": 0.5014280676841736, - "mean_token_accuracy": 0.8547511100769043, - "num_tokens": 8800675.0, - "step": 970 - }, - { - "epoch": 0.7378419452887538, - "grad_norm": 1.191985011100769, - "learning_rate": 4.495117310191667e-06, - "loss": 0.4713883101940155, - "mean_token_accuracy": 0.8213596343994141, - "num_tokens": 8820740.0, - "step": 971 - }, - { - "epoch": 0.7386018237082067, - "grad_norm": 1.823000192642212, - "learning_rate": 4.493854537549393e-06, - "loss": 0.46332645416259766, - "mean_token_accuracy": 0.8359860777854919, - "num_tokens": 8828884.0, - "step": 972 - }, - { - "epoch": 0.7393617021276596, - "grad_norm": 2.590446949005127, - "learning_rate": 4.492590365543253e-06, - "loss": 0.49074703454971313, - "mean_token_accuracy": 0.8433758020401001, - "num_tokens": 8833859.0, - "step": 973 - }, - { - "epoch": 0.7401215805471124, - "grad_norm": 2.2762670516967773, - "learning_rate": 4.491324795060491e-06, - "loss": 0.39465656876564026, - "mean_token_accuracy": 0.8734766244888306, - "num_tokens": 8839350.0, - "step": 974 - }, - { - "epoch": 0.7408814589665653, - "grad_norm": 2.698725461959839, - "learning_rate": 4.490057826989333e-06, - "loss": 0.5552085041999817, - "mean_token_accuracy": 0.8132266998291016, - "num_tokens": 8844373.0, - "step": 975 - }, - { - "epoch": 0.7416413373860182, - "grad_norm": 2.704606294631958, - "learning_rate": 4.488789462218988e-06, - "loss": 0.3447791635990143, - "mean_token_accuracy": 0.8736170530319214, - "num_tokens": 8848236.0, - "step": 976 - }, - { - "epoch": 0.7424012158054711, - "grad_norm": 3.1260716915130615, - "learning_rate": 4.487519701639641e-06, - "loss": 0.5945233702659607, - "mean_token_accuracy": 0.7997599840164185, - "num_tokens": 8852935.0, - "step": 977 - }, - { - "epoch": 0.743161094224924, - "grad_norm": 1.6895452737808228, - "learning_rate": 4.486248546142459e-06, - "loss": 0.4823892116546631, - "mean_token_accuracy": 0.8279662132263184, - "num_tokens": 8861743.0, - "step": 978 - }, - { - "epoch": 0.743920972644377, - "grad_norm": 1.9161452054977417, - "learning_rate": 4.4849759966195885e-06, - "loss": 0.5266581773757935, - "mean_token_accuracy": 0.8218623399734497, - "num_tokens": 8870601.0, - "step": 979 - }, - { - "epoch": 0.7446808510638298, - "grad_norm": 1.6894301176071167, - "learning_rate": 4.483702053964154e-06, - "loss": 0.4186219573020935, - "mean_token_accuracy": 0.8471781015396118, - "num_tokens": 8885617.0, - "step": 980 - }, - { - "epoch": 0.7454407294832827, - "grad_norm": 1.6319992542266846, - "learning_rate": 4.482426719070258e-06, - "loss": 0.541317880153656, - "mean_token_accuracy": 0.8216162323951721, - "num_tokens": 8897595.0, - "step": 981 - }, - { - "epoch": 0.7462006079027356, - "grad_norm": 5.102413177490234, - "learning_rate": 4.4811499928329775e-06, - "loss": 0.3928517699241638, - "mean_token_accuracy": 0.858033299446106, - "num_tokens": 8901682.0, - "step": 982 - }, - { - "epoch": 0.7469604863221885, - "grad_norm": 2.213860273361206, - "learning_rate": 4.479871876148368e-06, - "loss": 0.4276347756385803, - "mean_token_accuracy": 0.8529798984527588, - "num_tokens": 8908088.0, - "step": 983 - }, - { - "epoch": 0.7477203647416414, - "grad_norm": 1.2180038690567017, - "learning_rate": 4.478592369913464e-06, - "loss": 0.3941590189933777, - "mean_token_accuracy": 0.8608149290084839, - "num_tokens": 8925876.0, - "step": 984 - }, - { - "epoch": 0.7484802431610942, - "grad_norm": 2.849802255630493, - "learning_rate": 4.477311475026271e-06, - "loss": 0.42190325260162354, - "mean_token_accuracy": 0.860505223274231, - "num_tokens": 8930190.0, - "step": 985 - }, - { - "epoch": 0.7492401215805471, - "grad_norm": 1.704128384590149, - "learning_rate": 4.476029192385769e-06, - "loss": 0.4786282777786255, - "mean_token_accuracy": 0.8302322626113892, - "num_tokens": 8938340.0, - "step": 986 - }, - { - "epoch": 0.75, - "grad_norm": 2.06322979927063, - "learning_rate": 4.474745522891915e-06, - "loss": 0.4648786187171936, - "mean_token_accuracy": 0.8366481065750122, - "num_tokens": 8944633.0, - "step": 987 - }, - { - "epoch": 0.7507598784194529, - "grad_norm": 2.0745396614074707, - "learning_rate": 4.473460467445637e-06, - "loss": 0.5744885206222534, - "mean_token_accuracy": 0.8357284069061279, - "num_tokens": 8954457.0, - "step": 988 - }, - { - "epoch": 0.7515197568389058, - "grad_norm": 1.9281407594680786, - "learning_rate": 4.472174026948836e-06, - "loss": 0.528974175453186, - "mean_token_accuracy": 0.8083580732345581, - "num_tokens": 8962701.0, - "step": 989 - }, - { - "epoch": 0.7522796352583586, - "grad_norm": 3.012381076812744, - "learning_rate": 4.470886202304385e-06, - "loss": 0.48754751682281494, - "mean_token_accuracy": 0.8368391990661621, - "num_tokens": 8967272.0, - "step": 990 - }, - { - "epoch": 0.7530395136778115, - "grad_norm": 1.691826581954956, - "learning_rate": 4.469596994416131e-06, - "loss": 0.484740674495697, - "mean_token_accuracy": 0.8500643968582153, - "num_tokens": 8976615.0, - "step": 991 - }, - { - "epoch": 0.7537993920972644, - "grad_norm": 2.4961965084075928, - "learning_rate": 4.468306404188887e-06, - "loss": 0.50777268409729, - "mean_token_accuracy": 0.8168395757675171, - "num_tokens": 8983235.0, - "step": 992 - }, - { - "epoch": 0.7545592705167173, - "grad_norm": 1.512007713317871, - "learning_rate": 4.467014432528441e-06, - "loss": 0.4583340287208557, - "mean_token_accuracy": 0.8465162515640259, - "num_tokens": 8993815.0, - "step": 993 - }, - { - "epoch": 0.7553191489361702, - "grad_norm": 1.9362257719039917, - "learning_rate": 4.465721080341547e-06, - "loss": 0.6027892827987671, - "mean_token_accuracy": 0.8052380084991455, - "num_tokens": 9002697.0, - "step": 994 - }, - { - "epoch": 0.756079027355623, - "grad_norm": 2.473632335662842, - "learning_rate": 4.4644263485359316e-06, - "loss": 0.5394320487976074, - "mean_token_accuracy": 0.834665834903717, - "num_tokens": 9007428.0, - "step": 995 - }, - { - "epoch": 0.756838905775076, - "grad_norm": 2.2527434825897217, - "learning_rate": 4.463130238020284e-06, - "loss": 0.5485198497772217, - "mean_token_accuracy": 0.8090173006057739, - "num_tokens": 9013570.0, - "step": 996 - }, - { - "epoch": 0.7575987841945289, - "grad_norm": 1.4130940437316895, - "learning_rate": 4.4618327497042676e-06, - "loss": 0.37994423508644104, - "mean_token_accuracy": 0.8625167012214661, - "num_tokens": 9025485.0, - "step": 997 - }, - { - "epoch": 0.7583586626139818, - "grad_norm": 2.685115098953247, - "learning_rate": 4.460533884498509e-06, - "loss": 0.447973370552063, - "mean_token_accuracy": 0.8564165234565735, - "num_tokens": 9030355.0, - "step": 998 - }, - { - "epoch": 0.7591185410334347, - "grad_norm": 3.2743139266967773, - "learning_rate": 4.4592336433146e-06, - "loss": 0.45275989174842834, - "mean_token_accuracy": 0.8462578058242798, - "num_tokens": 9034406.0, - "step": 999 - }, - { - "epoch": 0.7598784194528876, - "grad_norm": 1.9383049011230469, - "learning_rate": 4.457932027065102e-06, - "loss": 0.5387729406356812, - "mean_token_accuracy": 0.8357330560684204, - "num_tokens": 9041502.0, - "step": 1000 - }, - { - "epoch": 0.7606382978723404, - "grad_norm": 2.7348275184631348, - "learning_rate": 4.456629036663537e-06, - "loss": 0.4448447823524475, - "mean_token_accuracy": 0.8453642129898071, - "num_tokens": 9046088.0, - "step": 1001 - }, - { - "epoch": 0.7613981762917933, - "grad_norm": 1.8477401733398438, - "learning_rate": 4.455324673024396e-06, - "loss": 0.5766505002975464, - "mean_token_accuracy": 0.8074213862419128, - "num_tokens": 9055678.0, - "step": 1002 - }, - { - "epoch": 0.7621580547112462, - "grad_norm": 3.134481430053711, - "learning_rate": 4.4540189370631315e-06, - "loss": 0.5690872669219971, - "mean_token_accuracy": 0.8414670825004578, - "num_tokens": 9062006.0, - "step": 1003 - }, - { - "epoch": 0.7629179331306991, - "grad_norm": 1.7933398485183716, - "learning_rate": 4.452711829696158e-06, - "loss": 0.4898291826248169, - "mean_token_accuracy": 0.8259007930755615, - "num_tokens": 9070754.0, - "step": 1004 - }, - { - "epoch": 0.763677811550152, - "grad_norm": 1.2552275657653809, - "learning_rate": 4.451403351840855e-06, - "loss": 0.4280198812484741, - "mean_token_accuracy": 0.8409112691879272, - "num_tokens": 9085306.0, - "step": 1005 - }, - { - "epoch": 0.7644376899696048, - "grad_norm": 1.6749331951141357, - "learning_rate": 4.450093504415562e-06, - "loss": 0.3723178505897522, - "mean_token_accuracy": 0.8545734882354736, - "num_tokens": 9102453.0, - "step": 1006 - }, - { - "epoch": 0.7651975683890577, - "grad_norm": 2.7514500617980957, - "learning_rate": 4.44878228833958e-06, - "loss": 0.5463190674781799, - "mean_token_accuracy": 0.8121639490127563, - "num_tokens": 9108342.0, - "step": 1007 - }, - { - "epoch": 0.7659574468085106, - "grad_norm": 1.3322733640670776, - "learning_rate": 4.447469704533172e-06, - "loss": 0.573723316192627, - "mean_token_accuracy": 0.8065711259841919, - "num_tokens": 9123712.0, - "step": 1008 - }, - { - "epoch": 0.7667173252279635, - "grad_norm": 2.6893765926361084, - "learning_rate": 4.446155753917559e-06, - "loss": 0.6856257915496826, - "mean_token_accuracy": 0.7718256711959839, - "num_tokens": 9130728.0, - "step": 1009 - }, - { - "epoch": 0.7674772036474165, - "grad_norm": 1.792765498161316, - "learning_rate": 4.444840437414923e-06, - "loss": 0.48203110694885254, - "mean_token_accuracy": 0.8419194221496582, - "num_tokens": 9137983.0, - "step": 1010 - }, - { - "epoch": 0.7682370820668692, - "grad_norm": 1.4957399368286133, - "learning_rate": 4.443523755948401e-06, - "loss": 0.4372181296348572, - "mean_token_accuracy": 0.8491764664649963, - "num_tokens": 9148081.0, - "step": 1011 - }, - { - "epoch": 0.7689969604863222, - "grad_norm": 1.7294867038726807, - "learning_rate": 4.442205710442095e-06, - "loss": 0.54277503490448, - "mean_token_accuracy": 0.8196806907653809, - "num_tokens": 9158407.0, - "step": 1012 - }, - { - "epoch": 0.7697568389057751, - "grad_norm": 2.2091221809387207, - "learning_rate": 4.4408863018210564e-06, - "loss": 0.4888187646865845, - "mean_token_accuracy": 0.8384175300598145, - "num_tokens": 9164754.0, - "step": 1013 - }, - { - "epoch": 0.770516717325228, - "grad_norm": 1.7615830898284912, - "learning_rate": 4.439565531011299e-06, - "loss": 0.4640008211135864, - "mean_token_accuracy": 0.8424701690673828, - "num_tokens": 9172715.0, - "step": 1014 - }, - { - "epoch": 0.7712765957446809, - "grad_norm": 1.6796128749847412, - "learning_rate": 4.43824339893979e-06, - "loss": 0.5227609276771545, - "mean_token_accuracy": 0.8135923743247986, - "num_tokens": 9183214.0, - "step": 1015 - }, - { - "epoch": 0.7720364741641338, - "grad_norm": 2.1485698223114014, - "learning_rate": 4.436919906534452e-06, - "loss": 0.4857056140899658, - "mean_token_accuracy": 0.8323013782501221, - "num_tokens": 9190360.0, - "step": 1016 - }, - { - "epoch": 0.7727963525835866, - "grad_norm": 2.7842206954956055, - "learning_rate": 4.4355950547241645e-06, - "loss": 0.46406883001327515, - "mean_token_accuracy": 0.859869122505188, - "num_tokens": 9194523.0, - "step": 1017 - }, - { - "epoch": 0.7735562310030395, - "grad_norm": 2.3774640560150146, - "learning_rate": 4.434268844438758e-06, - "loss": 0.5625549554824829, - "mean_token_accuracy": 0.8188897371292114, - "num_tokens": 9201155.0, - "step": 1018 - }, - { - "epoch": 0.7743161094224924, - "grad_norm": 2.004427909851074, - "learning_rate": 4.432941276609018e-06, - "loss": 0.5164387226104736, - "mean_token_accuracy": 0.829569935798645, - "num_tokens": 9209269.0, - "step": 1019 - }, - { - "epoch": 0.7750759878419453, - "grad_norm": 1.7218989133834839, - "learning_rate": 4.431612352166684e-06, - "loss": 0.481005996465683, - "mean_token_accuracy": 0.8359906673431396, - "num_tokens": 9220860.0, - "step": 1020 - }, - { - "epoch": 0.7758358662613982, - "grad_norm": 2.197108507156372, - "learning_rate": 4.4302820720444454e-06, - "loss": 0.440413236618042, - "mean_token_accuracy": 0.8412867784500122, - "num_tokens": 9226414.0, - "step": 1021 - }, - { - "epoch": 0.776595744680851, - "grad_norm": 2.6995162963867188, - "learning_rate": 4.428950437175944e-06, - "loss": 0.3884299397468567, - "mean_token_accuracy": 0.8696021437644958, - "num_tokens": 9230898.0, - "step": 1022 - }, - { - "epoch": 0.7773556231003039, - "grad_norm": 2.1671667098999023, - "learning_rate": 4.427617448495772e-06, - "loss": 0.5747478008270264, - "mean_token_accuracy": 0.7842930555343628, - "num_tokens": 9238479.0, - "step": 1023 - }, - { - "epoch": 0.7781155015197568, - "grad_norm": 1.6299028396606445, - "learning_rate": 4.426283106939474e-06, - "loss": 0.39478403329849243, - "mean_token_accuracy": 0.8685503602027893, - "num_tokens": 9248263.0, - "step": 1024 - }, - { - "epoch": 0.7788753799392097, - "grad_norm": 2.2621798515319824, - "learning_rate": 4.424947413443539e-06, - "loss": 0.4582178592681885, - "mean_token_accuracy": 0.8312377333641052, - "num_tokens": 9254168.0, - "step": 1025 - }, - { - "epoch": 0.7796352583586627, - "grad_norm": 2.121091365814209, - "learning_rate": 4.423610368945411e-06, - "loss": 0.5315121412277222, - "mean_token_accuracy": 0.8121483325958252, - "num_tokens": 9261808.0, - "step": 1026 - }, - { - "epoch": 0.7803951367781155, - "grad_norm": 1.8558297157287598, - "learning_rate": 4.422271974383479e-06, - "loss": 0.4299176037311554, - "mean_token_accuracy": 0.8452648520469666, - "num_tokens": 9269264.0, - "step": 1027 - }, - { - "epoch": 0.7811550151975684, - "grad_norm": 1.9089949131011963, - "learning_rate": 4.420932230697079e-06, - "loss": 0.43876272439956665, - "mean_token_accuracy": 0.8434094190597534, - "num_tokens": 9277381.0, - "step": 1028 - }, - { - "epoch": 0.7819148936170213, - "grad_norm": 1.8619649410247803, - "learning_rate": 4.419591138826495e-06, - "loss": 0.48798668384552, - "mean_token_accuracy": 0.8281317353248596, - "num_tokens": 9285413.0, - "step": 1029 - }, - { - "epoch": 0.7826747720364742, - "grad_norm": 1.3273087739944458, - "learning_rate": 4.418248699712955e-06, - "loss": 0.4611460864543915, - "mean_token_accuracy": 0.8233213424682617, - "num_tokens": 9300805.0, - "step": 1030 - }, - { - "epoch": 0.7834346504559271, - "grad_norm": 1.0473746061325073, - "learning_rate": 4.416904914298637e-06, - "loss": 0.36537665128707886, - "mean_token_accuracy": 0.8671857118606567, - "num_tokens": 9320035.0, - "step": 1031 - }, - { - "epoch": 0.78419452887538, - "grad_norm": 1.9130918979644775, - "learning_rate": 4.415559783526661e-06, - "loss": 0.4916655123233795, - "mean_token_accuracy": 0.8266351222991943, - "num_tokens": 9326795.0, - "step": 1032 - }, - { - "epoch": 0.7849544072948328, - "grad_norm": 2.0001816749572754, - "learning_rate": 4.414213308341092e-06, - "loss": 0.5711008310317993, - "mean_token_accuracy": 0.8093076348304749, - "num_tokens": 9335625.0, - "step": 1033 - }, - { - "epoch": 0.7857142857142857, - "grad_norm": 3.933542251586914, - "learning_rate": 4.412865489686936e-06, - "loss": 0.621616542339325, - "mean_token_accuracy": 0.7938898801803589, - "num_tokens": 9339080.0, - "step": 1034 - }, - { - "epoch": 0.7864741641337386, - "grad_norm": 2.061558961868286, - "learning_rate": 4.411516328510145e-06, - "loss": 0.583686113357544, - "mean_token_accuracy": 0.8216883540153503, - "num_tokens": 9348581.0, - "step": 1035 - }, - { - "epoch": 0.7872340425531915, - "grad_norm": 1.9401264190673828, - "learning_rate": 4.410165825757613e-06, - "loss": 0.4905240535736084, - "mean_token_accuracy": 0.8229951858520508, - "num_tokens": 9356032.0, - "step": 1036 - }, - { - "epoch": 0.7879939209726444, - "grad_norm": 3.620547294616699, - "learning_rate": 4.408813982377175e-06, - "loss": 0.4269888997077942, - "mean_token_accuracy": 0.8713940978050232, - "num_tokens": 9359061.0, - "step": 1037 - }, - { - "epoch": 0.7887537993920972, - "grad_norm": 1.2027851343154907, - "learning_rate": 4.407460799317605e-06, - "loss": 0.39972418546676636, - "mean_token_accuracy": 0.8610097765922546, - "num_tokens": 9377068.0, - "step": 1038 - }, - { - "epoch": 0.7895136778115501, - "grad_norm": 2.566753387451172, - "learning_rate": 4.40610627752862e-06, - "loss": 0.45267152786254883, - "mean_token_accuracy": 0.83243328332901, - "num_tokens": 9383604.0, - "step": 1039 - }, - { - "epoch": 0.790273556231003, - "grad_norm": 2.940094470977783, - "learning_rate": 4.404750417960876e-06, - "loss": 0.42862242460250854, - "mean_token_accuracy": 0.8582849502563477, - "num_tokens": 9387541.0, - "step": 1040 - }, - { - "epoch": 0.791033434650456, - "grad_norm": 2.0223944187164307, - "learning_rate": 4.403393221565966e-06, - "loss": 0.4349963665008545, - "mean_token_accuracy": 0.8453047871589661, - "num_tokens": 9394382.0, - "step": 1041 - }, - { - "epoch": 0.7917933130699089, - "grad_norm": 2.9399030208587646, - "learning_rate": 4.402034689296425e-06, - "loss": 0.32197174429893494, - "mean_token_accuracy": 0.8953392505645752, - "num_tokens": 9397741.0, - "step": 1042 - }, - { - "epoch": 0.7925531914893617, - "grad_norm": 2.819016456604004, - "learning_rate": 4.400674822105721e-06, - "loss": 0.6790289878845215, - "mean_token_accuracy": 0.8135063648223877, - "num_tokens": 9403509.0, - "step": 1043 - }, - { - "epoch": 0.7933130699088146, - "grad_norm": 1.3225977420806885, - "learning_rate": 4.399313620948262e-06, - "loss": 0.42203834652900696, - "mean_token_accuracy": 0.8399381637573242, - "num_tokens": 9418870.0, - "step": 1044 - }, - { - "epoch": 0.7940729483282675, - "grad_norm": 1.7822176218032837, - "learning_rate": 4.397951086779392e-06, - "loss": 0.4666554927825928, - "mean_token_accuracy": 0.8364764451980591, - "num_tokens": 9427640.0, - "step": 1045 - }, - { - "epoch": 0.7948328267477204, - "grad_norm": 3.186439037322998, - "learning_rate": 4.396587220555389e-06, - "loss": 0.6048363447189331, - "mean_token_accuracy": 0.7806557416915894, - "num_tokens": 9431927.0, - "step": 1046 - }, - { - "epoch": 0.7955927051671733, - "grad_norm": 3.0804805755615234, - "learning_rate": 4.395222023233467e-06, - "loss": 0.445969820022583, - "mean_token_accuracy": 0.850671112537384, - "num_tokens": 9436136.0, - "step": 1047 - }, - { - "epoch": 0.7963525835866262, - "grad_norm": 1.675968885421753, - "learning_rate": 4.393855495771774e-06, - "loss": 0.4311422109603882, - "mean_token_accuracy": 0.8449079990386963, - "num_tokens": 9445189.0, - "step": 1048 - }, - { - "epoch": 0.797112462006079, - "grad_norm": 2.342410087585449, - "learning_rate": 4.3924876391293915e-06, - "loss": 0.5733606219291687, - "mean_token_accuracy": 0.8156592845916748, - "num_tokens": 9451939.0, - "step": 1049 - }, - { - "epoch": 0.7978723404255319, - "grad_norm": 1.5967470407485962, - "learning_rate": 4.391118454266335e-06, - "loss": 0.46664729714393616, - "mean_token_accuracy": 0.8091695308685303, - "num_tokens": 9463968.0, - "step": 1050 - }, - { - "epoch": 0.7986322188449848, - "grad_norm": 1.5777863264083862, - "learning_rate": 4.389747942143549e-06, - "loss": 0.46028903126716614, - "mean_token_accuracy": 0.8347330093383789, - "num_tokens": 9475561.0, - "step": 1051 - }, - { - "epoch": 0.7993920972644377, - "grad_norm": 2.7630488872528076, - "learning_rate": 4.388376103722914e-06, - "loss": 0.5618188977241516, - "mean_token_accuracy": 0.8273467421531677, - "num_tokens": 9480661.0, - "step": 1052 - }, - { - "epoch": 0.8001519756838906, - "grad_norm": 2.093397378921509, - "learning_rate": 4.387002939967237e-06, - "loss": 0.2998353838920593, - "mean_token_accuracy": 0.8905231952667236, - "num_tokens": 9485924.0, - "step": 1053 - }, - { - "epoch": 0.8009118541033434, - "grad_norm": 1.4385871887207031, - "learning_rate": 4.38562845184026e-06, - "loss": 0.4944111704826355, - "mean_token_accuracy": 0.8403056263923645, - "num_tokens": 9500128.0, - "step": 1054 - }, - { - "epoch": 0.8016717325227963, - "grad_norm": 1.6393156051635742, - "learning_rate": 4.384252640306649e-06, - "loss": 0.5727907419204712, - "mean_token_accuracy": 0.7849414348602295, - "num_tokens": 9511569.0, - "step": 1055 - }, - { - "epoch": 0.8024316109422492, - "grad_norm": 2.3909664154052734, - "learning_rate": 4.382875506332002e-06, - "loss": 0.4760419726371765, - "mean_token_accuracy": 0.8408266305923462, - "num_tokens": 9517244.0, - "step": 1056 - }, - { - "epoch": 0.8031914893617021, - "grad_norm": 1.7288594245910645, - "learning_rate": 4.381497050882845e-06, - "loss": 0.5375926494598389, - "mean_token_accuracy": 0.8138614892959595, - "num_tokens": 9528736.0, - "step": 1057 - }, - { - "epoch": 0.8039513677811551, - "grad_norm": 2.093407392501831, - "learning_rate": 4.380117274926632e-06, - "loss": 0.46659404039382935, - "mean_token_accuracy": 0.8450702428817749, - "num_tokens": 9536200.0, - "step": 1058 - }, - { - "epoch": 0.8047112462006079, - "grad_norm": 1.6835898160934448, - "learning_rate": 4.3787361794317405e-06, - "loss": 0.43157699704170227, - "mean_token_accuracy": 0.8279973268508911, - "num_tokens": 9546314.0, - "step": 1059 - }, - { - "epoch": 0.8054711246200608, - "grad_norm": 1.983067512512207, - "learning_rate": 4.377353765367479e-06, - "loss": 0.5021739602088928, - "mean_token_accuracy": 0.8274815082550049, - "num_tokens": 9554375.0, - "step": 1060 - }, - { - "epoch": 0.8062310030395137, - "grad_norm": 2.0472030639648438, - "learning_rate": 4.375970033704078e-06, - "loss": 0.34298190474510193, - "mean_token_accuracy": 0.8900876045227051, - "num_tokens": 9560230.0, - "step": 1061 - }, - { - "epoch": 0.8069908814589666, - "grad_norm": 1.9613717794418335, - "learning_rate": 4.374584985412692e-06, - "loss": 0.3826758861541748, - "mean_token_accuracy": 0.839923620223999, - "num_tokens": 9566809.0, - "step": 1062 - }, - { - "epoch": 0.8077507598784195, - "grad_norm": 1.991289496421814, - "learning_rate": 4.373198621465405e-06, - "loss": 0.5492525100708008, - "mean_token_accuracy": 0.8153272867202759, - "num_tokens": 9576810.0, - "step": 1063 - }, - { - "epoch": 0.8085106382978723, - "grad_norm": 2.421370506286621, - "learning_rate": 4.3718109428352155e-06, - "loss": 0.5240297317504883, - "mean_token_accuracy": 0.8087242245674133, - "num_tokens": 9582906.0, - "step": 1064 - }, - { - "epoch": 0.8092705167173252, - "grad_norm": 3.697765588760376, - "learning_rate": 4.370421950496055e-06, - "loss": 0.6096476912498474, - "mean_token_accuracy": 0.787585973739624, - "num_tokens": 9586920.0, - "step": 1065 - }, - { - "epoch": 0.8100303951367781, - "grad_norm": 2.0767786502838135, - "learning_rate": 4.369031645422768e-06, - "loss": 0.41120079159736633, - "mean_token_accuracy": 0.8513731956481934, - "num_tokens": 9593902.0, - "step": 1066 - }, - { - "epoch": 0.810790273556231, - "grad_norm": 2.5968732833862305, - "learning_rate": 4.367640028591126e-06, - "loss": 0.3364982008934021, - "mean_token_accuracy": 0.8786963224411011, - "num_tokens": 9597745.0, - "step": 1067 - }, - { - "epoch": 0.8115501519756839, - "grad_norm": 2.165742874145508, - "learning_rate": 4.366247100977818e-06, - "loss": 0.406129390001297, - "mean_token_accuracy": 0.868243932723999, - "num_tokens": 9603496.0, - "step": 1068 - }, - { - "epoch": 0.8123100303951368, - "grad_norm": 2.0493404865264893, - "learning_rate": 4.364852863560456e-06, - "loss": 0.5356296300888062, - "mean_token_accuracy": 0.8191947340965271, - "num_tokens": 9610898.0, - "step": 1069 - }, - { - "epoch": 0.8130699088145896, - "grad_norm": 2.3224308490753174, - "learning_rate": 4.363457317317568e-06, - "loss": 0.41461923718452454, - "mean_token_accuracy": 0.8537945747375488, - "num_tokens": 9616626.0, - "step": 1070 - }, - { - "epoch": 0.8138297872340425, - "grad_norm": 1.7387986183166504, - "learning_rate": 4.362060463228603e-06, - "loss": 0.5134786367416382, - "mean_token_accuracy": 0.8511737585067749, - "num_tokens": 9626223.0, - "step": 1071 - }, - { - "epoch": 0.8145896656534954, - "grad_norm": 3.0270655155181885, - "learning_rate": 4.360662302273926e-06, - "loss": 0.3410695791244507, - "mean_token_accuracy": 0.8746449947357178, - "num_tokens": 9629455.0, - "step": 1072 - }, - { - "epoch": 0.8153495440729484, - "grad_norm": 1.7727062702178955, - "learning_rate": 4.35926283543482e-06, - "loss": 0.4610968828201294, - "mean_token_accuracy": 0.8444793224334717, - "num_tokens": 9638070.0, - "step": 1073 - }, - { - "epoch": 0.8161094224924013, - "grad_norm": 3.6333565711975098, - "learning_rate": 4.357862063693486e-06, - "loss": 0.3881273865699768, - "mean_token_accuracy": 0.8757344484329224, - "num_tokens": 9641028.0, - "step": 1074 - }, - { - "epoch": 0.8168693009118541, - "grad_norm": 3.024042844772339, - "learning_rate": 4.356459988033039e-06, - "loss": 0.3853808641433716, - "mean_token_accuracy": 0.8602254390716553, - "num_tokens": 9645730.0, - "step": 1075 - }, - { - "epoch": 0.817629179331307, - "grad_norm": 2.3359482288360596, - "learning_rate": 4.355056609437509e-06, - "loss": 0.4852045476436615, - "mean_token_accuracy": 0.8502728343009949, - "num_tokens": 9650975.0, - "step": 1076 - }, - { - "epoch": 0.8183890577507599, - "grad_norm": 2.2390685081481934, - "learning_rate": 4.353651928891842e-06, - "loss": 0.5287341475486755, - "mean_token_accuracy": 0.8247801065444946, - "num_tokens": 9657471.0, - "step": 1077 - }, - { - "epoch": 0.8191489361702128, - "grad_norm": 2.3809144496917725, - "learning_rate": 4.352245947381897e-06, - "loss": 0.5218510627746582, - "mean_token_accuracy": 0.8149170875549316, - "num_tokens": 9664108.0, - "step": 1078 - }, - { - "epoch": 0.8199088145896657, - "grad_norm": 1.7072309255599976, - "learning_rate": 4.3508386658944455e-06, - "loss": 0.46481168270111084, - "mean_token_accuracy": 0.834963321685791, - "num_tokens": 9673175.0, - "step": 1079 - }, - { - "epoch": 0.8206686930091185, - "grad_norm": 1.7383702993392944, - "learning_rate": 4.349430085417171e-06, - "loss": 0.4505952000617981, - "mean_token_accuracy": 0.8507769107818604, - "num_tokens": 9682800.0, - "step": 1080 - }, - { - "epoch": 0.8214285714285714, - "grad_norm": 2.4308547973632812, - "learning_rate": 4.348020206938672e-06, - "loss": 0.4832455515861511, - "mean_token_accuracy": 0.8538393974304199, - "num_tokens": 9688123.0, - "step": 1081 - }, - { - "epoch": 0.8221884498480243, - "grad_norm": 2.2686192989349365, - "learning_rate": 4.3466090314484526e-06, - "loss": 0.5112563371658325, - "mean_token_accuracy": 0.8308460712432861, - "num_tokens": 9694299.0, - "step": 1082 - }, - { - "epoch": 0.8229483282674772, - "grad_norm": 2.806093454360962, - "learning_rate": 4.345196559936931e-06, - "loss": 0.4818246364593506, - "mean_token_accuracy": 0.86617112159729, - "num_tokens": 9698471.0, - "step": 1083 - }, - { - "epoch": 0.8237082066869301, - "grad_norm": 1.7340706586837769, - "learning_rate": 4.343782793395435e-06, - "loss": 0.38246971368789673, - "mean_token_accuracy": 0.8675198554992676, - "num_tokens": 9706444.0, - "step": 1084 - }, - { - "epoch": 0.824468085106383, - "grad_norm": 1.664942741394043, - "learning_rate": 4.3423677328162e-06, - "loss": 0.498797208070755, - "mean_token_accuracy": 0.8447319865226746, - "num_tokens": 9716765.0, - "step": 1085 - }, - { - "epoch": 0.8252279635258358, - "grad_norm": 1.3608235120773315, - "learning_rate": 4.340951379192369e-06, - "loss": 0.41961491107940674, - "mean_token_accuracy": 0.8339346647262573, - "num_tokens": 9729564.0, - "step": 1086 - }, - { - "epoch": 0.8259878419452887, - "grad_norm": 1.642503261566162, - "learning_rate": 4.3395337335179945e-06, - "loss": 0.5477945804595947, - "mean_token_accuracy": 0.8117889761924744, - "num_tokens": 9741217.0, - "step": 1087 - }, - { - "epoch": 0.8267477203647416, - "grad_norm": 3.0345044136047363, - "learning_rate": 4.338114796788035e-06, - "loss": 0.5024623870849609, - "mean_token_accuracy": 0.8333141207695007, - "num_tokens": 9744941.0, - "step": 1088 - }, - { - "epoch": 0.8275075987841946, - "grad_norm": 1.3096630573272705, - "learning_rate": 4.336694569998354e-06, - "loss": 0.44169723987579346, - "mean_token_accuracy": 0.859926700592041, - "num_tokens": 9757854.0, - "step": 1089 - }, - { - "epoch": 0.8282674772036475, - "grad_norm": 2.203279495239258, - "learning_rate": 4.3352730541457215e-06, - "loss": 0.5283265113830566, - "mean_token_accuracy": 0.8053759932518005, - "num_tokens": 9764096.0, - "step": 1090 - }, - { - "epoch": 0.8290273556231003, - "grad_norm": 1.3774312734603882, - "learning_rate": 4.333850250227814e-06, - "loss": 0.4584103226661682, - "mean_token_accuracy": 0.8342611193656921, - "num_tokens": 9777768.0, - "step": 1091 - }, - { - "epoch": 0.8297872340425532, - "grad_norm": 1.822637915611267, - "learning_rate": 4.332426159243206e-06, - "loss": 0.5432791709899902, - "mean_token_accuracy": 0.8136210441589355, - "num_tokens": 9791276.0, - "step": 1092 - }, - { - "epoch": 0.8305471124620061, - "grad_norm": 3.0190067291259766, - "learning_rate": 4.331000782191384e-06, - "loss": 0.5018150806427002, - "mean_token_accuracy": 0.8234807252883911, - "num_tokens": 9794902.0, - "step": 1093 - }, - { - "epoch": 0.831306990881459, - "grad_norm": 2.09987735748291, - "learning_rate": 4.329574120072728e-06, - "loss": 0.4270891547203064, - "mean_token_accuracy": 0.8544977903366089, - "num_tokens": 9800903.0, - "step": 1094 - }, - { - "epoch": 0.8320668693009119, - "grad_norm": 1.969549536705017, - "learning_rate": 4.328146173888528e-06, - "loss": 0.45801427960395813, - "mean_token_accuracy": 0.8334714770317078, - "num_tokens": 9808719.0, - "step": 1095 - }, - { - "epoch": 0.8328267477203647, - "grad_norm": 1.4565571546554565, - "learning_rate": 4.32671694464097e-06, - "loss": 0.34864288568496704, - "mean_token_accuracy": 0.8689061999320984, - "num_tokens": 9818262.0, - "step": 1096 - }, - { - "epoch": 0.8335866261398176, - "grad_norm": 1.2163832187652588, - "learning_rate": 4.3252864333331424e-06, - "loss": 0.37953704595565796, - "mean_token_accuracy": 0.866554856300354, - "num_tokens": 9833942.0, - "step": 1097 - }, - { - "epoch": 0.8343465045592705, - "grad_norm": 1.6112010478973389, - "learning_rate": 4.323854640969033e-06, - "loss": 0.5442801713943481, - "mean_token_accuracy": 0.8190416097640991, - "num_tokens": 9844765.0, - "step": 1098 - }, - { - "epoch": 0.8351063829787234, - "grad_norm": 1.8190315961837769, - "learning_rate": 4.322421568553529e-06, - "loss": 0.48271381855010986, - "mean_token_accuracy": 0.8203652501106262, - "num_tokens": 9852625.0, - "step": 1099 - }, - { - "epoch": 0.8358662613981763, - "grad_norm": 2.7897756099700928, - "learning_rate": 4.320987217092416e-06, - "loss": 0.4086323380470276, - "mean_token_accuracy": 0.8504934310913086, - "num_tokens": 9856888.0, - "step": 1100 - }, - { - "epoch": 0.8366261398176292, - "grad_norm": 1.7035977840423584, - "learning_rate": 4.319551587592377e-06, - "loss": 0.6325064301490784, - "mean_token_accuracy": 0.788190484046936, - "num_tokens": 9869419.0, - "step": 1101 - }, - { - "epoch": 0.837386018237082, - "grad_norm": 2.609731912612915, - "learning_rate": 4.318114681060989e-06, - "loss": 0.519314706325531, - "mean_token_accuracy": 0.8469992280006409, - "num_tokens": 9874553.0, - "step": 1102 - }, - { - "epoch": 0.8381458966565349, - "grad_norm": 1.2519766092300415, - "learning_rate": 4.316676498506735e-06, - "loss": 0.3566005825996399, - "mean_token_accuracy": 0.8588439226150513, - "num_tokens": 9886498.0, - "step": 1103 - }, - { - "epoch": 0.8389057750759878, - "grad_norm": 1.430892825126648, - "learning_rate": 4.3152370409389795e-06, - "loss": 0.5250182747840881, - "mean_token_accuracy": 0.8164948225021362, - "num_tokens": 9900256.0, - "step": 1104 - }, - { - "epoch": 0.8396656534954408, - "grad_norm": 3.1245436668395996, - "learning_rate": 4.3137963093679945e-06, - "loss": 0.3173971176147461, - "mean_token_accuracy": 0.8835347890853882, - "num_tokens": 9903899.0, - "step": 1105 - }, - { - "epoch": 0.8404255319148937, - "grad_norm": 3.131812572479248, - "learning_rate": 4.3123543048049395e-06, - "loss": 0.6567763090133667, - "mean_token_accuracy": 0.8233605027198792, - "num_tokens": 9908798.0, - "step": 1106 - }, - { - "epoch": 0.8411854103343465, - "grad_norm": 1.3551725149154663, - "learning_rate": 4.310911028261867e-06, - "loss": 0.3993729054927826, - "mean_token_accuracy": 0.8529655933380127, - "num_tokens": 9922577.0, - "step": 1107 - }, - { - "epoch": 0.8419452887537994, - "grad_norm": 2.572533130645752, - "learning_rate": 4.309466480751726e-06, - "loss": 0.40906503796577454, - "mean_token_accuracy": 0.8630726933479309, - "num_tokens": 9926890.0, - "step": 1108 - }, - { - "epoch": 0.8427051671732523, - "grad_norm": 1.9146469831466675, - "learning_rate": 4.308020663288356e-06, - "loss": 0.48423194885253906, - "mean_token_accuracy": 0.8370280861854553, - "num_tokens": 9934293.0, - "step": 1109 - }, - { - "epoch": 0.8434650455927052, - "grad_norm": 1.6178001165390015, - "learning_rate": 4.306573576886485e-06, - "loss": 0.4262213408946991, - "mean_token_accuracy": 0.839401125907898, - "num_tokens": 9944513.0, - "step": 1110 - }, - { - "epoch": 0.8442249240121581, - "grad_norm": 2.4444572925567627, - "learning_rate": 4.305125222561736e-06, - "loss": 0.5199950933456421, - "mean_token_accuracy": 0.8507720232009888, - "num_tokens": 9949512.0, - "step": 1111 - }, - { - "epoch": 0.8449848024316109, - "grad_norm": 1.7983134984970093, - "learning_rate": 4.303675601330618e-06, - "loss": 0.36155956983566284, - "mean_token_accuracy": 0.8568712472915649, - "num_tokens": 9956402.0, - "step": 1112 - }, - { - "epoch": 0.8457446808510638, - "grad_norm": 2.391096353530884, - "learning_rate": 4.302224714210532e-06, - "loss": 0.5391949415206909, - "mean_token_accuracy": 0.8183057308197021, - "num_tokens": 9961606.0, - "step": 1113 - }, - { - "epoch": 0.8465045592705167, - "grad_norm": 1.8520214557647705, - "learning_rate": 4.3007725622197675e-06, - "loss": 0.5758882761001587, - "mean_token_accuracy": 0.7924330234527588, - "num_tokens": 9971473.0, - "step": 1114 - }, - { - "epoch": 0.8472644376899696, - "grad_norm": 2.436640739440918, - "learning_rate": 4.2993191463775e-06, - "loss": 0.3837985396385193, - "mean_token_accuracy": 0.8620110750198364, - "num_tokens": 9976333.0, - "step": 1115 - }, - { - "epoch": 0.8480243161094225, - "grad_norm": 1.7287120819091797, - "learning_rate": 4.29786446770379e-06, - "loss": 0.40066856145858765, - "mean_token_accuracy": 0.8618333339691162, - "num_tokens": 9985617.0, - "step": 1116 - }, - { - "epoch": 0.8487841945288754, - "grad_norm": 2.0310518741607666, - "learning_rate": 4.296408527219592e-06, - "loss": 0.5465943217277527, - "mean_token_accuracy": 0.812044620513916, - "num_tokens": 9995363.0, - "step": 1117 - }, - { - "epoch": 0.8495440729483282, - "grad_norm": 1.4858589172363281, - "learning_rate": 4.294951325946737e-06, - "loss": 0.45840176939964294, - "mean_token_accuracy": 0.8432979583740234, - "num_tokens": 10006400.0, - "step": 1118 - }, - { - "epoch": 0.8503039513677811, - "grad_norm": 1.6153514385223389, - "learning_rate": 4.293492864907947e-06, - "loss": 0.5225611925125122, - "mean_token_accuracy": 0.8180211186408997, - "num_tokens": 10018352.0, - "step": 1119 - }, - { - "epoch": 0.851063829787234, - "grad_norm": 2.1178412437438965, - "learning_rate": 4.2920331451268246e-06, - "loss": 0.5580621361732483, - "mean_token_accuracy": 0.8211709260940552, - "num_tokens": 10025614.0, - "step": 1120 - }, - { - "epoch": 0.851823708206687, - "grad_norm": 2.036839246749878, - "learning_rate": 4.2905721676278585e-06, - "loss": 0.4658433198928833, - "mean_token_accuracy": 0.8380423784255981, - "num_tokens": 10032489.0, - "step": 1121 - }, - { - "epoch": 0.8525835866261399, - "grad_norm": 2.0056262016296387, - "learning_rate": 4.28910993343642e-06, - "loss": 0.47023308277130127, - "mean_token_accuracy": 0.8340359926223755, - "num_tokens": 10040050.0, - "step": 1122 - }, - { - "epoch": 0.8533434650455927, - "grad_norm": 2.540024518966675, - "learning_rate": 4.2876464435787576e-06, - "loss": 0.502303957939148, - "mean_token_accuracy": 0.8288739919662476, - "num_tokens": 10045042.0, - "step": 1123 - }, - { - "epoch": 0.8541033434650456, - "grad_norm": 1.7894693613052368, - "learning_rate": 4.286181699082008e-06, - "loss": 0.4732973575592041, - "mean_token_accuracy": 0.8340568542480469, - "num_tokens": 10054424.0, - "step": 1124 - }, - { - "epoch": 0.8548632218844985, - "grad_norm": 1.5601223707199097, - "learning_rate": 4.284715700974186e-06, - "loss": 0.472471684217453, - "mean_token_accuracy": 0.8274722695350647, - "num_tokens": 10065523.0, - "step": 1125 - }, - { - "epoch": 0.8556231003039514, - "grad_norm": 1.7326055765151978, - "learning_rate": 4.283248450284182e-06, - "loss": 0.5924872159957886, - "mean_token_accuracy": 0.7943467497825623, - "num_tokens": 10076839.0, - "step": 1126 - }, - { - "epoch": 0.8563829787234043, - "grad_norm": 1.5165479183197021, - "learning_rate": 4.281779948041772e-06, - "loss": 0.44768425822257996, - "mean_token_accuracy": 0.8394696712493896, - "num_tokens": 10088168.0, - "step": 1127 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 1.5448920726776123, - "learning_rate": 4.280310195277606e-06, - "loss": 0.4458175003528595, - "mean_token_accuracy": 0.835773229598999, - "num_tokens": 10100306.0, - "step": 1128 - }, - { - "epoch": 0.85790273556231, - "grad_norm": 1.6311609745025635, - "learning_rate": 4.278839193023214e-06, - "loss": 0.4158072769641876, - "mean_token_accuracy": 0.8482539653778076, - "num_tokens": 10110581.0, - "step": 1129 - }, - { - "epoch": 0.8586626139817629, - "grad_norm": 1.6714754104614258, - "learning_rate": 4.277366942311001e-06, - "loss": 0.3686875104904175, - "mean_token_accuracy": 0.8681533336639404, - "num_tokens": 10118799.0, - "step": 1130 - }, - { - "epoch": 0.8594224924012158, - "grad_norm": 2.1604413986206055, - "learning_rate": 4.2758934441742494e-06, - "loss": 0.37267982959747314, - "mean_token_accuracy": 0.8520427346229553, - "num_tokens": 10124734.0, - "step": 1131 - }, - { - "epoch": 0.8601823708206687, - "grad_norm": 2.123013973236084, - "learning_rate": 4.274418699647117e-06, - "loss": 0.49963313341140747, - "mean_token_accuracy": 0.8248758912086487, - "num_tokens": 10131965.0, - "step": 1132 - }, - { - "epoch": 0.8609422492401215, - "grad_norm": 1.4308786392211914, - "learning_rate": 4.272942709764638e-06, - "loss": 0.48666873574256897, - "mean_token_accuracy": 0.8304717540740967, - "num_tokens": 10145164.0, - "step": 1133 - }, - { - "epoch": 0.8617021276595744, - "grad_norm": 1.7952618598937988, - "learning_rate": 4.271465475562716e-06, - "loss": 0.5536223649978638, - "mean_token_accuracy": 0.8093959093093872, - "num_tokens": 10154083.0, - "step": 1134 - }, - { - "epoch": 0.8624620060790273, - "grad_norm": 2.0622456073760986, - "learning_rate": 4.269986998078132e-06, - "loss": 0.5173629522323608, - "mean_token_accuracy": 0.8285619020462036, - "num_tokens": 10161889.0, - "step": 1135 - }, - { - "epoch": 0.8632218844984803, - "grad_norm": 2.0707509517669678, - "learning_rate": 4.268507278348539e-06, - "loss": 0.5871608257293701, - "mean_token_accuracy": 0.7827386856079102, - "num_tokens": 10170726.0, - "step": 1136 - }, - { - "epoch": 0.8639817629179332, - "grad_norm": 2.054368257522583, - "learning_rate": 4.2670263174124615e-06, - "loss": 0.5788969993591309, - "mean_token_accuracy": 0.7967237234115601, - "num_tokens": 10178474.0, - "step": 1137 - }, - { - "epoch": 0.8647416413373861, - "grad_norm": 1.901846170425415, - "learning_rate": 4.265544116309294e-06, - "loss": 0.5405587553977966, - "mean_token_accuracy": 0.8151819705963135, - "num_tokens": 10187013.0, - "step": 1138 - }, - { - "epoch": 0.8655015197568389, - "grad_norm": 2.901285409927368, - "learning_rate": 4.264060676079302e-06, - "loss": 0.44101861119270325, - "mean_token_accuracy": 0.8433429002761841, - "num_tokens": 10191517.0, - "step": 1139 - }, - { - "epoch": 0.8662613981762918, - "grad_norm": 2.4168388843536377, - "learning_rate": 4.262575997763622e-06, - "loss": 0.4686204195022583, - "mean_token_accuracy": 0.8505309820175171, - "num_tokens": 10196948.0, - "step": 1140 - }, - { - "epoch": 0.8670212765957447, - "grad_norm": 1.9588396549224854, - "learning_rate": 4.2610900824042575e-06, - "loss": 0.47056013345718384, - "mean_token_accuracy": 0.8280024528503418, - "num_tokens": 10204292.0, - "step": 1141 - }, - { - "epoch": 0.8677811550151976, - "grad_norm": 2.569150924682617, - "learning_rate": 4.2596029310440826e-06, - "loss": 0.573108434677124, - "mean_token_accuracy": 0.8108246326446533, - "num_tokens": 10209571.0, - "step": 1142 - }, - { - "epoch": 0.8685410334346505, - "grad_norm": 2.038032293319702, - "learning_rate": 4.258114544726835e-06, - "loss": 0.40545332431793213, - "mean_token_accuracy": 0.8611703515052795, - "num_tokens": 10215716.0, - "step": 1143 - }, - { - "epoch": 0.8693009118541033, - "grad_norm": 1.9884231090545654, - "learning_rate": 4.256624924497124e-06, - "loss": 0.40085992217063904, - "mean_token_accuracy": 0.8615031242370605, - "num_tokens": 10222775.0, - "step": 1144 - }, - { - "epoch": 0.8700607902735562, - "grad_norm": 1.912842035293579, - "learning_rate": 4.25513407140042e-06, - "loss": 0.41022324562072754, - "mean_token_accuracy": 0.8459607362747192, - "num_tokens": 10229589.0, - "step": 1145 - }, - { - "epoch": 0.8708206686930091, - "grad_norm": 1.9190576076507568, - "learning_rate": 4.253641986483063e-06, - "loss": 0.5541447401046753, - "mean_token_accuracy": 0.8256468772888184, - "num_tokens": 10240633.0, - "step": 1146 - }, - { - "epoch": 0.871580547112462, - "grad_norm": 1.3742294311523438, - "learning_rate": 4.2521486707922545e-06, - "loss": 0.3680543899536133, - "mean_token_accuracy": 0.8654477596282959, - "num_tokens": 10251252.0, - "step": 1147 - }, - { - "epoch": 0.8723404255319149, - "grad_norm": 1.4438525438308716, - "learning_rate": 4.250654125376062e-06, - "loss": 0.45830875635147095, - "mean_token_accuracy": 0.8433834314346313, - "num_tokens": 10263980.0, - "step": 1148 - }, - { - "epoch": 0.8731003039513677, - "grad_norm": 2.1273653507232666, - "learning_rate": 4.249158351283414e-06, - "loss": 0.4129376709461212, - "mean_token_accuracy": 0.861556351184845, - "num_tokens": 10270426.0, - "step": 1149 - }, - { - "epoch": 0.8738601823708206, - "grad_norm": 2.598440647125244, - "learning_rate": 4.247661349564103e-06, - "loss": 0.418030709028244, - "mean_token_accuracy": 0.86553955078125, - "num_tokens": 10275493.0, - "step": 1150 - }, - { - "epoch": 0.8746200607902735, - "grad_norm": 1.6852490901947021, - "learning_rate": 4.246163121268782e-06, - "loss": 0.6403408050537109, - "mean_token_accuracy": 0.7966094017028809, - "num_tokens": 10287989.0, - "step": 1151 - }, - { - "epoch": 0.8753799392097265, - "grad_norm": 2.5013794898986816, - "learning_rate": 4.244663667448965e-06, - "loss": 0.49922505021095276, - "mean_token_accuracy": 0.8318735361099243, - "num_tokens": 10293360.0, - "step": 1152 - }, - { - "epoch": 0.8761398176291794, - "grad_norm": 1.2022709846496582, - "learning_rate": 4.243162989157027e-06, - "loss": 0.4414965510368347, - "mean_token_accuracy": 0.8338693380355835, - "num_tokens": 10310558.0, - "step": 1153 - }, - { - "epoch": 0.8768996960486323, - "grad_norm": 1.9903281927108765, - "learning_rate": 4.241661087446202e-06, - "loss": 0.4277610778808594, - "mean_token_accuracy": 0.8560749292373657, - "num_tokens": 10316983.0, - "step": 1154 - }, - { - "epoch": 0.8776595744680851, - "grad_norm": 2.104923725128174, - "learning_rate": 4.240157963370583e-06, - "loss": 0.44431713223457336, - "mean_token_accuracy": 0.8785282969474792, - "num_tokens": 10323294.0, - "step": 1155 - }, - { - "epoch": 0.878419452887538, - "grad_norm": 2.8364813327789307, - "learning_rate": 4.2386536179851175e-06, - "loss": 0.49948397278785706, - "mean_token_accuracy": 0.8305255174636841, - "num_tokens": 10327662.0, - "step": 1156 - }, - { - "epoch": 0.8791793313069909, - "grad_norm": 1.9493682384490967, - "learning_rate": 4.2371480523456156e-06, - "loss": 0.45867404341697693, - "mean_token_accuracy": 0.8373264074325562, - "num_tokens": 10335699.0, - "step": 1157 - }, - { - "epoch": 0.8799392097264438, - "grad_norm": 2.268616199493408, - "learning_rate": 4.235641267508741e-06, - "loss": 0.4547857940196991, - "mean_token_accuracy": 0.8252766132354736, - "num_tokens": 10342464.0, - "step": 1158 - }, - { - "epoch": 0.8806990881458967, - "grad_norm": 2.1334283351898193, - "learning_rate": 4.234133264532012e-06, - "loss": 0.39503124356269836, - "mean_token_accuracy": 0.8648351430892944, - "num_tokens": 10347514.0, - "step": 1159 - }, - { - "epoch": 0.8814589665653495, - "grad_norm": 1.2775357961654663, - "learning_rate": 4.232624044473805e-06, - "loss": 0.39945733547210693, - "mean_token_accuracy": 0.8369829654693604, - "num_tokens": 10363316.0, - "step": 1160 - }, - { - "epoch": 0.8822188449848024, - "grad_norm": 2.458413600921631, - "learning_rate": 4.231113608393348e-06, - "loss": 0.5020045638084412, - "mean_token_accuracy": 0.8295938968658447, - "num_tokens": 10368401.0, - "step": 1161 - }, - { - "epoch": 0.8829787234042553, - "grad_norm": 1.7464948892593384, - "learning_rate": 4.229601957350722e-06, - "loss": 0.5335392951965332, - "mean_token_accuracy": 0.8134858012199402, - "num_tokens": 10378337.0, - "step": 1162 - }, - { - "epoch": 0.8837386018237082, - "grad_norm": 3.1152119636535645, - "learning_rate": 4.228089092406863e-06, - "loss": 0.4811682105064392, - "mean_token_accuracy": 0.8460187315940857, - "num_tokens": 10382362.0, - "step": 1163 - }, - { - "epoch": 0.8844984802431611, - "grad_norm": 2.190847158432007, - "learning_rate": 4.226575014623557e-06, - "loss": 0.4428049921989441, - "mean_token_accuracy": 0.8382467031478882, - "num_tokens": 10388211.0, - "step": 1164 - }, - { - "epoch": 0.8852583586626139, - "grad_norm": 1.860153079032898, - "learning_rate": 4.225059725063444e-06, - "loss": 0.5265918970108032, - "mean_token_accuracy": 0.8181334733963013, - "num_tokens": 10398873.0, - "step": 1165 - }, - { - "epoch": 0.8860182370820668, - "grad_norm": 1.3372713327407837, - "learning_rate": 4.22354322479001e-06, - "loss": 0.43202850222587585, - "mean_token_accuracy": 0.8432420492172241, - "num_tokens": 10413158.0, - "step": 1166 - }, - { - "epoch": 0.8867781155015197, - "grad_norm": 1.3653379678726196, - "learning_rate": 4.222025514867596e-06, - "loss": 0.43780991435050964, - "mean_token_accuracy": 0.8441485166549683, - "num_tokens": 10428137.0, - "step": 1167 - }, - { - "epoch": 0.8875379939209727, - "grad_norm": 3.0230672359466553, - "learning_rate": 4.220506596361387e-06, - "loss": 0.6039337515830994, - "mean_token_accuracy": 0.8274872303009033, - "num_tokens": 10432586.0, - "step": 1168 - }, - { - "epoch": 0.8882978723404256, - "grad_norm": 2.2180392742156982, - "learning_rate": 4.218986470337419e-06, - "loss": 0.5453792810440063, - "mean_token_accuracy": 0.8127184510231018, - "num_tokens": 10439471.0, - "step": 1169 - }, - { - "epoch": 0.8890577507598785, - "grad_norm": 1.8519103527069092, - "learning_rate": 4.217465137862575e-06, - "loss": 0.5145469903945923, - "mean_token_accuracy": 0.8178654909133911, - "num_tokens": 10450471.0, - "step": 1170 - }, - { - "epoch": 0.8898176291793313, - "grad_norm": 2.034008026123047, - "learning_rate": 4.215942600004586e-06, - "loss": 0.44061461091041565, - "mean_token_accuracy": 0.8572084307670593, - "num_tokens": 10457382.0, - "step": 1171 - }, - { - "epoch": 0.8905775075987842, - "grad_norm": 3.4304304122924805, - "learning_rate": 4.214418857832025e-06, - "loss": 0.44397830963134766, - "mean_token_accuracy": 0.842149019241333, - "num_tokens": 10460650.0, - "step": 1172 - }, - { - "epoch": 0.8913373860182371, - "grad_norm": 1.9021750688552856, - "learning_rate": 4.212893912414316e-06, - "loss": 0.3769867420196533, - "mean_token_accuracy": 0.8806171417236328, - "num_tokens": 10468214.0, - "step": 1173 - }, - { - "epoch": 0.89209726443769, - "grad_norm": 1.9704062938690186, - "learning_rate": 4.211367764821722e-06, - "loss": 0.5501819849014282, - "mean_token_accuracy": 0.8176811337471008, - "num_tokens": 10476739.0, - "step": 1174 - }, - { - "epoch": 0.8928571428571429, - "grad_norm": 1.4350415468215942, - "learning_rate": 4.209840416125353e-06, - "loss": 0.41897401213645935, - "mean_token_accuracy": 0.8498011827468872, - "num_tokens": 10491769.0, - "step": 1175 - }, - { - "epoch": 0.8936170212765957, - "grad_norm": 3.8237783908843994, - "learning_rate": 4.208311867397162e-06, - "loss": 0.5296977162361145, - "mean_token_accuracy": 0.8168715834617615, - "num_tokens": 10494958.0, - "step": 1176 - }, - { - "epoch": 0.8943768996960486, - "grad_norm": 2.04784893989563, - "learning_rate": 4.206782119709942e-06, - "loss": 0.476105272769928, - "mean_token_accuracy": 0.834011435508728, - "num_tokens": 10502077.0, - "step": 1177 - }, - { - "epoch": 0.8951367781155015, - "grad_norm": 1.8839610815048218, - "learning_rate": 4.205251174137329e-06, - "loss": 0.49628815054893494, - "mean_token_accuracy": 0.8212119936943054, - "num_tokens": 10510077.0, - "step": 1178 - }, - { - "epoch": 0.8958966565349544, - "grad_norm": 1.2100634574890137, - "learning_rate": 4.2037190317538e-06, - "loss": 0.4931519329547882, - "mean_token_accuracy": 0.8170043230056763, - "num_tokens": 10528373.0, - "step": 1179 - }, - { - "epoch": 0.8966565349544073, - "grad_norm": 1.884637713432312, - "learning_rate": 4.202185693634671e-06, - "loss": 0.4913347363471985, - "mean_token_accuracy": 0.8234949707984924, - "num_tokens": 10537108.0, - "step": 1180 - }, - { - "epoch": 0.8974164133738601, - "grad_norm": 1.5062434673309326, - "learning_rate": 4.200651160856099e-06, - "loss": 0.4160492420196533, - "mean_token_accuracy": 0.845937192440033, - "num_tokens": 10547577.0, - "step": 1181 - }, - { - "epoch": 0.898176291793313, - "grad_norm": 2.331169605255127, - "learning_rate": 4.1991154344950755e-06, - "loss": 0.6532632112503052, - "mean_token_accuracy": 0.7743191123008728, - "num_tokens": 10556328.0, - "step": 1182 - }, - { - "epoch": 0.898936170212766, - "grad_norm": 1.3538362979888916, - "learning_rate": 4.197578515629435e-06, - "loss": 0.4437566101551056, - "mean_token_accuracy": 0.8427901268005371, - "num_tokens": 10570026.0, - "step": 1183 - }, - { - "epoch": 0.8996960486322189, - "grad_norm": 2.3828957080841064, - "learning_rate": 4.196040405337846e-06, - "loss": 0.6185290217399597, - "mean_token_accuracy": 0.7969824075698853, - "num_tokens": 10576465.0, - "step": 1184 - }, - { - "epoch": 0.9004559270516718, - "grad_norm": 2.4759042263031006, - "learning_rate": 4.194501104699813e-06, - "loss": 0.46489226818084717, - "mean_token_accuracy": 0.8472316265106201, - "num_tokens": 10582034.0, - "step": 1185 - }, - { - "epoch": 0.9012158054711246, - "grad_norm": 1.9215164184570312, - "learning_rate": 4.192960614795676e-06, - "loss": 0.48001551628112793, - "mean_token_accuracy": 0.8371596336364746, - "num_tokens": 10590556.0, - "step": 1186 - }, - { - "epoch": 0.9019756838905775, - "grad_norm": 2.2717080116271973, - "learning_rate": 4.19141893670661e-06, - "loss": 0.40083563327789307, - "mean_token_accuracy": 0.8464195728302002, - "num_tokens": 10595661.0, - "step": 1187 - }, - { - "epoch": 0.9027355623100304, - "grad_norm": 2.187122344970703, - "learning_rate": 4.189876071514624e-06, - "loss": 0.4942901134490967, - "mean_token_accuracy": 0.8186990022659302, - "num_tokens": 10603366.0, - "step": 1188 - }, - { - "epoch": 0.9034954407294833, - "grad_norm": 1.542414665222168, - "learning_rate": 4.188332020302561e-06, - "loss": 0.4731982946395874, - "mean_token_accuracy": 0.8487229347229004, - "num_tokens": 10616203.0, - "step": 1189 - }, - { - "epoch": 0.9042553191489362, - "grad_norm": 0.9957579970359802, - "learning_rate": 4.186786784154096e-06, - "loss": 0.33211836218833923, - "mean_token_accuracy": 0.870644748210907, - "num_tokens": 10633294.0, - "step": 1190 - }, - { - "epoch": 0.9050151975683891, - "grad_norm": 2.593867540359497, - "learning_rate": 4.1852403641537344e-06, - "loss": 0.6825464963912964, - "mean_token_accuracy": 0.7716869115829468, - "num_tokens": 10640615.0, - "step": 1191 - }, - { - "epoch": 0.9057750759878419, - "grad_norm": 2.0424516201019287, - "learning_rate": 4.183692761386813e-06, - "loss": 0.5672709941864014, - "mean_token_accuracy": 0.7973801493644714, - "num_tokens": 10649845.0, - "step": 1192 - }, - { - "epoch": 0.9065349544072948, - "grad_norm": 1.429018259048462, - "learning_rate": 4.1821439769395e-06, - "loss": 0.5427846908569336, - "mean_token_accuracy": 0.8200292587280273, - "num_tokens": 10665898.0, - "step": 1193 - }, - { - "epoch": 0.9072948328267477, - "grad_norm": 1.9764264822006226, - "learning_rate": 4.180594011898791e-06, - "loss": 0.4784567356109619, - "mean_token_accuracy": 0.82924485206604, - "num_tokens": 10673595.0, - "step": 1194 - }, - { - "epoch": 0.9080547112462006, - "grad_norm": 1.4004309177398682, - "learning_rate": 4.1790428673525104e-06, - "loss": 0.4791432023048401, - "mean_token_accuracy": 0.8334879875183105, - "num_tokens": 10687892.0, - "step": 1195 - }, - { - "epoch": 0.9088145896656535, - "grad_norm": 2.2207727432250977, - "learning_rate": 4.177490544389313e-06, - "loss": 0.5089365243911743, - "mean_token_accuracy": 0.8270776271820068, - "num_tokens": 10694911.0, - "step": 1196 - }, - { - "epoch": 0.9095744680851063, - "grad_norm": 2.2890450954437256, - "learning_rate": 4.175937044098678e-06, - "loss": 0.5152267813682556, - "mean_token_accuracy": 0.8527299165725708, - "num_tokens": 10700512.0, - "step": 1197 - }, - { - "epoch": 0.9103343465045592, - "grad_norm": 1.7938050031661987, - "learning_rate": 4.1743823675709115e-06, - "loss": 0.3507300615310669, - "mean_token_accuracy": 0.8694599866867065, - "num_tokens": 10707953.0, - "step": 1198 - }, - { - "epoch": 0.9110942249240122, - "grad_norm": 1.4368808269500732, - "learning_rate": 4.172826515897146e-06, - "loss": 0.407418429851532, - "mean_token_accuracy": 0.8432893753051758, - "num_tokens": 10717485.0, - "step": 1199 - }, - { - "epoch": 0.9118541033434651, - "grad_norm": 1.735339879989624, - "learning_rate": 4.171269490169337e-06, - "loss": 0.46996885538101196, - "mean_token_accuracy": 0.8331948518753052, - "num_tokens": 10726160.0, - "step": 1200 - }, - { - "epoch": 0.912613981762918, - "grad_norm": 1.7859221696853638, - "learning_rate": 4.1697112914802665e-06, - "loss": 0.5325199365615845, - "mean_token_accuracy": 0.8179605007171631, - "num_tokens": 10736284.0, - "step": 1201 - }, - { - "epoch": 0.9133738601823708, - "grad_norm": 2.6394896507263184, - "learning_rate": 4.168151920923536e-06, - "loss": 0.4039744734764099, - "mean_token_accuracy": 0.8545527458190918, - "num_tokens": 10740673.0, - "step": 1202 - }, - { - "epoch": 0.9141337386018237, - "grad_norm": 1.910988211631775, - "learning_rate": 4.1665913795935755e-06, - "loss": 0.5190291404724121, - "mean_token_accuracy": 0.8203921318054199, - "num_tokens": 10751946.0, - "step": 1203 - }, - { - "epoch": 0.9148936170212766, - "grad_norm": 3.0006964206695557, - "learning_rate": 4.16502966858563e-06, - "loss": 0.5856777429580688, - "mean_token_accuracy": 0.8061224222183228, - "num_tokens": 10756795.0, - "step": 1204 - }, - { - "epoch": 0.9156534954407295, - "grad_norm": 1.7396167516708374, - "learning_rate": 4.163466788995768e-06, - "loss": 0.54935222864151, - "mean_token_accuracy": 0.8052443265914917, - "num_tokens": 10767202.0, - "step": 1205 - }, - { - "epoch": 0.9164133738601824, - "grad_norm": 2.143735885620117, - "learning_rate": 4.161902741920881e-06, - "loss": 0.5020298361778259, - "mean_token_accuracy": 0.8249630928039551, - "num_tokens": 10774329.0, - "step": 1206 - }, - { - "epoch": 0.9171732522796353, - "grad_norm": 2.8871893882751465, - "learning_rate": 4.160337528458676e-06, - "loss": 0.5154489278793335, - "mean_token_accuracy": 0.8276848793029785, - "num_tokens": 10778929.0, - "step": 1207 - }, - { - "epoch": 0.9179331306990881, - "grad_norm": 1.4642788171768188, - "learning_rate": 4.15877114970768e-06, - "loss": 0.5033774375915527, - "mean_token_accuracy": 0.8296241164207458, - "num_tokens": 10790928.0, - "step": 1208 - }, - { - "epoch": 0.918693009118541, - "grad_norm": 1.8313497304916382, - "learning_rate": 4.1572036067672386e-06, - "loss": 0.5674909353256226, - "mean_token_accuracy": 0.7975562214851379, - "num_tokens": 10801372.0, - "step": 1209 - }, - { - "epoch": 0.9194528875379939, - "grad_norm": 2.005958080291748, - "learning_rate": 4.155634900737513e-06, - "loss": 0.5557019114494324, - "mean_token_accuracy": 0.8141391277313232, - "num_tokens": 10809150.0, - "step": 1210 - }, - { - "epoch": 0.9202127659574468, - "grad_norm": 2.333519697189331, - "learning_rate": 4.154065032719482e-06, - "loss": 0.6990420818328857, - "mean_token_accuracy": 0.7565394043922424, - "num_tokens": 10816612.0, - "step": 1211 - }, - { - "epoch": 0.9209726443768997, - "grad_norm": 1.4472655057907104, - "learning_rate": 4.152494003814939e-06, - "loss": 0.541398286819458, - "mean_token_accuracy": 0.8027358055114746, - "num_tokens": 10833840.0, - "step": 1212 - }, - { - "epoch": 0.9217325227963525, - "grad_norm": 1.6183619499206543, - "learning_rate": 4.150921815126493e-06, - "loss": 0.6096762418746948, - "mean_token_accuracy": 0.7994354963302612, - "num_tokens": 10846367.0, - "step": 1213 - }, - { - "epoch": 0.9224924012158054, - "grad_norm": 2.614919900894165, - "learning_rate": 4.149348467757566e-06, - "loss": 0.41846764087677, - "mean_token_accuracy": 0.8555068969726562, - "num_tokens": 10850836.0, - "step": 1214 - }, - { - "epoch": 0.9232522796352584, - "grad_norm": 1.4419831037521362, - "learning_rate": 4.147773962812393e-06, - "loss": 0.4139535427093506, - "mean_token_accuracy": 0.845671534538269, - "num_tokens": 10864228.0, - "step": 1215 - }, - { - "epoch": 0.9240121580547113, - "grad_norm": 2.3868865966796875, - "learning_rate": 4.146198301396025e-06, - "loss": 0.3357275128364563, - "mean_token_accuracy": 0.8829520344734192, - "num_tokens": 10868920.0, - "step": 1216 - }, - { - "epoch": 0.9247720364741642, - "grad_norm": 1.7685474157333374, - "learning_rate": 4.14462148461432e-06, - "loss": 0.45333072543144226, - "mean_token_accuracy": 0.8505891561508179, - "num_tokens": 10877286.0, - "step": 1217 - }, - { - "epoch": 0.925531914893617, - "grad_norm": 1.7627625465393066, - "learning_rate": 4.143043513573949e-06, - "loss": 0.5028705596923828, - "mean_token_accuracy": 0.825471043586731, - "num_tokens": 10887047.0, - "step": 1218 - }, - { - "epoch": 0.9262917933130699, - "grad_norm": 1.3168725967407227, - "learning_rate": 4.141464389382392e-06, - "loss": 0.5494637489318848, - "mean_token_accuracy": 0.8121747970581055, - "num_tokens": 10903599.0, - "step": 1219 - }, - { - "epoch": 0.9270516717325228, - "grad_norm": 2.5180399417877197, - "learning_rate": 4.13988411314794e-06, - "loss": 0.6134277582168579, - "mean_token_accuracy": 0.7983006834983826, - "num_tokens": 10909791.0, - "step": 1220 - }, - { - "epoch": 0.9278115501519757, - "grad_norm": 1.1889166831970215, - "learning_rate": 4.13830268597969e-06, - "loss": 0.36713096499443054, - "mean_token_accuracy": 0.8416121006011963, - "num_tokens": 10925794.0, - "step": 1221 - }, - { - "epoch": 0.9285714285714286, - "grad_norm": 2.142422676086426, - "learning_rate": 4.136720108987552e-06, - "loss": 0.4427933096885681, - "mean_token_accuracy": 0.8427745699882507, - "num_tokens": 10931622.0, - "step": 1222 - }, - { - "epoch": 0.9293313069908815, - "grad_norm": 1.908564567565918, - "learning_rate": 4.1351363832822364e-06, - "loss": 0.5088109374046326, - "mean_token_accuracy": 0.8309272527694702, - "num_tokens": 10940843.0, - "step": 1223 - }, - { - "epoch": 0.9300911854103343, - "grad_norm": 1.2862322330474854, - "learning_rate": 4.133551509975264e-06, - "loss": 0.3963761329650879, - "mean_token_accuracy": 0.8602159023284912, - "num_tokens": 10954481.0, - "step": 1224 - }, - { - "epoch": 0.9308510638297872, - "grad_norm": 1.5876200199127197, - "learning_rate": 4.13196549017896e-06, - "loss": 0.4311184287071228, - "mean_token_accuracy": 0.8460899591445923, - "num_tokens": 10963501.0, - "step": 1225 - }, - { - "epoch": 0.9316109422492401, - "grad_norm": 2.459878444671631, - "learning_rate": 4.130378325006453e-06, - "loss": 0.5016295313835144, - "mean_token_accuracy": 0.8125218152999878, - "num_tokens": 10968850.0, - "step": 1226 - }, - { - "epoch": 0.932370820668693, - "grad_norm": 2.059718370437622, - "learning_rate": 4.128790015571679e-06, - "loss": 0.48982277512550354, - "mean_token_accuracy": 0.8327049016952515, - "num_tokens": 10976642.0, - "step": 1227 - }, - { - "epoch": 0.9331306990881459, - "grad_norm": 1.3719185590744019, - "learning_rate": 4.127200562989372e-06, - "loss": 0.38778752088546753, - "mean_token_accuracy": 0.8623501062393188, - "num_tokens": 10988703.0, - "step": 1228 - }, - { - "epoch": 0.9338905775075987, - "grad_norm": 1.302140712738037, - "learning_rate": 4.125609968375073e-06, - "loss": 0.4887842535972595, - "mean_token_accuracy": 0.8322232961654663, - "num_tokens": 11005981.0, - "step": 1229 - }, - { - "epoch": 0.9346504559270516, - "grad_norm": 1.819624423980713, - "learning_rate": 4.12401823284512e-06, - "loss": 0.49825209379196167, - "mean_token_accuracy": 0.8278916478157043, - "num_tokens": 11014145.0, - "step": 1230 - }, - { - "epoch": 0.9354103343465046, - "grad_norm": 1.2762807607650757, - "learning_rate": 4.122425357516658e-06, - "loss": 0.433994323015213, - "mean_token_accuracy": 0.853028416633606, - "num_tokens": 11029232.0, - "step": 1231 - }, - { - "epoch": 0.9361702127659575, - "grad_norm": 2.2171671390533447, - "learning_rate": 4.1208313435076255e-06, - "loss": 0.38436949253082275, - "mean_token_accuracy": 0.8616260290145874, - "num_tokens": 11034743.0, - "step": 1232 - }, - { - "epoch": 0.9369300911854104, - "grad_norm": 1.355879545211792, - "learning_rate": 4.119236191936764e-06, - "loss": 0.5378084182739258, - "mean_token_accuracy": 0.8256701231002808, - "num_tokens": 11048149.0, - "step": 1233 - }, - { - "epoch": 0.9376899696048632, - "grad_norm": 2.66812801361084, - "learning_rate": 4.117639903923611e-06, - "loss": 0.5236451625823975, - "mean_token_accuracy": 0.8431973457336426, - "num_tokens": 11052295.0, - "step": 1234 - }, - { - "epoch": 0.9384498480243161, - "grad_norm": 1.5740545988082886, - "learning_rate": 4.116042480588505e-06, - "loss": 0.44322824478149414, - "mean_token_accuracy": 0.8436908721923828, - "num_tokens": 11062066.0, - "step": 1235 - }, - { - "epoch": 0.939209726443769, - "grad_norm": 1.230706810951233, - "learning_rate": 4.114443923052577e-06, - "loss": 0.3325323462486267, - "mean_token_accuracy": 0.8674666881561279, - "num_tokens": 11074300.0, - "step": 1236 - }, - { - "epoch": 0.9399696048632219, - "grad_norm": 1.9870070219039917, - "learning_rate": 4.112844232437757e-06, - "loss": 0.5711548328399658, - "mean_token_accuracy": 0.8081738948822021, - "num_tokens": 11082297.0, - "step": 1237 - }, - { - "epoch": 0.9407294832826748, - "grad_norm": 1.3020970821380615, - "learning_rate": 4.11124340986677e-06, - "loss": 0.4187922477722168, - "mean_token_accuracy": 0.8566171526908875, - "num_tokens": 11096810.0, - "step": 1238 - }, - { - "epoch": 0.9414893617021277, - "grad_norm": 2.1399197578430176, - "learning_rate": 4.109641456463135e-06, - "loss": 0.5293116569519043, - "mean_token_accuracy": 0.8176157474517822, - "num_tokens": 11102761.0, - "step": 1239 - }, - { - "epoch": 0.9422492401215805, - "grad_norm": 1.3503763675689697, - "learning_rate": 4.108038373351163e-06, - "loss": 0.4907652735710144, - "mean_token_accuracy": 0.8204987049102783, - "num_tokens": 11118480.0, - "step": 1240 - }, - { - "epoch": 0.9430091185410334, - "grad_norm": 1.9571399688720703, - "learning_rate": 4.106434161655962e-06, - "loss": 0.4709656536579132, - "mean_token_accuracy": 0.8371885418891907, - "num_tokens": 11126265.0, - "step": 1241 - }, - { - "epoch": 0.9437689969604863, - "grad_norm": 2.1277313232421875, - "learning_rate": 4.104828822503427e-06, - "loss": 0.4010283350944519, - "mean_token_accuracy": 0.8586333990097046, - "num_tokens": 11133022.0, - "step": 1242 - }, - { - "epoch": 0.9445288753799392, - "grad_norm": 1.6745036840438843, - "learning_rate": 4.103222357020248e-06, - "loss": 0.562545657157898, - "mean_token_accuracy": 0.8052060604095459, - "num_tokens": 11145255.0, - "step": 1243 - }, - { - "epoch": 0.9452887537993921, - "grad_norm": 2.3616299629211426, - "learning_rate": 4.101614766333904e-06, - "loss": 0.5878340601921082, - "mean_token_accuracy": 0.796745777130127, - "num_tokens": 11152020.0, - "step": 1244 - }, - { - "epoch": 0.9460486322188449, - "grad_norm": 1.6182078123092651, - "learning_rate": 4.100006051572664e-06, - "loss": 0.5357589721679688, - "mean_token_accuracy": 0.8089962005615234, - "num_tokens": 11163112.0, - "step": 1245 - }, - { - "epoch": 0.9468085106382979, - "grad_norm": 1.911770224571228, - "learning_rate": 4.098396213865587e-06, - "loss": 0.49805426597595215, - "mean_token_accuracy": 0.8289647102355957, - "num_tokens": 11171768.0, - "step": 1246 - }, - { - "epoch": 0.9475683890577508, - "grad_norm": 1.649155616760254, - "learning_rate": 4.096785254342518e-06, - "loss": 0.5756166577339172, - "mean_token_accuracy": 0.807680606842041, - "num_tokens": 11183527.0, - "step": 1247 - }, - { - "epoch": 0.9483282674772037, - "grad_norm": 1.8922761678695679, - "learning_rate": 4.095173174134091e-06, - "loss": 0.44688963890075684, - "mean_token_accuracy": 0.8375608921051025, - "num_tokens": 11191494.0, - "step": 1248 - }, - { - "epoch": 0.9490881458966566, - "grad_norm": 2.9044547080993652, - "learning_rate": 4.093559974371725e-06, - "loss": 0.48609739542007446, - "mean_token_accuracy": 0.8404892086982727, - "num_tokens": 11195837.0, - "step": 1249 - }, - { - "epoch": 0.9498480243161094, - "grad_norm": 2.287506580352783, - "learning_rate": 4.091945656187626e-06, - "loss": 0.5260225534439087, - "mean_token_accuracy": 0.8181945085525513, - "num_tokens": 11202174.0, - "step": 1250 - }, - { - "epoch": 0.9506079027355623, - "grad_norm": 1.7908886671066284, - "learning_rate": 4.090330220714785e-06, - "loss": 0.4207724928855896, - "mean_token_accuracy": 0.8616912364959717, - "num_tokens": 11209995.0, - "step": 1251 - }, - { - "epoch": 0.9513677811550152, - "grad_norm": 2.905418634414673, - "learning_rate": 4.0887136690869774e-06, - "loss": 0.4209241271018982, - "mean_token_accuracy": 0.8561323285102844, - "num_tokens": 11213799.0, - "step": 1252 - }, - { - "epoch": 0.9521276595744681, - "grad_norm": 2.814150333404541, - "learning_rate": 4.08709600243876e-06, - "loss": 0.36855608224868774, - "mean_token_accuracy": 0.8764539361000061, - "num_tokens": 11217643.0, - "step": 1253 - }, - { - "epoch": 0.952887537993921, - "grad_norm": 1.9385707378387451, - "learning_rate": 4.0854772219054735e-06, - "loss": 0.531031608581543, - "mean_token_accuracy": 0.80600905418396, - "num_tokens": 11225871.0, - "step": 1254 - }, - { - "epoch": 0.9536474164133738, - "grad_norm": 2.103058099746704, - "learning_rate": 4.083857328623243e-06, - "loss": 0.4576364755630493, - "mean_token_accuracy": 0.8447524905204773, - "num_tokens": 11231829.0, - "step": 1255 - }, - { - "epoch": 0.9544072948328267, - "grad_norm": 1.7518818378448486, - "learning_rate": 4.082236323728969e-06, - "loss": 0.5386767983436584, - "mean_token_accuracy": 0.8055596351623535, - "num_tokens": 11240977.0, - "step": 1256 - }, - { - "epoch": 0.9551671732522796, - "grad_norm": 1.8434966802597046, - "learning_rate": 4.0806142083603365e-06, - "loss": 0.5415925979614258, - "mean_token_accuracy": 0.809962272644043, - "num_tokens": 11249616.0, - "step": 1257 - }, - { - "epoch": 0.9559270516717325, - "grad_norm": 1.7341015338897705, - "learning_rate": 4.078990983655807e-06, - "loss": 0.4621101915836334, - "mean_token_accuracy": 0.8330386877059937, - "num_tokens": 11258616.0, - "step": 1258 - }, - { - "epoch": 0.9566869300911854, - "grad_norm": 1.8589727878570557, - "learning_rate": 4.077366650754624e-06, - "loss": 0.4031238555908203, - "mean_token_accuracy": 0.842434287071228, - "num_tokens": 11266006.0, - "step": 1259 - }, - { - "epoch": 0.9574468085106383, - "grad_norm": 1.657175898551941, - "learning_rate": 4.075741210796806e-06, - "loss": 0.41686388850212097, - "mean_token_accuracy": 0.8443650007247925, - "num_tokens": 11275601.0, - "step": 1260 - }, - { - "epoch": 0.9582066869300911, - "grad_norm": 2.4303717613220215, - "learning_rate": 4.07411466492315e-06, - "loss": 0.4554435610771179, - "mean_token_accuracy": 0.853043794631958, - "num_tokens": 11280650.0, - "step": 1261 - }, - { - "epoch": 0.958966565349544, - "grad_norm": 2.3653745651245117, - "learning_rate": 4.072487014275228e-06, - "loss": 0.4304995536804199, - "mean_token_accuracy": 0.8462260961532593, - "num_tokens": 11285637.0, - "step": 1262 - }, - { - "epoch": 0.959726443768997, - "grad_norm": 1.6689718961715698, - "learning_rate": 4.070858259995388e-06, - "loss": 0.5290807485580444, - "mean_token_accuracy": 0.8176917433738708, - "num_tokens": 11299110.0, - "step": 1263 - }, - { - "epoch": 0.9604863221884499, - "grad_norm": 2.103879451751709, - "learning_rate": 4.069228403226751e-06, - "loss": 0.4620879888534546, - "mean_token_accuracy": 0.835270345211029, - "num_tokens": 11305564.0, - "step": 1264 - }, - { - "epoch": 0.9612462006079028, - "grad_norm": 2.139012575149536, - "learning_rate": 4.067597445113216e-06, - "loss": 0.5143396258354187, - "mean_token_accuracy": 0.8191739320755005, - "num_tokens": 11311870.0, - "step": 1265 - }, - { - "epoch": 0.9620060790273556, - "grad_norm": 1.3971210718154907, - "learning_rate": 4.06596538679945e-06, - "loss": 0.472080260515213, - "mean_token_accuracy": 0.8321092128753662, - "num_tokens": 11323970.0, - "step": 1266 - }, - { - "epoch": 0.9627659574468085, - "grad_norm": 1.4965174198150635, - "learning_rate": 4.064332229430895e-06, - "loss": 0.359701007604599, - "mean_token_accuracy": 0.8903120160102844, - "num_tokens": 11333412.0, - "step": 1267 - }, - { - "epoch": 0.9635258358662614, - "grad_norm": 1.1898726224899292, - "learning_rate": 4.062697974153764e-06, - "loss": 0.3423798084259033, - "mean_token_accuracy": 0.8661491870880127, - "num_tokens": 11347657.0, - "step": 1268 - }, - { - "epoch": 0.9642857142857143, - "grad_norm": 1.4952168464660645, - "learning_rate": 4.06106262211504e-06, - "loss": 0.4214417338371277, - "mean_token_accuracy": 0.8362159729003906, - "num_tokens": 11357786.0, - "step": 1269 - }, - { - "epoch": 0.9650455927051672, - "grad_norm": 1.7949583530426025, - "learning_rate": 4.059426174462476e-06, - "loss": 0.59087735414505, - "mean_token_accuracy": 0.7965556979179382, - "num_tokens": 11370561.0, - "step": 1270 - }, - { - "epoch": 0.96580547112462, - "grad_norm": 1.8973214626312256, - "learning_rate": 4.057788632344594e-06, - "loss": 0.47525322437286377, - "mean_token_accuracy": 0.8317365050315857, - "num_tokens": 11378507.0, - "step": 1271 - }, - { - "epoch": 0.9665653495440729, - "grad_norm": 1.8665250539779663, - "learning_rate": 4.056149996910683e-06, - "loss": 0.3537125587463379, - "mean_token_accuracy": 0.8921569585800171, - "num_tokens": 11385186.0, - "step": 1272 - }, - { - "epoch": 0.9673252279635258, - "grad_norm": 1.5072317123413086, - "learning_rate": 4.054510269310803e-06, - "loss": 0.5145624876022339, - "mean_token_accuracy": 0.8265488147735596, - "num_tokens": 11397125.0, - "step": 1273 - }, - { - "epoch": 0.9680851063829787, - "grad_norm": 1.520525574684143, - "learning_rate": 4.052869450695776e-06, - "loss": 0.44322293996810913, - "mean_token_accuracy": 0.8403642177581787, - "num_tokens": 11409919.0, - "step": 1274 - }, - { - "epoch": 0.9688449848024316, - "grad_norm": 1.3764475584030151, - "learning_rate": 4.051227542217192e-06, - "loss": 0.5774400234222412, - "mean_token_accuracy": 0.804118275642395, - "num_tokens": 11425900.0, - "step": 1275 - }, - { - "epoch": 0.9696048632218845, - "grad_norm": 1.3922648429870605, - "learning_rate": 4.049584545027406e-06, - "loss": 0.42727944254875183, - "mean_token_accuracy": 0.8654505014419556, - "num_tokens": 11438787.0, - "step": 1276 - }, - { - "epoch": 0.9703647416413373, - "grad_norm": 1.8505840301513672, - "learning_rate": 4.047940460279537e-06, - "loss": 0.490803062915802, - "mean_token_accuracy": 0.8340574502944946, - "num_tokens": 11447997.0, - "step": 1277 - }, - { - "epoch": 0.9711246200607903, - "grad_norm": 2.28271222114563, - "learning_rate": 4.046295289127466e-06, - "loss": 0.588828444480896, - "mean_token_accuracy": 0.833497166633606, - "num_tokens": 11454072.0, - "step": 1278 - }, - { - "epoch": 0.9718844984802432, - "grad_norm": 2.4242560863494873, - "learning_rate": 4.044649032725836e-06, - "loss": 0.5128831267356873, - "mean_token_accuracy": 0.8225122690200806, - "num_tokens": 11460211.0, - "step": 1279 - }, - { - "epoch": 0.9726443768996961, - "grad_norm": 2.1738455295562744, - "learning_rate": 4.0430016922300566e-06, - "loss": 0.441631942987442, - "mean_token_accuracy": 0.841723620891571, - "num_tokens": 11466814.0, - "step": 1280 - }, - { - "epoch": 0.973404255319149, - "grad_norm": 2.541599988937378, - "learning_rate": 4.0413532687962926e-06, - "loss": 0.5062629580497742, - "mean_token_accuracy": 0.8013502359390259, - "num_tokens": 11472371.0, - "step": 1281 - }, - { - "epoch": 0.9741641337386018, - "grad_norm": 2.8011014461517334, - "learning_rate": 4.039703763581472e-06, - "loss": 0.5061966776847839, - "mean_token_accuracy": 0.829810380935669, - "num_tokens": 11476672.0, - "step": 1282 - }, - { - "epoch": 0.9749240121580547, - "grad_norm": 2.4505462646484375, - "learning_rate": 4.038053177743279e-06, - "loss": 0.43407535552978516, - "mean_token_accuracy": 0.8428469896316528, - "num_tokens": 11481297.0, - "step": 1283 - }, - { - "epoch": 0.9756838905775076, - "grad_norm": 2.1618378162384033, - "learning_rate": 4.036401512440161e-06, - "loss": 0.6056663393974304, - "mean_token_accuracy": 0.7977457642555237, - "num_tokens": 11488657.0, - "step": 1284 - }, - { - "epoch": 0.9764437689969605, - "grad_norm": 1.9192147254943848, - "learning_rate": 4.034748768831319e-06, - "loss": 0.524390697479248, - "mean_token_accuracy": 0.8120636940002441, - "num_tokens": 11496485.0, - "step": 1285 - }, - { - "epoch": 0.9772036474164134, - "grad_norm": 2.766435384750366, - "learning_rate": 4.033094948076713e-06, - "loss": 0.5494908690452576, - "mean_token_accuracy": 0.8141890168190002, - "num_tokens": 11501341.0, - "step": 1286 - }, - { - "epoch": 0.9779635258358662, - "grad_norm": 1.3519539833068848, - "learning_rate": 4.031440051337056e-06, - "loss": 0.4339691400527954, - "mean_token_accuracy": 0.8400131464004517, - "num_tokens": 11512843.0, - "step": 1287 - }, - { - "epoch": 0.9787234042553191, - "grad_norm": 1.2492141723632812, - "learning_rate": 4.02978407977382e-06, - "loss": 0.4433518052101135, - "mean_token_accuracy": 0.8432940244674683, - "num_tokens": 11530227.0, - "step": 1288 - }, - { - "epoch": 0.979483282674772, - "grad_norm": 1.6597715616226196, - "learning_rate": 4.02812703454923e-06, - "loss": 0.602222204208374, - "mean_token_accuracy": 0.786965548992157, - "num_tokens": 11543955.0, - "step": 1289 - }, - { - "epoch": 0.9802431610942249, - "grad_norm": 1.6621816158294678, - "learning_rate": 4.026468916826262e-06, - "loss": 0.35662174224853516, - "mean_token_accuracy": 0.8716133832931519, - "num_tokens": 11552064.0, - "step": 1290 - }, - { - "epoch": 0.9810030395136778, - "grad_norm": 4.539844989776611, - "learning_rate": 4.024809727768648e-06, - "loss": 0.543423593044281, - "mean_token_accuracy": 0.8293194770812988, - "num_tokens": 11555595.0, - "step": 1291 - }, - { - "epoch": 0.9817629179331308, - "grad_norm": 1.4026556015014648, - "learning_rate": 4.023149468540871e-06, - "loss": 0.4301237165927887, - "mean_token_accuracy": 0.8358224630355835, - "num_tokens": 11572275.0, - "step": 1292 - }, - { - "epoch": 0.9825227963525835, - "grad_norm": 1.611262321472168, - "learning_rate": 4.021488140308165e-06, - "loss": 0.5378580689430237, - "mean_token_accuracy": 0.8173760771751404, - "num_tokens": 11584299.0, - "step": 1293 - }, - { - "epoch": 0.9832826747720365, - "grad_norm": 4.138631820678711, - "learning_rate": 4.019825744236514e-06, - "loss": 0.40272149443626404, - "mean_token_accuracy": 0.8648844957351685, - "num_tokens": 11586705.0, - "step": 1294 - }, - { - "epoch": 0.9840425531914894, - "grad_norm": 3.177703619003296, - "learning_rate": 4.018162281492651e-06, - "loss": 0.5320103168487549, - "mean_token_accuracy": 0.8250276446342468, - "num_tokens": 11590689.0, - "step": 1295 - }, - { - "epoch": 0.9848024316109423, - "grad_norm": 2.727597713470459, - "learning_rate": 4.016497753244058e-06, - "loss": 0.5662774443626404, - "mean_token_accuracy": 0.8074625730514526, - "num_tokens": 11596092.0, - "step": 1296 - }, - { - "epoch": 0.9855623100303952, - "grad_norm": 1.485139012336731, - "learning_rate": 4.014832160658966e-06, - "loss": 0.5414972305297852, - "mean_token_accuracy": 0.8082696199417114, - "num_tokens": 11613785.0, - "step": 1297 - }, - { - "epoch": 0.986322188449848, - "grad_norm": 2.4025990962982178, - "learning_rate": 4.013165504906352e-06, - "loss": 0.6556503772735596, - "mean_token_accuracy": 0.7785214781761169, - "num_tokens": 11620421.0, - "step": 1298 - }, - { - "epoch": 0.9870820668693009, - "grad_norm": 1.878273606300354, - "learning_rate": 4.011497787155938e-06, - "loss": 0.4221133887767792, - "mean_token_accuracy": 0.850035548210144, - "num_tokens": 11627998.0, - "step": 1299 - }, - { - "epoch": 0.9878419452887538, - "grad_norm": 2.0430715084075928, - "learning_rate": 4.009829008578192e-06, - "loss": 0.5205984711647034, - "mean_token_accuracy": 0.819183349609375, - "num_tokens": 11636279.0, - "step": 1300 - }, - { - "epoch": 0.9886018237082067, - "grad_norm": 3.4769439697265625, - "learning_rate": 4.00815917034433e-06, - "loss": 0.5449948310852051, - "mean_token_accuracy": 0.8240023851394653, - "num_tokens": 11639638.0, - "step": 1301 - }, - { - "epoch": 0.9893617021276596, - "grad_norm": 2.4783987998962402, - "learning_rate": 4.006488273626307e-06, - "loss": 0.4316832423210144, - "mean_token_accuracy": 0.8474695086479187, - "num_tokens": 11645463.0, - "step": 1302 - }, - { - "epoch": 0.9901215805471124, - "grad_norm": 1.881475567817688, - "learning_rate": 4.004816319596822e-06, - "loss": 0.5157331824302673, - "mean_token_accuracy": 0.826042652130127, - "num_tokens": 11653955.0, - "step": 1303 - }, - { - "epoch": 0.9908814589665653, - "grad_norm": 2.6569254398345947, - "learning_rate": 4.003143309429317e-06, - "loss": 0.46492767333984375, - "mean_token_accuracy": 0.8320850133895874, - "num_tokens": 11659357.0, - "step": 1304 - }, - { - "epoch": 0.9916413373860182, - "grad_norm": 2.4917593002319336, - "learning_rate": 4.0014692442979756e-06, - "loss": 0.459585040807724, - "mean_token_accuracy": 0.8457611799240112, - "num_tokens": 11664207.0, - "step": 1305 - }, - { - "epoch": 0.9924012158054711, - "grad_norm": 2.6885526180267334, - "learning_rate": 3.999794125377721e-06, - "loss": 0.4677402973175049, - "mean_token_accuracy": 0.8307361602783203, - "num_tokens": 11668879.0, - "step": 1306 - }, - { - "epoch": 0.993161094224924, - "grad_norm": 1.9737319946289062, - "learning_rate": 3.998117953844215e-06, - "loss": 0.44684839248657227, - "mean_token_accuracy": 0.8367687463760376, - "num_tokens": 11676081.0, - "step": 1307 - }, - { - "epoch": 0.993920972644377, - "grad_norm": 1.4333021640777588, - "learning_rate": 3.996440730873861e-06, - "loss": 0.526146650314331, - "mean_token_accuracy": 0.816251814365387, - "num_tokens": 11689333.0, - "step": 1308 - }, - { - "epoch": 0.9946808510638298, - "grad_norm": 1.3689230680465698, - "learning_rate": 3.9947624576437975e-06, - "loss": 0.40214329957962036, - "mean_token_accuracy": 0.8610327839851379, - "num_tokens": 11701540.0, - "step": 1309 - }, - { - "epoch": 0.9954407294832827, - "grad_norm": 1.2435375452041626, - "learning_rate": 3.9930831353319025e-06, - "loss": 0.4532913267612457, - "mean_token_accuracy": 0.8415389060974121, - "num_tokens": 11717920.0, - "step": 1310 - }, - { - "epoch": 0.9962006079027356, - "grad_norm": 1.9968011379241943, - "learning_rate": 3.9914027651167866e-06, - "loss": 0.46954160928726196, - "mean_token_accuracy": 0.8351103663444519, - "num_tokens": 11724999.0, - "step": 1311 - }, - { - "epoch": 0.9969604863221885, - "grad_norm": 1.9521311521530151, - "learning_rate": 3.989721348177801e-06, - "loss": 0.5068016052246094, - "mean_token_accuracy": 0.8220845460891724, - "num_tokens": 11732569.0, - "step": 1312 - }, - { - "epoch": 0.9977203647416414, - "grad_norm": 2.7332582473754883, - "learning_rate": 3.988038885695028e-06, - "loss": 0.4154692590236664, - "mean_token_accuracy": 0.8493857383728027, - "num_tokens": 11736759.0, - "step": 1313 - }, - { - "epoch": 0.9984802431610942, - "grad_norm": 1.8656952381134033, - "learning_rate": 3.986355378849284e-06, - "loss": 0.4151354134082794, - "mean_token_accuracy": 0.83440101146698, - "num_tokens": 11743827.0, - "step": 1314 - }, - { - "epoch": 0.9992401215805471, - "grad_norm": 1.304006576538086, - "learning_rate": 3.984670828822118e-06, - "loss": 0.4926128089427948, - "mean_token_accuracy": 0.8603005409240723, - "num_tokens": 11757707.0, - "step": 1315 - }, - { - "epoch": 1.0, - "grad_norm": 1.497079610824585, - "learning_rate": 3.982985236795815e-06, - "loss": 0.43342477083206177, - "mean_token_accuracy": 0.8550825119018555, - "num_tokens": 11769678.0, - "step": 1316 - }, - { - "epoch": 1.000759878419453, - "grad_norm": 2.870274543762207, - "learning_rate": 3.981298603953385e-06, - "loss": 0.3723528981208801, - "mean_token_accuracy": 0.8745899796485901, - "num_tokens": 11773290.0, - "step": 1317 - }, - { - "epoch": 1.0015197568389058, - "grad_norm": 1.3442503213882446, - "learning_rate": 3.979610931478574e-06, - "loss": 0.34688329696655273, - "mean_token_accuracy": 0.8749074935913086, - "num_tokens": 11786400.0, - "step": 1318 - }, - { - "epoch": 1.0022796352583587, - "grad_norm": 1.7272238731384277, - "learning_rate": 3.977922220555855e-06, - "loss": 0.28274932503700256, - "mean_token_accuracy": 0.896713137626648, - "num_tokens": 11793059.0, - "step": 1319 - }, - { - "epoch": 1.0030395136778116, - "grad_norm": 1.7362451553344727, - "learning_rate": 3.976232472370431e-06, - "loss": 0.5494794845581055, - "mean_token_accuracy": 0.8341718912124634, - "num_tokens": 11802593.0, - "step": 1320 - }, - { - "epoch": 1.0037993920972645, - "grad_norm": 1.3316494226455688, - "learning_rate": 3.97454168810823e-06, - "loss": 0.41505366563796997, - "mean_token_accuracy": 0.8581969738006592, - "num_tokens": 11813925.0, - "step": 1321 - }, - { - "epoch": 1.0045592705167172, - "grad_norm": 1.6152615547180176, - "learning_rate": 3.972849868955913e-06, - "loss": 0.44761013984680176, - "mean_token_accuracy": 0.8413045406341553, - "num_tokens": 11825709.0, - "step": 1322 - }, - { - "epoch": 1.0053191489361701, - "grad_norm": 2.1172471046447754, - "learning_rate": 3.97115701610086e-06, - "loss": 0.3903353810310364, - "mean_token_accuracy": 0.8662760257720947, - "num_tokens": 11832070.0, - "step": 1323 - }, - { - "epoch": 1.006079027355623, - "grad_norm": 1.5923868417739868, - "learning_rate": 3.969463130731183e-06, - "loss": 0.4491051137447357, - "mean_token_accuracy": 0.8677828311920166, - "num_tokens": 11843154.0, - "step": 1324 - }, - { - "epoch": 1.006838905775076, - "grad_norm": 1.6848995685577393, - "learning_rate": 3.967768214035716e-06, - "loss": 0.45765817165374756, - "mean_token_accuracy": 0.8401060104370117, - "num_tokens": 11854826.0, - "step": 1325 - }, - { - "epoch": 1.0075987841945289, - "grad_norm": 2.3739020824432373, - "learning_rate": 3.966072267204014e-06, - "loss": 0.4482722580432892, - "mean_token_accuracy": 0.8368916511535645, - "num_tokens": 11860559.0, - "step": 1326 - }, - { - "epoch": 1.0083586626139818, - "grad_norm": 1.5403034687042236, - "learning_rate": 3.964375291426361e-06, - "loss": 0.35589972138404846, - "mean_token_accuracy": 0.8728118538856506, - "num_tokens": 11871959.0, - "step": 1327 - }, - { - "epoch": 1.0091185410334347, - "grad_norm": 1.6750119924545288, - "learning_rate": 3.962677287893758e-06, - "loss": 0.35873427987098694, - "mean_token_accuracy": 0.9027186632156372, - "num_tokens": 11881818.0, - "step": 1328 - }, - { - "epoch": 1.0098784194528876, - "grad_norm": 1.5489170551300049, - "learning_rate": 3.9609782577979305e-06, - "loss": 0.3634672462940216, - "mean_token_accuracy": 0.8582607507705688, - "num_tokens": 11891084.0, - "step": 1329 - }, - { - "epoch": 1.0106382978723405, - "grad_norm": 2.43859601020813, - "learning_rate": 3.959278202331323e-06, - "loss": 0.3640799820423126, - "mean_token_accuracy": 0.88062584400177, - "num_tokens": 11896032.0, - "step": 1330 - }, - { - "epoch": 1.0113981762917934, - "grad_norm": 3.612184524536133, - "learning_rate": 3.9575771226870986e-06, - "loss": 0.3733130097389221, - "mean_token_accuracy": 0.8946067094802856, - "num_tokens": 11899479.0, - "step": 1331 - }, - { - "epoch": 1.012158054711246, - "grad_norm": 1.541355848312378, - "learning_rate": 3.955875020059141e-06, - "loss": 0.320593923330307, - "mean_token_accuracy": 0.9057406783103943, - "num_tokens": 11910179.0, - "step": 1332 - }, - { - "epoch": 1.012917933130699, - "grad_norm": 2.0565030574798584, - "learning_rate": 3.954171895642052e-06, - "loss": 0.3341682553291321, - "mean_token_accuracy": 0.8829344511032104, - "num_tokens": 11916489.0, - "step": 1333 - }, - { - "epoch": 1.013677811550152, - "grad_norm": 2.9732539653778076, - "learning_rate": 3.9524677506311505e-06, - "loss": 0.38488566875457764, - "mean_token_accuracy": 0.8752974271774292, - "num_tokens": 11920682.0, - "step": 1334 - }, - { - "epoch": 1.0144376899696048, - "grad_norm": 2.7697458267211914, - "learning_rate": 3.950762586222469e-06, - "loss": 0.39864760637283325, - "mean_token_accuracy": 0.8593167662620544, - "num_tokens": 11925233.0, - "step": 1335 - }, - { - "epoch": 1.0151975683890577, - "grad_norm": 2.2302119731903076, - "learning_rate": 3.949056403612758e-06, - "loss": 0.3985682725906372, - "mean_token_accuracy": 0.8677899837493896, - "num_tokens": 11932000.0, - "step": 1336 - }, - { - "epoch": 1.0159574468085106, - "grad_norm": 2.360572576522827, - "learning_rate": 3.947349203999485e-06, - "loss": 0.36940714716911316, - "mean_token_accuracy": 0.8760676383972168, - "num_tokens": 11937569.0, - "step": 1337 - }, - { - "epoch": 1.0167173252279635, - "grad_norm": 1.3383921384811401, - "learning_rate": 3.945640988580824e-06, - "loss": 0.40628793835639954, - "mean_token_accuracy": 0.866442084312439, - "num_tokens": 11955679.0, - "step": 1338 - }, - { - "epoch": 1.0174772036474165, - "grad_norm": 2.1502623558044434, - "learning_rate": 3.943931758555669e-06, - "loss": 0.4493565559387207, - "mean_token_accuracy": 0.8307522535324097, - "num_tokens": 11962734.0, - "step": 1339 - }, - { - "epoch": 1.0182370820668694, - "grad_norm": 2.4737331867218018, - "learning_rate": 3.942221515123624e-06, - "loss": 0.28508758544921875, - "mean_token_accuracy": 0.8967142105102539, - "num_tokens": 11967783.0, - "step": 1340 - }, - { - "epoch": 1.0189969604863223, - "grad_norm": 2.4525370597839355, - "learning_rate": 3.940510259485002e-06, - "loss": 0.40227818489074707, - "mean_token_accuracy": 0.8618967533111572, - "num_tokens": 11972918.0, - "step": 1341 - }, - { - "epoch": 1.0197568389057752, - "grad_norm": 1.7299731969833374, - "learning_rate": 3.938797992840828e-06, - "loss": 0.26339593529701233, - "mean_token_accuracy": 0.9004406929016113, - "num_tokens": 11981250.0, - "step": 1342 - }, - { - "epoch": 1.0205167173252279, - "grad_norm": 2.8756747245788574, - "learning_rate": 3.937084716392839e-06, - "loss": 0.47792482376098633, - "mean_token_accuracy": 0.8440839052200317, - "num_tokens": 11986356.0, - "step": 1343 - }, - { - "epoch": 1.0212765957446808, - "grad_norm": 2.104473114013672, - "learning_rate": 3.935370431343475e-06, - "loss": 0.36723971366882324, - "mean_token_accuracy": 0.8831232786178589, - "num_tokens": 11994495.0, - "step": 1344 - }, - { - "epoch": 1.0220364741641337, - "grad_norm": 1.9173074960708618, - "learning_rate": 3.933655138895889e-06, - "loss": 0.409319669008255, - "mean_token_accuracy": 0.8632645606994629, - "num_tokens": 12002060.0, - "step": 1345 - }, - { - "epoch": 1.0227963525835866, - "grad_norm": 2.958311080932617, - "learning_rate": 3.9319388402539395e-06, - "loss": 0.5390093922615051, - "mean_token_accuracy": 0.8204828500747681, - "num_tokens": 12007588.0, - "step": 1346 - }, - { - "epoch": 1.0235562310030395, - "grad_norm": 1.6470831632614136, - "learning_rate": 3.930221536622192e-06, - "loss": 0.4524633288383484, - "mean_token_accuracy": 0.8516575694084167, - "num_tokens": 12018831.0, - "step": 1347 - }, - { - "epoch": 1.0243161094224924, - "grad_norm": 1.3160780668258667, - "learning_rate": 3.928503229205913e-06, - "loss": 0.4180558919906616, - "mean_token_accuracy": 0.8495022058486938, - "num_tokens": 12033947.0, - "step": 1348 - }, - { - "epoch": 1.0250759878419453, - "grad_norm": 1.9686089754104614, - "learning_rate": 3.92678391921108e-06, - "loss": 0.41927334666252136, - "mean_token_accuracy": 0.8462997674942017, - "num_tokens": 12042005.0, - "step": 1349 - }, - { - "epoch": 1.0258358662613982, - "grad_norm": 2.351778507232666, - "learning_rate": 3.92506360784437e-06, - "loss": 0.2946245074272156, - "mean_token_accuracy": 0.9170923233032227, - "num_tokens": 12046579.0, - "step": 1350 - }, - { - "epoch": 1.0265957446808511, - "grad_norm": 2.0636913776397705, - "learning_rate": 3.923342296313162e-06, - "loss": 0.3422774076461792, - "mean_token_accuracy": 0.8809213638305664, - "num_tokens": 12053214.0, - "step": 1351 - }, - { - "epoch": 1.027355623100304, - "grad_norm": 1.7272592782974243, - "learning_rate": 3.92161998582554e-06, - "loss": 0.5864541530609131, - "mean_token_accuracy": 0.7986117601394653, - "num_tokens": 12068522.0, - "step": 1352 - }, - { - "epoch": 1.028115501519757, - "grad_norm": 0.8980231881141663, - "learning_rate": 3.919896677590289e-06, - "loss": 0.2964550256729126, - "mean_token_accuracy": 0.8911845088005066, - "num_tokens": 12093834.0, - "step": 1353 - }, - { - "epoch": 1.0288753799392096, - "grad_norm": 1.6031712293624878, - "learning_rate": 3.918172372816892e-06, - "loss": 0.37254488468170166, - "mean_token_accuracy": 0.8615843057632446, - "num_tokens": 12104393.0, - "step": 1354 - }, - { - "epoch": 1.0296352583586625, - "grad_norm": 1.282134771347046, - "learning_rate": 3.916447072715531e-06, - "loss": 0.3522927761077881, - "mean_token_accuracy": 0.8713657259941101, - "num_tokens": 12118671.0, - "step": 1355 - }, - { - "epoch": 1.0303951367781155, - "grad_norm": 2.1986680030822754, - "learning_rate": 3.914720778497091e-06, - "loss": 0.3716316223144531, - "mean_token_accuracy": 0.8661249279975891, - "num_tokens": 12125178.0, - "step": 1356 - }, - { - "epoch": 1.0311550151975684, - "grad_norm": 1.5937882661819458, - "learning_rate": 3.91299349137315e-06, - "loss": 0.48067355155944824, - "mean_token_accuracy": 0.8284252882003784, - "num_tokens": 12136785.0, - "step": 1357 - }, - { - "epoch": 1.0319148936170213, - "grad_norm": 1.6743099689483643, - "learning_rate": 3.9112652125559845e-06, - "loss": 0.4461551308631897, - "mean_token_accuracy": 0.8381845355033875, - "num_tokens": 12150066.0, - "step": 1358 - }, - { - "epoch": 1.0326747720364742, - "grad_norm": 2.2346715927124023, - "learning_rate": 3.909535943258567e-06, - "loss": 0.3148220181465149, - "mean_token_accuracy": 0.8797591924667358, - "num_tokens": 12155506.0, - "step": 1359 - }, - { - "epoch": 1.033434650455927, - "grad_norm": 1.9608992338180542, - "learning_rate": 3.907805684694567e-06, - "loss": 0.32598960399627686, - "mean_token_accuracy": 0.8819410800933838, - "num_tokens": 12163261.0, - "step": 1360 - }, - { - "epoch": 1.03419452887538, - "grad_norm": 2.413477897644043, - "learning_rate": 3.906074438078343e-06, - "loss": 0.38179588317871094, - "mean_token_accuracy": 0.8739585876464844, - "num_tokens": 12169254.0, - "step": 1361 - }, - { - "epoch": 1.034954407294833, - "grad_norm": 2.0258278846740723, - "learning_rate": 3.904342204624955e-06, - "loss": 0.33240315318107605, - "mean_token_accuracy": 0.8808181285858154, - "num_tokens": 12175379.0, - "step": 1362 - }, - { - "epoch": 1.0357142857142858, - "grad_norm": 2.4111437797546387, - "learning_rate": 3.9026089855501475e-06, - "loss": 0.412802517414093, - "mean_token_accuracy": 0.8504396677017212, - "num_tokens": 12182007.0, - "step": 1363 - }, - { - "epoch": 1.0364741641337385, - "grad_norm": 2.0424840450286865, - "learning_rate": 3.900874782070362e-06, - "loss": 0.2914797067642212, - "mean_token_accuracy": 0.8731886148452759, - "num_tokens": 12187743.0, - "step": 1364 - }, - { - "epoch": 1.0372340425531914, - "grad_norm": 2.9248716831207275, - "learning_rate": 3.899139595402729e-06, - "loss": 0.34071338176727295, - "mean_token_accuracy": 0.8736443519592285, - "num_tokens": 12191830.0, - "step": 1365 - }, - { - "epoch": 1.0379939209726443, - "grad_norm": 2.240220785140991, - "learning_rate": 3.8974034267650695e-06, - "loss": 0.23049014806747437, - "mean_token_accuracy": 0.9000070691108704, - "num_tokens": 12196460.0, - "step": 1366 - }, - { - "epoch": 1.0387537993920972, - "grad_norm": 1.5038460493087769, - "learning_rate": 3.895666277375892e-06, - "loss": 0.32255327701568604, - "mean_token_accuracy": 0.873004674911499, - "num_tokens": 12206230.0, - "step": 1367 - }, - { - "epoch": 1.0395136778115501, - "grad_norm": 1.2339142560958862, - "learning_rate": 3.893928148454398e-06, - "loss": 0.4069131314754486, - "mean_token_accuracy": 0.8461740016937256, - "num_tokens": 12226502.0, - "step": 1368 - }, - { - "epoch": 1.040273556231003, - "grad_norm": 2.531553268432617, - "learning_rate": 3.89218904122047e-06, - "loss": 0.43681037425994873, - "mean_token_accuracy": 0.8497104048728943, - "num_tokens": 12232241.0, - "step": 1369 - }, - { - "epoch": 1.041033434650456, - "grad_norm": 3.8404815196990967, - "learning_rate": 3.890448956894682e-06, - "loss": 0.3241814970970154, - "mean_token_accuracy": 0.884732723236084, - "num_tokens": 12235126.0, - "step": 1370 - }, - { - "epoch": 1.0417933130699089, - "grad_norm": 2.9608030319213867, - "learning_rate": 3.888707896698293e-06, - "loss": 0.4641021490097046, - "mean_token_accuracy": 0.8496800661087036, - "num_tokens": 12240630.0, - "step": 1371 - }, - { - "epoch": 1.0425531914893618, - "grad_norm": 2.1166417598724365, - "learning_rate": 3.886965861853243e-06, - "loss": 0.42038479447364807, - "mean_token_accuracy": 0.8512747287750244, - "num_tokens": 12247969.0, - "step": 1372 - }, - { - "epoch": 1.0433130699088147, - "grad_norm": 2.5918161869049072, - "learning_rate": 3.885222853582163e-06, - "loss": 0.2871917188167572, - "mean_token_accuracy": 0.9129709601402283, - "num_tokens": 12252161.0, - "step": 1373 - }, - { - "epoch": 1.0440729483282676, - "grad_norm": 2.4261348247528076, - "learning_rate": 3.88347887310836e-06, - "loss": 0.4003123342990875, - "mean_token_accuracy": 0.8570356369018555, - "num_tokens": 12258135.0, - "step": 1374 - }, - { - "epoch": 1.0448328267477203, - "grad_norm": 1.3439548015594482, - "learning_rate": 3.881733921655829e-06, - "loss": 0.3278140425682068, - "mean_token_accuracy": 0.8831373453140259, - "num_tokens": 12272849.0, - "step": 1375 - }, - { - "epoch": 1.0455927051671732, - "grad_norm": 1.527989387512207, - "learning_rate": 3.879988000449243e-06, - "loss": 0.33789363503456116, - "mean_token_accuracy": 0.8825669884681702, - "num_tokens": 12283281.0, - "step": 1376 - }, - { - "epoch": 1.046352583586626, - "grad_norm": 1.6755503416061401, - "learning_rate": 3.878241110713957e-06, - "loss": 0.4816160798072815, - "mean_token_accuracy": 0.8193758726119995, - "num_tokens": 12295422.0, - "step": 1377 - }, - { - "epoch": 1.047112462006079, - "grad_norm": 2.8110361099243164, - "learning_rate": 3.876493253676004e-06, - "loss": 0.38662949204444885, - "mean_token_accuracy": 0.8611986637115479, - "num_tokens": 12299806.0, - "step": 1378 - }, - { - "epoch": 1.047872340425532, - "grad_norm": 1.86097252368927, - "learning_rate": 3.8747444305621e-06, - "loss": 0.27612629532814026, - "mean_token_accuracy": 0.8984048366546631, - "num_tokens": 12306599.0, - "step": 1379 - }, - { - "epoch": 1.0486322188449848, - "grad_norm": 2.361828565597534, - "learning_rate": 3.872994642599635e-06, - "loss": 0.469953715801239, - "mean_token_accuracy": 0.8464452028274536, - "num_tokens": 12314249.0, - "step": 1380 - }, - { - "epoch": 1.0493920972644377, - "grad_norm": 1.9524794816970825, - "learning_rate": 3.871243891016676e-06, - "loss": 0.5419625043869019, - "mean_token_accuracy": 0.8468329906463623, - "num_tokens": 12324987.0, - "step": 1381 - }, - { - "epoch": 1.0501519756838906, - "grad_norm": 1.6931511163711548, - "learning_rate": 3.869492177041971e-06, - "loss": 0.3791416883468628, - "mean_token_accuracy": 0.8692882061004639, - "num_tokens": 12336864.0, - "step": 1382 - }, - { - "epoch": 1.0509118541033435, - "grad_norm": 1.909692406654358, - "learning_rate": 3.867739501904938e-06, - "loss": 0.27974557876586914, - "mean_token_accuracy": 0.9004636406898499, - "num_tokens": 12343093.0, - "step": 1383 - }, - { - "epoch": 1.0516717325227964, - "grad_norm": 1.415162205696106, - "learning_rate": 3.8659858668356735e-06, - "loss": 0.38928335905075073, - "mean_token_accuracy": 0.8491984009742737, - "num_tokens": 12356613.0, - "step": 1384 - }, - { - "epoch": 1.0524316109422491, - "grad_norm": 1.8195741176605225, - "learning_rate": 3.864231273064944e-06, - "loss": 0.3798758089542389, - "mean_token_accuracy": 0.8728072047233582, - "num_tokens": 12364860.0, - "step": 1385 - }, - { - "epoch": 1.053191489361702, - "grad_norm": 1.8481454849243164, - "learning_rate": 3.862475721824193e-06, - "loss": 0.269635945558548, - "mean_token_accuracy": 0.899247407913208, - "num_tokens": 12371841.0, - "step": 1386 - }, - { - "epoch": 1.053951367781155, - "grad_norm": 1.7838784456253052, - "learning_rate": 3.8607192143455325e-06, - "loss": 0.36971768736839294, - "mean_token_accuracy": 0.8833638429641724, - "num_tokens": 12380685.0, - "step": 1387 - }, - { - "epoch": 1.0547112462006079, - "grad_norm": 1.333358645439148, - "learning_rate": 3.858961751861748e-06, - "loss": 0.4039418399333954, - "mean_token_accuracy": 0.8541078567504883, - "num_tokens": 12394072.0, - "step": 1388 - }, - { - "epoch": 1.0554711246200608, - "grad_norm": 2.1600265502929688, - "learning_rate": 3.857203335606294e-06, - "loss": 0.38211894035339355, - "mean_token_accuracy": 0.8549972772598267, - "num_tokens": 12400449.0, - "step": 1389 - }, - { - "epoch": 1.0562310030395137, - "grad_norm": 2.914902687072754, - "learning_rate": 3.855443966813295e-06, - "loss": 0.2237374186515808, - "mean_token_accuracy": 0.9253600835800171, - "num_tokens": 12403758.0, - "step": 1390 - }, - { - "epoch": 1.0569908814589666, - "grad_norm": 2.2361080646514893, - "learning_rate": 3.853683646717543e-06, - "loss": 0.3359566926956177, - "mean_token_accuracy": 0.898173451423645, - "num_tokens": 12410374.0, - "step": 1391 - }, - { - "epoch": 1.0577507598784195, - "grad_norm": 2.3639304637908936, - "learning_rate": 3.8519223765544985e-06, - "loss": 0.3844943046569824, - "mean_token_accuracy": 0.863599419593811, - "num_tokens": 12416016.0, - "step": 1392 - }, - { - "epoch": 1.0585106382978724, - "grad_norm": 2.202971935272217, - "learning_rate": 3.85016015756029e-06, - "loss": 0.3546281158924103, - "mean_token_accuracy": 0.8907540440559387, - "num_tokens": 12422026.0, - "step": 1393 - }, - { - "epoch": 1.0592705167173253, - "grad_norm": 1.1279661655426025, - "learning_rate": 3.848396990971709e-06, - "loss": 0.31522464752197266, - "mean_token_accuracy": 0.8662257194519043, - "num_tokens": 12439964.0, - "step": 1394 - }, - { - "epoch": 1.0600303951367782, - "grad_norm": 2.4731740951538086, - "learning_rate": 3.846632878026214e-06, - "loss": 0.456442266702652, - "mean_token_accuracy": 0.8516958951950073, - "num_tokens": 12446231.0, - "step": 1395 - }, - { - "epoch": 1.060790273556231, - "grad_norm": 1.7631878852844238, - "learning_rate": 3.844867819961928e-06, - "loss": 0.487227201461792, - "mean_token_accuracy": 0.8466947078704834, - "num_tokens": 12459989.0, - "step": 1396 - }, - { - "epoch": 1.0615501519756838, - "grad_norm": 2.4468278884887695, - "learning_rate": 3.843101818017637e-06, - "loss": 0.3367291986942291, - "mean_token_accuracy": 0.8734689950942993, - "num_tokens": 12465741.0, - "step": 1397 - }, - { - "epoch": 1.0623100303951367, - "grad_norm": 1.9045145511627197, - "learning_rate": 3.841334873432789e-06, - "loss": 0.4652615487575531, - "mean_token_accuracy": 0.8333107233047485, - "num_tokens": 12474963.0, - "step": 1398 - }, - { - "epoch": 1.0630699088145896, - "grad_norm": 1.6816917657852173, - "learning_rate": 3.839566987447492e-06, - "loss": 0.4144279956817627, - "mean_token_accuracy": 0.8472539186477661, - "num_tokens": 12485521.0, - "step": 1399 - }, - { - "epoch": 1.0638297872340425, - "grad_norm": 1.8990092277526855, - "learning_rate": 3.837798161302518e-06, - "loss": 0.4040985405445099, - "mean_token_accuracy": 0.8514704704284668, - "num_tokens": 12493495.0, - "step": 1400 - }, - { - "epoch": 1.0645896656534954, - "grad_norm": 2.27785325050354, - "learning_rate": 3.836028396239297e-06, - "loss": 0.43425723910331726, - "mean_token_accuracy": 0.8795069456100464, - "num_tokens": 12499789.0, - "step": 1401 - }, - { - "epoch": 1.0653495440729484, - "grad_norm": 2.5130882263183594, - "learning_rate": 3.8342576934999184e-06, - "loss": 0.33892524242401123, - "mean_token_accuracy": 0.8717449903488159, - "num_tokens": 12504885.0, - "step": 1402 - }, - { - "epoch": 1.0661094224924013, - "grad_norm": 2.650040864944458, - "learning_rate": 3.832486054327131e-06, - "loss": 0.4200317859649658, - "mean_token_accuracy": 0.8616159558296204, - "num_tokens": 12509783.0, - "step": 1403 - }, - { - "epoch": 1.0668693009118542, - "grad_norm": 2.9176881313323975, - "learning_rate": 3.830713479964335e-06, - "loss": 0.37018489837646484, - "mean_token_accuracy": 0.8676021695137024, - "num_tokens": 12514441.0, - "step": 1404 - }, - { - "epoch": 1.067629179331307, - "grad_norm": 1.6430318355560303, - "learning_rate": 3.828939971655595e-06, - "loss": 0.27539193630218506, - "mean_token_accuracy": 0.9077831506729126, - "num_tokens": 12523677.0, - "step": 1405 - }, - { - "epoch": 1.06838905775076, - "grad_norm": 1.3683708906173706, - "learning_rate": 3.827165530645627e-06, - "loss": 0.4085099697113037, - "mean_token_accuracy": 0.8579255938529968, - "num_tokens": 12540104.0, - "step": 1406 - }, - { - "epoch": 1.0691489361702127, - "grad_norm": 2.528465747833252, - "learning_rate": 3.825390158179802e-06, - "loss": 0.42462456226348877, - "mean_token_accuracy": 0.852813720703125, - "num_tokens": 12548239.0, - "step": 1407 - }, - { - "epoch": 1.0699088145896656, - "grad_norm": 1.8288795948028564, - "learning_rate": 3.823613855504144e-06, - "loss": 0.412417471408844, - "mean_token_accuracy": 0.8622130751609802, - "num_tokens": 12557316.0, - "step": 1408 - }, - { - "epoch": 1.0706686930091185, - "grad_norm": 2.341794490814209, - "learning_rate": 3.82183662386533e-06, - "loss": 0.2996668815612793, - "mean_token_accuracy": 0.8964041471481323, - "num_tokens": 12562377.0, - "step": 1409 - }, - { - "epoch": 1.0714285714285714, - "grad_norm": 2.555877208709717, - "learning_rate": 3.82005846451069e-06, - "loss": 0.4184221625328064, - "mean_token_accuracy": 0.8678828477859497, - "num_tokens": 12568516.0, - "step": 1410 - }, - { - "epoch": 1.0721884498480243, - "grad_norm": 2.081308126449585, - "learning_rate": 3.8182793786882065e-06, - "loss": 0.4376835823059082, - "mean_token_accuracy": 0.8409077525138855, - "num_tokens": 12576598.0, - "step": 1411 - }, - { - "epoch": 1.0729483282674772, - "grad_norm": 2.0272316932678223, - "learning_rate": 3.816499367646508e-06, - "loss": 0.3630060851573944, - "mean_token_accuracy": 0.8762413263320923, - "num_tokens": 12584587.0, - "step": 1412 - }, - { - "epoch": 1.0737082066869301, - "grad_norm": 2.6382484436035156, - "learning_rate": 3.814718432634877e-06, - "loss": 0.4244990348815918, - "mean_token_accuracy": 0.8509312272071838, - "num_tokens": 12590028.0, - "step": 1413 - }, - { - "epoch": 1.074468085106383, - "grad_norm": 2.429800271987915, - "learning_rate": 3.8129365749032398e-06, - "loss": 0.36990004777908325, - "mean_token_accuracy": 0.8749774098396301, - "num_tokens": 12594984.0, - "step": 1414 - }, - { - "epoch": 1.075227963525836, - "grad_norm": 3.5939090251922607, - "learning_rate": 3.8111537957021736e-06, - "loss": 0.4245661199092865, - "mean_token_accuracy": 0.8481623530387878, - "num_tokens": 12598494.0, - "step": 1415 - }, - { - "epoch": 1.0759878419452888, - "grad_norm": 2.705955982208252, - "learning_rate": 3.809370096282903e-06, - "loss": 0.41851678490638733, - "mean_token_accuracy": 0.8548051714897156, - "num_tokens": 12603876.0, - "step": 1416 - }, - { - "epoch": 1.0767477203647418, - "grad_norm": 1.7812079191207886, - "learning_rate": 3.807585477897296e-06, - "loss": 0.47113919258117676, - "mean_token_accuracy": 0.8346904516220093, - "num_tokens": 12613402.0, - "step": 1417 - }, - { - "epoch": 1.0775075987841944, - "grad_norm": 1.4335212707519531, - "learning_rate": 3.8057999417978654e-06, - "loss": 0.3802063465118408, - "mean_token_accuracy": 0.8563423156738281, - "num_tokens": 12626865.0, - "step": 1418 - }, - { - "epoch": 1.0782674772036474, - "grad_norm": 1.9171305894851685, - "learning_rate": 3.8040134892377702e-06, - "loss": 0.20898357033729553, - "mean_token_accuracy": 0.9189738035202026, - "num_tokens": 12632593.0, - "step": 1419 - }, - { - "epoch": 1.0790273556231003, - "grad_norm": 1.4996821880340576, - "learning_rate": 3.802226121470811e-06, - "loss": 0.4203261137008667, - "mean_token_accuracy": 0.8479211330413818, - "num_tokens": 12646395.0, - "step": 1420 - }, - { - "epoch": 1.0797872340425532, - "grad_norm": 2.2007253170013428, - "learning_rate": 3.800437839751432e-06, - "loss": 0.40370577573776245, - "mean_token_accuracy": 0.8427679538726807, - "num_tokens": 12653508.0, - "step": 1421 - }, - { - "epoch": 1.080547112462006, - "grad_norm": 1.7266581058502197, - "learning_rate": 3.7986486453347183e-06, - "loss": 0.46750491857528687, - "mean_token_accuracy": 0.8429205417633057, - "num_tokens": 12666329.0, - "step": 1422 - }, - { - "epoch": 1.081306990881459, - "grad_norm": 1.4716318845748901, - "learning_rate": 3.796858539476394e-06, - "loss": 0.3330317735671997, - "mean_token_accuracy": 0.879012942314148, - "num_tokens": 12676741.0, - "step": 1423 - }, - { - "epoch": 1.082066869300912, - "grad_norm": 2.652127265930176, - "learning_rate": 3.795067523432826e-06, - "loss": 0.35365715622901917, - "mean_token_accuracy": 0.8796792030334473, - "num_tokens": 12681479.0, - "step": 1424 - }, - { - "epoch": 1.0828267477203648, - "grad_norm": 1.2937829494476318, - "learning_rate": 3.793275598461017e-06, - "loss": 0.25272446870803833, - "mean_token_accuracy": 0.9231734275817871, - "num_tokens": 12694238.0, - "step": 1425 - }, - { - "epoch": 1.0835866261398177, - "grad_norm": 1.3831220865249634, - "learning_rate": 3.7914827658186104e-06, - "loss": 0.4935331344604492, - "mean_token_accuracy": 0.8417420387268066, - "num_tokens": 12712857.0, - "step": 1426 - }, - { - "epoch": 1.0843465045592706, - "grad_norm": 3.059525728225708, - "learning_rate": 3.7896890267638832e-06, - "loss": 0.2592190206050873, - "mean_token_accuracy": 0.9040263295173645, - "num_tokens": 12716766.0, - "step": 1427 - }, - { - "epoch": 1.0851063829787233, - "grad_norm": 2.8399202823638916, - "learning_rate": 3.787894382555752e-06, - "loss": 0.32098138332366943, - "mean_token_accuracy": 0.8838302493095398, - "num_tokens": 12720774.0, - "step": 1428 - }, - { - "epoch": 1.0858662613981762, - "grad_norm": 2.618479013442993, - "learning_rate": 3.7860988344537664e-06, - "loss": 0.425255686044693, - "mean_token_accuracy": 0.8564130067825317, - "num_tokens": 12726506.0, - "step": 1429 - }, - { - "epoch": 1.0866261398176291, - "grad_norm": 1.3108669519424438, - "learning_rate": 3.7843023837181126e-06, - "loss": 0.40220165252685547, - "mean_token_accuracy": 0.8588873147964478, - "num_tokens": 12742814.0, - "step": 1430 - }, - { - "epoch": 1.087386018237082, - "grad_norm": 2.2083566188812256, - "learning_rate": 3.782505031609607e-06, - "loss": 0.318379282951355, - "mean_token_accuracy": 0.8887606859207153, - "num_tokens": 12748388.0, - "step": 1431 - }, - { - "epoch": 1.088145896656535, - "grad_norm": 1.922358751296997, - "learning_rate": 3.7807067793897006e-06, - "loss": 0.2519589364528656, - "mean_token_accuracy": 0.8936764001846313, - "num_tokens": 12754761.0, - "step": 1432 - }, - { - "epoch": 1.0889057750759878, - "grad_norm": 1.7367439270019531, - "learning_rate": 3.778907628320477e-06, - "loss": 0.3970367908477783, - "mean_token_accuracy": 0.858735203742981, - "num_tokens": 12764016.0, - "step": 1433 - }, - { - "epoch": 1.0896656534954408, - "grad_norm": 2.1931066513061523, - "learning_rate": 3.77710757966465e-06, - "loss": 0.5250554084777832, - "mean_token_accuracy": 0.8356746435165405, - "num_tokens": 12772272.0, - "step": 1434 - }, - { - "epoch": 1.0904255319148937, - "grad_norm": 1.718337893486023, - "learning_rate": 3.775306634685562e-06, - "loss": 0.283231645822525, - "mean_token_accuracy": 0.9009919166564941, - "num_tokens": 12780706.0, - "step": 1435 - }, - { - "epoch": 1.0911854103343466, - "grad_norm": 2.1985926628112793, - "learning_rate": 3.773504794647187e-06, - "loss": 0.3913170397281647, - "mean_token_accuracy": 0.8909255266189575, - "num_tokens": 12787052.0, - "step": 1436 - }, - { - "epoch": 1.0919452887537995, - "grad_norm": 2.8687937259674072, - "learning_rate": 3.771702060814123e-06, - "loss": 0.3135771155357361, - "mean_token_accuracy": 0.9016125202178955, - "num_tokens": 12791854.0, - "step": 1437 - }, - { - "epoch": 1.0927051671732522, - "grad_norm": 4.203946590423584, - "learning_rate": 3.7698984344516e-06, - "loss": 0.3642737865447998, - "mean_token_accuracy": 0.8842349052429199, - "num_tokens": 12794969.0, - "step": 1438 - }, - { - "epoch": 1.093465045592705, - "grad_norm": 1.5134642124176025, - "learning_rate": 3.7680939168254733e-06, - "loss": 0.3732057213783264, - "mean_token_accuracy": 0.8671083450317383, - "num_tokens": 12808480.0, - "step": 1439 - }, - { - "epoch": 1.094224924012158, - "grad_norm": 3.2103970050811768, - "learning_rate": 3.7662885092022206e-06, - "loss": 0.3556194603443146, - "mean_token_accuracy": 0.8786529302597046, - "num_tokens": 12812654.0, - "step": 1440 - }, - { - "epoch": 1.094984802431611, - "grad_norm": 2.2774064540863037, - "learning_rate": 3.7644822128489476e-06, - "loss": 0.38409674167633057, - "mean_token_accuracy": 0.866563081741333, - "num_tokens": 12819854.0, - "step": 1441 - }, - { - "epoch": 1.0957446808510638, - "grad_norm": 1.8250885009765625, - "learning_rate": 3.7626750290333824e-06, - "loss": 0.3812350034713745, - "mean_token_accuracy": 0.8676212430000305, - "num_tokens": 12830338.0, - "step": 1442 - }, - { - "epoch": 1.0965045592705167, - "grad_norm": 1.8337891101837158, - "learning_rate": 3.7608669590238765e-06, - "loss": 0.3892471194267273, - "mean_token_accuracy": 0.8616238832473755, - "num_tokens": 12840340.0, - "step": 1443 - }, - { - "epoch": 1.0972644376899696, - "grad_norm": 1.5300254821777344, - "learning_rate": 3.7590580040894025e-06, - "loss": 0.35288217663764954, - "mean_token_accuracy": 0.8625509738922119, - "num_tokens": 12853144.0, - "step": 1444 - }, - { - "epoch": 1.0980243161094225, - "grad_norm": 2.152683734893799, - "learning_rate": 3.7572481654995554e-06, - "loss": 0.4004772901535034, - "mean_token_accuracy": 0.858427107334137, - "num_tokens": 12859970.0, - "step": 1445 - }, - { - "epoch": 1.0987841945288754, - "grad_norm": 1.532832145690918, - "learning_rate": 3.755437444524548e-06, - "loss": 0.46820127964019775, - "mean_token_accuracy": 0.8585472106933594, - "num_tokens": 12875243.0, - "step": 1446 - }, - { - "epoch": 1.0995440729483283, - "grad_norm": 1.6485342979431152, - "learning_rate": 3.7536258424352164e-06, - "loss": 0.46329325437545776, - "mean_token_accuracy": 0.8376060724258423, - "num_tokens": 12886383.0, - "step": 1447 - }, - { - "epoch": 1.1003039513677813, - "grad_norm": 2.402256488800049, - "learning_rate": 3.75181336050301e-06, - "loss": 0.43916207551956177, - "mean_token_accuracy": 0.8448786735534668, - "num_tokens": 12892613.0, - "step": 1448 - }, - { - "epoch": 1.101063829787234, - "grad_norm": 1.3893651962280273, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.3919021785259247, - "mean_token_accuracy": 0.8495820760726929, - "num_tokens": 12905523.0, - "step": 1449 - }, - { - "epoch": 1.1018237082066868, - "grad_norm": 1.5519827604293823, - "learning_rate": 3.7481857621988734e-06, - "loss": 0.4710700809955597, - "mean_token_accuracy": 0.8387632369995117, - "num_tokens": 12918236.0, - "step": 1450 - }, - { - "epoch": 1.1025835866261398, - "grad_norm": 2.0141353607177734, - "learning_rate": 3.74637064837293e-06, - "loss": 0.30866751074790955, - "mean_token_accuracy": 0.9059321880340576, - "num_tokens": 12924391.0, - "step": 1451 - }, - { - "epoch": 1.1033434650455927, - "grad_norm": 1.2201496362686157, - "learning_rate": 3.7445546597960882e-06, - "loss": 0.3938257396221161, - "mean_token_accuracy": 0.8726630210876465, - "num_tokens": 12943338.0, - "step": 1452 - }, - { - "epoch": 1.1041033434650456, - "grad_norm": 2.29434871673584, - "learning_rate": 3.742737797742878e-06, - "loss": 0.4347776174545288, - "mean_token_accuracy": 0.840569257736206, - "num_tokens": 12950636.0, - "step": 1453 - }, - { - "epoch": 1.1048632218844985, - "grad_norm": 2.3875105381011963, - "learning_rate": 3.7409200634884425e-06, - "loss": 0.48353564739227295, - "mean_token_accuracy": 0.8207056522369385, - "num_tokens": 12957635.0, - "step": 1454 - }, - { - "epoch": 1.1056231003039514, - "grad_norm": 2.3539648056030273, - "learning_rate": 3.7391014583085384e-06, - "loss": 0.3532431721687317, - "mean_token_accuracy": 0.8903788924217224, - "num_tokens": 12963032.0, - "step": 1455 - }, - { - "epoch": 1.1063829787234043, - "grad_norm": 1.5611135959625244, - "learning_rate": 3.737281983479534e-06, - "loss": 0.4734863042831421, - "mean_token_accuracy": 0.8413879871368408, - "num_tokens": 12977170.0, - "step": 1456 - }, - { - "epoch": 1.1071428571428572, - "grad_norm": 1.474320411682129, - "learning_rate": 3.735461640278404e-06, - "loss": 0.41854286193847656, - "mean_token_accuracy": 0.8499876856803894, - "num_tokens": 12993750.0, - "step": 1457 - }, - { - "epoch": 1.1079027355623101, - "grad_norm": 2.6873273849487305, - "learning_rate": 3.733640429982738e-06, - "loss": 0.47637903690338135, - "mean_token_accuracy": 0.83599853515625, - "num_tokens": 12999058.0, - "step": 1458 - }, - { - "epoch": 1.108662613981763, - "grad_norm": 1.4575026035308838, - "learning_rate": 3.731818353870729e-06, - "loss": 0.38441652059555054, - "mean_token_accuracy": 0.8582364320755005, - "num_tokens": 13013864.0, - "step": 1459 - }, - { - "epoch": 1.1094224924012157, - "grad_norm": 1.7722690105438232, - "learning_rate": 3.729995413221183e-06, - "loss": 0.4224998950958252, - "mean_token_accuracy": 0.8511888384819031, - "num_tokens": 13023714.0, - "step": 1460 - }, - { - "epoch": 1.1101823708206686, - "grad_norm": 2.625760555267334, - "learning_rate": 3.7281716093135068e-06, - "loss": 0.3487582802772522, - "mean_token_accuracy": 0.8834779262542725, - "num_tokens": 13028608.0, - "step": 1461 - }, - { - "epoch": 1.1109422492401215, - "grad_norm": 1.2554056644439697, - "learning_rate": 3.726346943427719e-06, - "loss": 0.33312469720840454, - "mean_token_accuracy": 0.8704153299331665, - "num_tokens": 13044901.0, - "step": 1462 - }, - { - "epoch": 1.1117021276595744, - "grad_norm": 2.1109910011291504, - "learning_rate": 3.7245214168444388e-06, - "loss": 0.387290894985199, - "mean_token_accuracy": 0.860816240310669, - "num_tokens": 13051452.0, - "step": 1463 - }, - { - "epoch": 1.1124620060790273, - "grad_norm": 3.159201145172119, - "learning_rate": 3.722695030844891e-06, - "loss": 0.37690871953964233, - "mean_token_accuracy": 0.8717561960220337, - "num_tokens": 13055131.0, - "step": 1464 - }, - { - "epoch": 1.1132218844984803, - "grad_norm": 1.3810011148452759, - "learning_rate": 3.7208677867109042e-06, - "loss": 0.36598485708236694, - "mean_token_accuracy": 0.8683375120162964, - "num_tokens": 13069798.0, - "step": 1465 - }, - { - "epoch": 1.1139817629179332, - "grad_norm": 2.500849485397339, - "learning_rate": 3.7190396857249087e-06, - "loss": 0.2781746983528137, - "mean_token_accuracy": 0.9026005268096924, - "num_tokens": 13075127.0, - "step": 1466 - }, - { - "epoch": 1.114741641337386, - "grad_norm": 1.7445712089538574, - "learning_rate": 3.7172107291699356e-06, - "loss": 0.5055314302444458, - "mean_token_accuracy": 0.8252174258232117, - "num_tokens": 13084843.0, - "step": 1467 - }, - { - "epoch": 1.115501519756839, - "grad_norm": 1.6386256217956543, - "learning_rate": 3.7153809183296174e-06, - "loss": 0.38478314876556396, - "mean_token_accuracy": 0.8600847721099854, - "num_tokens": 13096517.0, - "step": 1468 - }, - { - "epoch": 1.1162613981762919, - "grad_norm": 2.3818395137786865, - "learning_rate": 3.713550254488185e-06, - "loss": 0.40308547019958496, - "mean_token_accuracy": 0.8628184795379639, - "num_tokens": 13102324.0, - "step": 1469 - }, - { - "epoch": 1.1170212765957448, - "grad_norm": 1.73163640499115, - "learning_rate": 3.7117187389304703e-06, - "loss": 0.5035421848297119, - "mean_token_accuracy": 0.8229597210884094, - "num_tokens": 13113763.0, - "step": 1470 - }, - { - "epoch": 1.1177811550151975, - "grad_norm": 3.147177219390869, - "learning_rate": 3.7098863729418997e-06, - "loss": 0.557449221611023, - "mean_token_accuracy": 0.8266849517822266, - "num_tokens": 13118849.0, - "step": 1471 - }, - { - "epoch": 1.1185410334346504, - "grad_norm": 1.5061391592025757, - "learning_rate": 3.7080531578085e-06, - "loss": 0.3759554922580719, - "mean_token_accuracy": 0.8541903495788574, - "num_tokens": 13131337.0, - "step": 1472 - }, - { - "epoch": 1.1193009118541033, - "grad_norm": 2.172346353530884, - "learning_rate": 3.7062190948168906e-06, - "loss": 0.41491609811782837, - "mean_token_accuracy": 0.8531454801559448, - "num_tokens": 13139767.0, - "step": 1473 - }, - { - "epoch": 1.1200607902735562, - "grad_norm": 2.1527154445648193, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4309239387512207, - "mean_token_accuracy": 0.8327745199203491, - "num_tokens": 13147210.0, - "step": 1474 - }, - { - "epoch": 1.1208206686930091, - "grad_norm": 1.8342832326889038, - "learning_rate": 3.7025484304085035e-06, - "loss": 0.34393298625946045, - "mean_token_accuracy": 0.8948153257369995, - "num_tokens": 13154831.0, - "step": 1475 - }, - { - "epoch": 1.121580547112462, - "grad_norm": 2.509291172027588, - "learning_rate": 3.7007118315679384e-06, - "loss": 0.4479471445083618, - "mean_token_accuracy": 0.8280234336853027, - "num_tokens": 13161040.0, - "step": 1476 - }, - { - "epoch": 1.122340425531915, - "grad_norm": 2.914710521697998, - "learning_rate": 3.6988743900215895e-06, - "loss": 0.3724832832813263, - "mean_token_accuracy": 0.863893985748291, - "num_tokens": 13164975.0, - "step": 1477 - }, - { - "epoch": 1.1231003039513678, - "grad_norm": 3.274808645248413, - "learning_rate": 3.6970361070590443e-06, - "loss": 0.4088161885738373, - "mean_token_accuracy": 0.8474822044372559, - "num_tokens": 13168826.0, - "step": 1478 - }, - { - "epoch": 1.1238601823708207, - "grad_norm": 2.861546277999878, - "learning_rate": 3.695196983970481e-06, - "loss": 0.45837992429733276, - "mean_token_accuracy": 0.8579759001731873, - "num_tokens": 13173794.0, - "step": 1479 - }, - { - "epoch": 1.1246200607902737, - "grad_norm": 1.9491597414016724, - "learning_rate": 3.6933570220466654e-06, - "loss": 0.4333910346031189, - "mean_token_accuracy": 0.8444236516952515, - "num_tokens": 13181598.0, - "step": 1480 - }, - { - "epoch": 1.1253799392097266, - "grad_norm": 1.329848051071167, - "learning_rate": 3.6915162225789546e-06, - "loss": 0.36404621601104736, - "mean_token_accuracy": 0.8694117069244385, - "num_tokens": 13196381.0, - "step": 1481 - }, - { - "epoch": 1.1261398176291793, - "grad_norm": 1.8854197263717651, - "learning_rate": 3.6896745868592924e-06, - "loss": 0.4085756838321686, - "mean_token_accuracy": 0.855188250541687, - "num_tokens": 13205236.0, - "step": 1482 - }, - { - "epoch": 1.1268996960486322, - "grad_norm": 3.01684832572937, - "learning_rate": 3.6878321161802106e-06, - "loss": 0.28105655312538147, - "mean_token_accuracy": 0.9009426236152649, - "num_tokens": 13209380.0, - "step": 1483 - }, - { - "epoch": 1.127659574468085, - "grad_norm": 1.8051308393478394, - "learning_rate": 3.685988811834823e-06, - "loss": 0.3314531147480011, - "mean_token_accuracy": 0.8805814385414124, - "num_tokens": 13217714.0, - "step": 1484 - }, - { - "epoch": 1.128419452887538, - "grad_norm": 1.61757493019104, - "learning_rate": 3.684144675116836e-06, - "loss": 0.4543863534927368, - "mean_token_accuracy": 0.8400536775588989, - "num_tokens": 13229330.0, - "step": 1485 - }, - { - "epoch": 1.1291793313069909, - "grad_norm": 1.602686882019043, - "learning_rate": 3.682299707320532e-06, - "loss": 0.3653204143047333, - "mean_token_accuracy": 0.8655825853347778, - "num_tokens": 13242872.0, - "step": 1486 - }, - { - "epoch": 1.1299392097264438, - "grad_norm": 2.3093113899230957, - "learning_rate": 3.680453909740782e-06, - "loss": 0.4383693039417267, - "mean_token_accuracy": 0.839782178401947, - "num_tokens": 13248976.0, - "step": 1487 - }, - { - "epoch": 1.1306990881458967, - "grad_norm": 1.180559754371643, - "learning_rate": 3.6786072836730376e-06, - "loss": 0.5354755520820618, - "mean_token_accuracy": 0.8151205778121948, - "num_tokens": 13272896.0, - "step": 1488 - }, - { - "epoch": 1.1314589665653496, - "grad_norm": 1.9554040431976318, - "learning_rate": 3.6767598304133325e-06, - "loss": 0.4485316872596741, - "mean_token_accuracy": 0.8399936556816101, - "num_tokens": 13280757.0, - "step": 1489 - }, - { - "epoch": 1.1322188449848025, - "grad_norm": 2.236471176147461, - "learning_rate": 3.674911551258279e-06, - "loss": 0.45594364404678345, - "mean_token_accuracy": 0.8552400469779968, - "num_tokens": 13287328.0, - "step": 1490 - }, - { - "epoch": 1.1329787234042552, - "grad_norm": 2.5228686332702637, - "learning_rate": 3.673062447505072e-06, - "loss": 0.4048641622066498, - "mean_token_accuracy": 0.8617376685142517, - "num_tokens": 13292716.0, - "step": 1491 - }, - { - "epoch": 1.1337386018237081, - "grad_norm": 1.1274473667144775, - "learning_rate": 3.6712125204514836e-06, - "loss": 0.3848876357078552, - "mean_token_accuracy": 0.8672975301742554, - "num_tokens": 13313403.0, - "step": 1492 - }, - { - "epoch": 1.134498480243161, - "grad_norm": 2.349541425704956, - "learning_rate": 3.6693617713958633e-06, - "loss": 0.3166058361530304, - "mean_token_accuracy": 0.8896721601486206, - "num_tokens": 13318720.0, - "step": 1493 - }, - { - "epoch": 1.135258358662614, - "grad_norm": 2.2438278198242188, - "learning_rate": 3.6675102016371387e-06, - "loss": 0.5418218970298767, - "mean_token_accuracy": 0.8256527185440063, - "num_tokens": 13325360.0, - "step": 1494 - }, - { - "epoch": 1.1360182370820668, - "grad_norm": 2.21268892288208, - "learning_rate": 3.665657812474812e-06, - "loss": 0.48603951930999756, - "mean_token_accuracy": 0.8273470401763916, - "num_tokens": 13333217.0, - "step": 1495 - }, - { - "epoch": 1.1367781155015197, - "grad_norm": 2.6105997562408447, - "learning_rate": 3.6638046052089614e-06, - "loss": 0.31221291422843933, - "mean_token_accuracy": 0.888375997543335, - "num_tokens": 13338413.0, - "step": 1496 - }, - { - "epoch": 1.1375379939209727, - "grad_norm": 3.655658483505249, - "learning_rate": 3.661950581140239e-06, - "loss": 0.3609023988246918, - "mean_token_accuracy": 0.8838576078414917, - "num_tokens": 13341499.0, - "step": 1497 - }, - { - "epoch": 1.1382978723404256, - "grad_norm": 2.242009162902832, - "learning_rate": 3.660095741569871e-06, - "loss": 0.40022802352905273, - "mean_token_accuracy": 0.8559960722923279, - "num_tokens": 13347917.0, - "step": 1498 - }, - { - "epoch": 1.1390577507598785, - "grad_norm": 1.7958979606628418, - "learning_rate": 3.658240087799655e-06, - "loss": 0.499157190322876, - "mean_token_accuracy": 0.8423802256584167, - "num_tokens": 13361570.0, - "step": 1499 - }, - { - "epoch": 1.1398176291793314, - "grad_norm": 2.5406908988952637, - "learning_rate": 3.6563836211319593e-06, - "loss": 0.4090137481689453, - "mean_token_accuracy": 0.8769663572311401, - "num_tokens": 13367183.0, - "step": 1500 - }, - { - "epoch": 1.1405775075987843, - "grad_norm": 1.9861716032028198, - "learning_rate": 3.654526342869724e-06, - "loss": 0.5125207304954529, - "mean_token_accuracy": 0.8315266370773315, - "num_tokens": 13376767.0, - "step": 1501 - }, - { - "epoch": 1.141337386018237, - "grad_norm": 1.731188178062439, - "learning_rate": 3.65266825431646e-06, - "loss": 0.39452576637268066, - "mean_token_accuracy": 0.8585706353187561, - "num_tokens": 13388437.0, - "step": 1502 - }, - { - "epoch": 1.1420972644376899, - "grad_norm": 1.5203773975372314, - "learning_rate": 3.6508093567762425e-06, - "loss": 0.39466819167137146, - "mean_token_accuracy": 0.8584027886390686, - "num_tokens": 13399727.0, - "step": 1503 - }, - { - "epoch": 1.1428571428571428, - "grad_norm": 2.606462001800537, - "learning_rate": 3.6489496515537204e-06, - "loss": 0.4521079361438751, - "mean_token_accuracy": 0.8413360118865967, - "num_tokens": 13408426.0, - "step": 1504 - }, - { - "epoch": 1.1436170212765957, - "grad_norm": 2.6207993030548096, - "learning_rate": 3.647089139954104e-06, - "loss": 0.4709353446960449, - "mean_token_accuracy": 0.8397113084793091, - "num_tokens": 13413506.0, - "step": 1505 - }, - { - "epoch": 1.1443768996960486, - "grad_norm": 1.7214165925979614, - "learning_rate": 3.6452278232831734e-06, - "loss": 0.45506367087364197, - "mean_token_accuracy": 0.8466023206710815, - "num_tokens": 13424592.0, - "step": 1506 - }, - { - "epoch": 1.1451367781155015, - "grad_norm": 1.7111759185791016, - "learning_rate": 3.643365702847272e-06, - "loss": 0.5016278624534607, - "mean_token_accuracy": 0.8196234703063965, - "num_tokens": 13434421.0, - "step": 1507 - }, - { - "epoch": 1.1458966565349544, - "grad_norm": 1.7528148889541626, - "learning_rate": 3.641502779953307e-06, - "loss": 0.5020896196365356, - "mean_token_accuracy": 0.826249361038208, - "num_tokens": 13445286.0, - "step": 1508 - }, - { - "epoch": 1.1466565349544073, - "grad_norm": 1.3470909595489502, - "learning_rate": 3.639639055908751e-06, - "loss": 0.45765724778175354, - "mean_token_accuracy": 0.8380560278892517, - "num_tokens": 13465030.0, - "step": 1509 - }, - { - "epoch": 1.1474164133738602, - "grad_norm": 2.4846835136413574, - "learning_rate": 3.6377745320216346e-06, - "loss": 0.46488267183303833, - "mean_token_accuracy": 0.8393925428390503, - "num_tokens": 13470883.0, - "step": 1510 - }, - { - "epoch": 1.1481762917933132, - "grad_norm": 1.770201563835144, - "learning_rate": 3.635909209600555e-06, - "loss": 0.5262179374694824, - "mean_token_accuracy": 0.8201162815093994, - "num_tokens": 13482558.0, - "step": 1511 - }, - { - "epoch": 1.148936170212766, - "grad_norm": 1.5955098867416382, - "learning_rate": 3.6340430899546656e-06, - "loss": 0.430621862411499, - "mean_token_accuracy": 0.8488553762435913, - "num_tokens": 13493003.0, - "step": 1512 - }, - { - "epoch": 1.1496960486322187, - "grad_norm": 2.846176862716675, - "learning_rate": 3.632176174393682e-06, - "loss": 0.23461638391017914, - "mean_token_accuracy": 0.9218817353248596, - "num_tokens": 13496566.0, - "step": 1513 - }, - { - "epoch": 1.1504559270516717, - "grad_norm": 1.9606610536575317, - "learning_rate": 3.630308464227877e-06, - "loss": 0.4940161108970642, - "mean_token_accuracy": 0.8474864959716797, - "num_tokens": 13504843.0, - "step": 1514 - }, - { - "epoch": 1.1512158054711246, - "grad_norm": 1.1588608026504517, - "learning_rate": 3.628439960768082e-06, - "loss": 0.32650992274284363, - "mean_token_accuracy": 0.8797246217727661, - "num_tokens": 13521513.0, - "step": 1515 - }, - { - "epoch": 1.1519756838905775, - "grad_norm": 1.3566495180130005, - "learning_rate": 3.6265706653256837e-06, - "loss": 0.4359064996242523, - "mean_token_accuracy": 0.8379859328269958, - "num_tokens": 13540608.0, - "step": 1516 - }, - { - "epoch": 1.1527355623100304, - "grad_norm": 1.4728609323501587, - "learning_rate": 3.624700579212626e-06, - "loss": 0.29939693212509155, - "mean_token_accuracy": 0.8831408023834229, - "num_tokens": 13550641.0, - "step": 1517 - }, - { - "epoch": 1.1534954407294833, - "grad_norm": 2.162325382232666, - "learning_rate": 3.6228297037414077e-06, - "loss": 0.4097636938095093, - "mean_token_accuracy": 0.8575425148010254, - "num_tokens": 13556931.0, - "step": 1518 - }, - { - "epoch": 1.1542553191489362, - "grad_norm": 1.754439353942871, - "learning_rate": 3.6209580402250816e-06, - "loss": 0.400202214717865, - "mean_token_accuracy": 0.8569821119308472, - "num_tokens": 13565491.0, - "step": 1519 - }, - { - "epoch": 1.155015197568389, - "grad_norm": 1.5250083208084106, - "learning_rate": 3.619085589977251e-06, - "loss": 0.43330419063568115, - "mean_token_accuracy": 0.8492985963821411, - "num_tokens": 13577147.0, - "step": 1520 - }, - { - "epoch": 1.155775075987842, - "grad_norm": 1.9108905792236328, - "learning_rate": 3.617212354312076e-06, - "loss": 0.30567464232444763, - "mean_token_accuracy": 0.8850164413452148, - "num_tokens": 13584366.0, - "step": 1521 - }, - { - "epoch": 1.156534954407295, - "grad_norm": 2.2574243545532227, - "learning_rate": 3.615338334544265e-06, - "loss": 0.4391738772392273, - "mean_token_accuracy": 0.839765727519989, - "num_tokens": 13591816.0, - "step": 1522 - }, - { - "epoch": 1.1572948328267478, - "grad_norm": 2.1235218048095703, - "learning_rate": 3.6134635319890763e-06, - "loss": 0.45043107867240906, - "mean_token_accuracy": 0.8385299444198608, - "num_tokens": 13599736.0, - "step": 1523 - }, - { - "epoch": 1.1580547112462005, - "grad_norm": 2.2274110317230225, - "learning_rate": 3.611587947962319e-06, - "loss": 0.3623226284980774, - "mean_token_accuracy": 0.8724044561386108, - "num_tokens": 13605354.0, - "step": 1524 - }, - { - "epoch": 1.1588145896656534, - "grad_norm": 3.414236545562744, - "learning_rate": 3.6097115837803504e-06, - "loss": 0.30060696601867676, - "mean_token_accuracy": 0.8971061706542969, - "num_tokens": 13608851.0, - "step": 1525 - }, - { - "epoch": 1.1595744680851063, - "grad_norm": 2.496264696121216, - "learning_rate": 3.6078344407600744e-06, - "loss": 0.3567180037498474, - "mean_token_accuracy": 0.8596180081367493, - "num_tokens": 13614339.0, - "step": 1526 - }, - { - "epoch": 1.1603343465045592, - "grad_norm": 2.0191843509674072, - "learning_rate": 3.6059565202189433e-06, - "loss": 0.43206095695495605, - "mean_token_accuracy": 0.8464000821113586, - "num_tokens": 13622395.0, - "step": 1527 - }, - { - "epoch": 1.1610942249240122, - "grad_norm": 1.5475906133651733, - "learning_rate": 3.604077823474954e-06, - "loss": 0.4535648226737976, - "mean_token_accuracy": 0.8391586542129517, - "num_tokens": 13635356.0, - "step": 1528 - }, - { - "epoch": 1.161854103343465, - "grad_norm": 2.1348211765289307, - "learning_rate": 3.6021983518466468e-06, - "loss": 0.2733963429927826, - "mean_token_accuracy": 0.9007417559623718, - "num_tokens": 13640641.0, - "step": 1529 - }, - { - "epoch": 1.162613981762918, - "grad_norm": 2.8452792167663574, - "learning_rate": 3.600318106653108e-06, - "loss": 0.29591235518455505, - "mean_token_accuracy": 0.8934413194656372, - "num_tokens": 13644995.0, - "step": 1530 - }, - { - "epoch": 1.1633738601823709, - "grad_norm": 2.342907190322876, - "learning_rate": 3.5984370892139663e-06, - "loss": 0.4675130248069763, - "mean_token_accuracy": 0.8352028131484985, - "num_tokens": 13652695.0, - "step": 1531 - }, - { - "epoch": 1.1641337386018238, - "grad_norm": 2.3480238914489746, - "learning_rate": 3.5965553008493924e-06, - "loss": 0.3114515542984009, - "mean_token_accuracy": 0.8845353126525879, - "num_tokens": 13658101.0, - "step": 1532 - }, - { - "epoch": 1.1648936170212765, - "grad_norm": 1.8608155250549316, - "learning_rate": 3.594672742880097e-06, - "loss": 0.3864145278930664, - "mean_token_accuracy": 0.867354154586792, - "num_tokens": 13666042.0, - "step": 1533 - }, - { - "epoch": 1.1656534954407296, - "grad_norm": 1.4756088256835938, - "learning_rate": 3.5927894166273324e-06, - "loss": 0.3671600818634033, - "mean_token_accuracy": 0.8695988655090332, - "num_tokens": 13678253.0, - "step": 1534 - }, - { - "epoch": 1.1664133738601823, - "grad_norm": 2.8831355571746826, - "learning_rate": 3.5909053234128893e-06, - "loss": 0.267184317111969, - "mean_token_accuracy": 0.9008115530014038, - "num_tokens": 13681790.0, - "step": 1535 - }, - { - "epoch": 1.1671732522796352, - "grad_norm": 2.1984763145446777, - "learning_rate": 3.5890204645590964e-06, - "loss": 0.4431505799293518, - "mean_token_accuracy": 0.8623673915863037, - "num_tokens": 13688444.0, - "step": 1536 - }, - { - "epoch": 1.167933130699088, - "grad_norm": 1.8271523714065552, - "learning_rate": 3.5871348413888207e-06, - "loss": 0.3861040771007538, - "mean_token_accuracy": 0.8624277114868164, - "num_tokens": 13696872.0, - "step": 1537 - }, - { - "epoch": 1.168693009118541, - "grad_norm": 1.6313756704330444, - "learning_rate": 3.585248455225466e-06, - "loss": 0.3775154948234558, - "mean_token_accuracy": 0.8624461889266968, - "num_tokens": 13706167.0, - "step": 1538 - }, - { - "epoch": 1.169452887537994, - "grad_norm": 2.4377901554107666, - "learning_rate": 3.5833613073929684e-06, - "loss": 0.2308957427740097, - "mean_token_accuracy": 0.920600175857544, - "num_tokens": 13710367.0, - "step": 1539 - }, - { - "epoch": 1.1702127659574468, - "grad_norm": 2.2621750831604004, - "learning_rate": 3.5814733992158025e-06, - "loss": 0.33167219161987305, - "mean_token_accuracy": 0.8963261842727661, - "num_tokens": 13716384.0, - "step": 1540 - }, - { - "epoch": 1.1709726443768997, - "grad_norm": 1.3178150653839111, - "learning_rate": 3.579584732018975e-06, - "loss": 0.3276631832122803, - "mean_token_accuracy": 0.8853521347045898, - "num_tokens": 13731031.0, - "step": 1541 - }, - { - "epoch": 1.1717325227963526, - "grad_norm": 2.177750587463379, - "learning_rate": 3.577695307128024e-06, - "loss": 0.48177266120910645, - "mean_token_accuracy": 0.830329418182373, - "num_tokens": 13737925.0, - "step": 1542 - }, - { - "epoch": 1.1724924012158056, - "grad_norm": 2.2268829345703125, - "learning_rate": 3.5758051258690223e-06, - "loss": 0.48843517899513245, - "mean_token_accuracy": 0.8310644030570984, - "num_tokens": 13746039.0, - "step": 1543 - }, - { - "epoch": 1.1732522796352582, - "grad_norm": 1.498701572418213, - "learning_rate": 3.5739141895685708e-06, - "loss": 0.4542962312698364, - "mean_token_accuracy": 0.8500330448150635, - "num_tokens": 13765002.0, - "step": 1544 - }, - { - "epoch": 1.1740121580547112, - "grad_norm": 1.786670446395874, - "learning_rate": 3.5720224995538023e-06, - "loss": 0.27367928624153137, - "mean_token_accuracy": 0.8916142582893372, - "num_tokens": 13774113.0, - "step": 1545 - }, - { - "epoch": 1.174772036474164, - "grad_norm": 2.0311272144317627, - "learning_rate": 3.5701300571523757e-06, - "loss": 0.559987485408783, - "mean_token_accuracy": 0.8266973495483398, - "num_tokens": 13783912.0, - "step": 1546 - }, - { - "epoch": 1.175531914893617, - "grad_norm": 1.8732186555862427, - "learning_rate": 3.5682368636924825e-06, - "loss": 0.5184751152992249, - "mean_token_accuracy": 0.8450918197631836, - "num_tokens": 13792728.0, - "step": 1547 - }, - { - "epoch": 1.1762917933130699, - "grad_norm": 1.4410661458969116, - "learning_rate": 3.566342920502837e-06, - "loss": 0.383536696434021, - "mean_token_accuracy": 0.8672217726707458, - "num_tokens": 13813590.0, - "step": 1548 - }, - { - "epoch": 1.1770516717325228, - "grad_norm": 3.06056547164917, - "learning_rate": 3.564448228912682e-06, - "loss": 0.3941686153411865, - "mean_token_accuracy": 0.8696402311325073, - "num_tokens": 13817704.0, - "step": 1549 - }, - { - "epoch": 1.1778115501519757, - "grad_norm": 1.6150329113006592, - "learning_rate": 3.562552790251785e-06, - "loss": 0.41606605052948, - "mean_token_accuracy": 0.8488572835922241, - "num_tokens": 13831303.0, - "step": 1550 - }, - { - "epoch": 1.1785714285714286, - "grad_norm": 2.1199934482574463, - "learning_rate": 3.5606566058504377e-06, - "loss": 0.3974752426147461, - "mean_token_accuracy": 0.8686345219612122, - "num_tokens": 13837613.0, - "step": 1551 - }, - { - "epoch": 1.1793313069908815, - "grad_norm": 1.5683876276016235, - "learning_rate": 3.558759677039455e-06, - "loss": 0.35225993394851685, - "mean_token_accuracy": 0.8710784316062927, - "num_tokens": 13846779.0, - "step": 1552 - }, - { - "epoch": 1.1800911854103344, - "grad_norm": 1.4644675254821777, - "learning_rate": 3.5568620051501755e-06, - "loss": 0.38400042057037354, - "mean_token_accuracy": 0.8548328876495361, - "num_tokens": 13860713.0, - "step": 1553 - }, - { - "epoch": 1.1808510638297873, - "grad_norm": 1.461491346359253, - "learning_rate": 3.5549635915144578e-06, - "loss": 0.4572640061378479, - "mean_token_accuracy": 0.8506045937538147, - "num_tokens": 13877289.0, - "step": 1554 - }, - { - "epoch": 1.18161094224924, - "grad_norm": 2.6364715099334717, - "learning_rate": 3.553064437464682e-06, - "loss": 0.3954341411590576, - "mean_token_accuracy": 0.8561649322509766, - "num_tokens": 13882064.0, - "step": 1555 - }, - { - "epoch": 1.182370820668693, - "grad_norm": 2.027273654937744, - "learning_rate": 3.551164544333745e-06, - "loss": 0.47625732421875, - "mean_token_accuracy": 0.8349384069442749, - "num_tokens": 13890306.0, - "step": 1556 - }, - { - "epoch": 1.1831306990881458, - "grad_norm": 2.8427743911743164, - "learning_rate": 3.549263913455069e-06, - "loss": 0.4273033142089844, - "mean_token_accuracy": 0.8541387319564819, - "num_tokens": 13894882.0, - "step": 1557 - }, - { - "epoch": 1.1838905775075987, - "grad_norm": 1.6298975944519043, - "learning_rate": 3.5473625461625884e-06, - "loss": 0.4378639757633209, - "mean_token_accuracy": 0.8634963631629944, - "num_tokens": 13906152.0, - "step": 1558 - }, - { - "epoch": 1.1846504559270516, - "grad_norm": 2.4098947048187256, - "learning_rate": 3.5454604437907535e-06, - "loss": 0.47236716747283936, - "mean_token_accuracy": 0.8646864891052246, - "num_tokens": 13911803.0, - "step": 1559 - }, - { - "epoch": 1.1854103343465046, - "grad_norm": 1.5972497463226318, - "learning_rate": 3.543557607674537e-06, - "loss": 0.3001407980918884, - "mean_token_accuracy": 0.8927055597305298, - "num_tokens": 13921304.0, - "step": 1560 - }, - { - "epoch": 1.1861702127659575, - "grad_norm": 2.1140005588531494, - "learning_rate": 3.54165403914942e-06, - "loss": 0.41898271441459656, - "mean_token_accuracy": 0.8542245626449585, - "num_tokens": 13929434.0, - "step": 1561 - }, - { - "epoch": 1.1869300911854104, - "grad_norm": 1.8733803033828735, - "learning_rate": 3.539749739551401e-06, - "loss": 0.35469961166381836, - "mean_token_accuracy": 0.8805290460586548, - "num_tokens": 13937781.0, - "step": 1562 - }, - { - "epoch": 1.1876899696048633, - "grad_norm": 2.2805802822113037, - "learning_rate": 3.53784471021699e-06, - "loss": 0.44496792554855347, - "mean_token_accuracy": 0.8454172611236572, - "num_tokens": 13944394.0, - "step": 1563 - }, - { - "epoch": 1.1884498480243162, - "grad_norm": 0.9728449583053589, - "learning_rate": 3.535938952483211e-06, - "loss": 0.3156968355178833, - "mean_token_accuracy": 0.8739837408065796, - "num_tokens": 13966712.0, - "step": 1564 - }, - { - "epoch": 1.189209726443769, - "grad_norm": 3.025338888168335, - "learning_rate": 3.534032467687597e-06, - "loss": 0.30036938190460205, - "mean_token_accuracy": 0.9058252573013306, - "num_tokens": 13970183.0, - "step": 1565 - }, - { - "epoch": 1.1899696048632218, - "grad_norm": 2.0659425258636475, - "learning_rate": 3.532125257168193e-06, - "loss": 0.30619731545448303, - "mean_token_accuracy": 0.9041587710380554, - "num_tokens": 13976657.0, - "step": 1566 - }, - { - "epoch": 1.1907294832826747, - "grad_norm": 3.2036776542663574, - "learning_rate": 3.5302173222635526e-06, - "loss": 0.4145944118499756, - "mean_token_accuracy": 0.8502328395843506, - "num_tokens": 13981198.0, - "step": 1567 - }, - { - "epoch": 1.1914893617021276, - "grad_norm": 1.7767539024353027, - "learning_rate": 3.5283086643127396e-06, - "loss": 0.437128484249115, - "mean_token_accuracy": 0.8965631723403931, - "num_tokens": 13990259.0, - "step": 1568 - }, - { - "epoch": 1.1922492401215805, - "grad_norm": 1.7777384519577026, - "learning_rate": 3.5263992846553203e-06, - "loss": 0.33831220865249634, - "mean_token_accuracy": 0.8734279870986938, - "num_tokens": 13999363.0, - "step": 1569 - }, - { - "epoch": 1.1930091185410334, - "grad_norm": 1.6710708141326904, - "learning_rate": 3.5244891846313733e-06, - "loss": 0.4005590081214905, - "mean_token_accuracy": 0.8820298314094543, - "num_tokens": 14008719.0, - "step": 1570 - }, - { - "epoch": 1.1937689969604863, - "grad_norm": 1.0378777980804443, - "learning_rate": 3.5225783655814798e-06, - "loss": 0.3174915313720703, - "mean_token_accuracy": 0.8894162774085999, - "num_tokens": 14025806.0, - "step": 1571 - }, - { - "epoch": 1.1945288753799392, - "grad_norm": 1.2647521495819092, - "learning_rate": 3.520666828846726e-06, - "loss": 0.4173050820827484, - "mean_token_accuracy": 0.8437265157699585, - "num_tokens": 14046445.0, - "step": 1572 - }, - { - "epoch": 1.1952887537993921, - "grad_norm": 2.8625528812408447, - "learning_rate": 3.518754575768702e-06, - "loss": 0.37182557582855225, - "mean_token_accuracy": 0.8660947680473328, - "num_tokens": 14051197.0, - "step": 1573 - }, - { - "epoch": 1.196048632218845, - "grad_norm": 1.1213171482086182, - "learning_rate": 3.516841607689501e-06, - "loss": 0.332731157541275, - "mean_token_accuracy": 0.8573278784751892, - "num_tokens": 14070817.0, - "step": 1574 - }, - { - "epoch": 1.196808510638298, - "grad_norm": 1.197508692741394, - "learning_rate": 3.5149279259517165e-06, - "loss": 0.34058472514152527, - "mean_token_accuracy": 0.8603571653366089, - "num_tokens": 14085301.0, - "step": 1575 - }, - { - "epoch": 1.1975683890577509, - "grad_norm": 4.019949913024902, - "learning_rate": 3.5130135318984454e-06, - "loss": 0.3094622492790222, - "mean_token_accuracy": 0.8905094861984253, - "num_tokens": 14088107.0, - "step": 1576 - }, - { - "epoch": 1.1983282674772036, - "grad_norm": 2.591181755065918, - "learning_rate": 3.5110984268732827e-06, - "loss": 0.3407078981399536, - "mean_token_accuracy": 0.880385160446167, - "num_tokens": 14092887.0, - "step": 1577 - }, - { - "epoch": 1.1990881458966565, - "grad_norm": 1.3069331645965576, - "learning_rate": 3.509182612220322e-06, - "loss": 0.3761988878250122, - "mean_token_accuracy": 0.862013041973114, - "num_tokens": 14109216.0, - "step": 1578 - }, - { - "epoch": 1.1998480243161094, - "grad_norm": 1.7802022695541382, - "learning_rate": 3.507266089284157e-06, - "loss": 0.3824652135372162, - "mean_token_accuracy": 0.8707721829414368, - "num_tokens": 14119645.0, - "step": 1579 - }, - { - "epoch": 1.2006079027355623, - "grad_norm": 2.7937185764312744, - "learning_rate": 3.5053488594098763e-06, - "loss": 0.33828890323638916, - "mean_token_accuracy": 0.8765541315078735, - "num_tokens": 14124628.0, - "step": 1580 - }, - { - "epoch": 1.2013677811550152, - "grad_norm": 1.892671823501587, - "learning_rate": 3.5034309239430664e-06, - "loss": 0.3476094603538513, - "mean_token_accuracy": 0.9053795337677002, - "num_tokens": 14131756.0, - "step": 1581 - }, - { - "epoch": 1.202127659574468, - "grad_norm": 1.6857695579528809, - "learning_rate": 3.501512284229807e-06, - "loss": 0.5397108793258667, - "mean_token_accuracy": 0.8173421025276184, - "num_tokens": 14143024.0, - "step": 1582 - }, - { - "epoch": 1.202887537993921, - "grad_norm": 2.501737117767334, - "learning_rate": 3.4995929416166756e-06, - "loss": 0.4192458391189575, - "mean_token_accuracy": 0.8558136224746704, - "num_tokens": 14149499.0, - "step": 1583 - }, - { - "epoch": 1.203647416413374, - "grad_norm": 2.0133907794952393, - "learning_rate": 3.4976728974507387e-06, - "loss": 0.4791576564311981, - "mean_token_accuracy": 0.8253597021102905, - "num_tokens": 14158381.0, - "step": 1584 - }, - { - "epoch": 1.2044072948328268, - "grad_norm": 2.984611988067627, - "learning_rate": 3.4957521530795576e-06, - "loss": 0.3040750026702881, - "mean_token_accuracy": 0.8902391791343689, - "num_tokens": 14162419.0, - "step": 1585 - }, - { - "epoch": 1.2051671732522795, - "grad_norm": 1.518591284751892, - "learning_rate": 3.493830709851185e-06, - "loss": 0.35539618134498596, - "mean_token_accuracy": 0.8737183809280396, - "num_tokens": 14173048.0, - "step": 1586 - }, - { - "epoch": 1.2059270516717326, - "grad_norm": 2.628758192062378, - "learning_rate": 3.4919085691141636e-06, - "loss": 0.33340200781822205, - "mean_token_accuracy": 0.8705098628997803, - "num_tokens": 14178255.0, - "step": 1587 - }, - { - "epoch": 1.2066869300911853, - "grad_norm": 2.5565974712371826, - "learning_rate": 3.4899857322175252e-06, - "loss": 0.44939476251602173, - "mean_token_accuracy": 0.8315504193305969, - "num_tokens": 14183808.0, - "step": 1588 - }, - { - "epoch": 1.2074468085106382, - "grad_norm": 1.7521045207977295, - "learning_rate": 3.4880622005107916e-06, - "loss": 0.3168621063232422, - "mean_token_accuracy": 0.8824669122695923, - "num_tokens": 14192186.0, - "step": 1589 - }, - { - "epoch": 1.2082066869300911, - "grad_norm": 1.9816104173660278, - "learning_rate": 3.486137975343971e-06, - "loss": 0.3892582058906555, - "mean_token_accuracy": 0.8524188995361328, - "num_tokens": 14200512.0, - "step": 1590 - }, - { - "epoch": 1.208966565349544, - "grad_norm": 1.459800124168396, - "learning_rate": 3.484213058067559e-06, - "loss": 0.45930033922195435, - "mean_token_accuracy": 0.8408471345901489, - "num_tokens": 14215232.0, - "step": 1591 - }, - { - "epoch": 1.209726443768997, - "grad_norm": 2.015493154525757, - "learning_rate": 3.482287450032536e-06, - "loss": 0.5514016151428223, - "mean_token_accuracy": 0.8456779718399048, - "num_tokens": 14225402.0, - "step": 1592 - }, - { - "epoch": 1.2104863221884499, - "grad_norm": 3.4511911869049072, - "learning_rate": 3.4803611525903687e-06, - "loss": 0.4772771894931793, - "mean_token_accuracy": 0.8558698892593384, - "num_tokens": 14229038.0, - "step": 1593 - }, - { - "epoch": 1.2112462006079028, - "grad_norm": 2.2247982025146484, - "learning_rate": 3.4784341670930067e-06, - "loss": 0.4042825996875763, - "mean_token_accuracy": 0.8635870218276978, - "num_tokens": 14237057.0, - "step": 1594 - }, - { - "epoch": 1.2120060790273557, - "grad_norm": 2.0534820556640625, - "learning_rate": 3.4765064948928813e-06, - "loss": 0.34057414531707764, - "mean_token_accuracy": 0.8800770044326782, - "num_tokens": 14243013.0, - "step": 1595 - }, - { - "epoch": 1.2127659574468086, - "grad_norm": 2.594703197479248, - "learning_rate": 3.474578137342909e-06, - "loss": 0.4997410774230957, - "mean_token_accuracy": 0.8302106261253357, - "num_tokens": 14251210.0, - "step": 1596 - }, - { - "epoch": 1.2135258358662613, - "grad_norm": 2.517833948135376, - "learning_rate": 3.4726490957964836e-06, - "loss": 0.3630390465259552, - "mean_token_accuracy": 0.8679884672164917, - "num_tokens": 14255893.0, - "step": 1597 - }, - { - "epoch": 1.2142857142857142, - "grad_norm": 1.5177065134048462, - "learning_rate": 3.4707193716074816e-06, - "loss": 0.36218544840812683, - "mean_token_accuracy": 0.879178524017334, - "num_tokens": 14268143.0, - "step": 1598 - }, - { - "epoch": 1.215045592705167, - "grad_norm": 2.215291738510132, - "learning_rate": 3.4687889661302577e-06, - "loss": 0.4166645407676697, - "mean_token_accuracy": 0.8495793342590332, - "num_tokens": 14276794.0, - "step": 1599 - }, - { - "epoch": 1.21580547112462, - "grad_norm": 1.534294843673706, - "learning_rate": 3.466857880719645e-06, - "loss": 0.2635883092880249, - "mean_token_accuracy": 0.8971712589263916, - "num_tokens": 14287000.0, - "step": 1600 - }, - { - "epoch": 1.216565349544073, - "grad_norm": 1.2338658571243286, - "learning_rate": 3.464926116730953e-06, - "loss": 0.339110404253006, - "mean_token_accuracy": 0.895592987537384, - "num_tokens": 14303217.0, - "step": 1601 - }, - { - "epoch": 1.2173252279635258, - "grad_norm": 1.8717178106307983, - "learning_rate": 3.462993675519968e-06, - "loss": 0.41204726696014404, - "mean_token_accuracy": 0.8560728430747986, - "num_tokens": 14311372.0, - "step": 1602 - }, - { - "epoch": 1.2180851063829787, - "grad_norm": 2.844160795211792, - "learning_rate": 3.4610605584429526e-06, - "loss": 0.4129520058631897, - "mean_token_accuracy": 0.8555002212524414, - "num_tokens": 14316244.0, - "step": 1603 - }, - { - "epoch": 1.2188449848024316, - "grad_norm": 1.099926471710205, - "learning_rate": 3.4591267668566412e-06, - "loss": 0.35783132910728455, - "mean_token_accuracy": 0.8693175315856934, - "num_tokens": 14338414.0, - "step": 1604 - }, - { - "epoch": 1.2196048632218845, - "grad_norm": 1.6448384523391724, - "learning_rate": 3.457192302118244e-06, - "loss": 0.42060258984565735, - "mean_token_accuracy": 0.8557323217391968, - "num_tokens": 14349143.0, - "step": 1605 - }, - { - "epoch": 1.2203647416413375, - "grad_norm": 2.097529888153076, - "learning_rate": 3.455257165585444e-06, - "loss": 0.5227499008178711, - "mean_token_accuracy": 0.828961968421936, - "num_tokens": 14360032.0, - "step": 1606 - }, - { - "epoch": 1.2211246200607904, - "grad_norm": 1.602988600730896, - "learning_rate": 3.453321358616393e-06, - "loss": 0.3537187874317169, - "mean_token_accuracy": 0.8776708841323853, - "num_tokens": 14370005.0, - "step": 1607 - }, - { - "epoch": 1.221884498480243, - "grad_norm": 2.358971357345581, - "learning_rate": 3.4513848825697145e-06, - "loss": 0.3448919653892517, - "mean_token_accuracy": 0.8887944221496582, - "num_tokens": 14375718.0, - "step": 1608 - }, - { - "epoch": 1.222644376899696, - "grad_norm": 1.72306227684021, - "learning_rate": 3.4494477388045035e-06, - "loss": 0.36985084414482117, - "mean_token_accuracy": 0.859595537185669, - "num_tokens": 14385016.0, - "step": 1609 - }, - { - "epoch": 1.2234042553191489, - "grad_norm": 1.5494085550308228, - "learning_rate": 3.4475099286803204e-06, - "loss": 0.49003708362579346, - "mean_token_accuracy": 0.8701964616775513, - "num_tokens": 14399277.0, - "step": 1610 - }, - { - "epoch": 1.2241641337386018, - "grad_norm": 2.6874046325683594, - "learning_rate": 3.445571453557196e-06, - "loss": 0.3424490690231323, - "mean_token_accuracy": 0.8835943937301636, - "num_tokens": 14404182.0, - "step": 1611 - }, - { - "epoch": 1.2249240121580547, - "grad_norm": 2.2163190841674805, - "learning_rate": 3.443632314795627e-06, - "loss": 0.40944457054138184, - "mean_token_accuracy": 0.8649888038635254, - "num_tokens": 14410158.0, - "step": 1612 - }, - { - "epoch": 1.2256838905775076, - "grad_norm": 2.7961158752441406, - "learning_rate": 3.4416925137565756e-06, - "loss": 0.17890746891498566, - "mean_token_accuracy": 0.9439430832862854, - "num_tokens": 14413285.0, - "step": 1613 - }, - { - "epoch": 1.2264437689969605, - "grad_norm": 1.421451210975647, - "learning_rate": 3.439752051801467e-06, - "loss": 0.33948683738708496, - "mean_token_accuracy": 0.8754585981369019, - "num_tokens": 14424674.0, - "step": 1614 - }, - { - "epoch": 1.2272036474164134, - "grad_norm": 2.105196237564087, - "learning_rate": 3.4378109302921946e-06, - "loss": 0.40009379386901855, - "mean_token_accuracy": 0.8600341081619263, - "num_tokens": 14432400.0, - "step": 1615 - }, - { - "epoch": 1.2279635258358663, - "grad_norm": 2.004122734069824, - "learning_rate": 3.4358691505911105e-06, - "loss": 0.46013444662094116, - "mean_token_accuracy": 0.8400925993919373, - "num_tokens": 14440741.0, - "step": 1616 - }, - { - "epoch": 1.2287234042553192, - "grad_norm": 1.8407535552978516, - "learning_rate": 3.4339267140610317e-06, - "loss": 0.38828906416893005, - "mean_token_accuracy": 0.8582802414894104, - "num_tokens": 14448698.0, - "step": 1617 - }, - { - "epoch": 1.2294832826747721, - "grad_norm": 2.4285924434661865, - "learning_rate": 3.4319836220652334e-06, - "loss": 0.3109283447265625, - "mean_token_accuracy": 0.8888344764709473, - "num_tokens": 14453674.0, - "step": 1618 - }, - { - "epoch": 1.2302431610942248, - "grad_norm": 1.6322550773620605, - "learning_rate": 3.430039875967454e-06, - "loss": 0.5222204327583313, - "mean_token_accuracy": 0.825019121170044, - "num_tokens": 14465736.0, - "step": 1619 - }, - { - "epoch": 1.2310030395136777, - "grad_norm": 2.307573080062866, - "learning_rate": 3.428095477131888e-06, - "loss": 0.29477375745773315, - "mean_token_accuracy": 0.8899064660072327, - "num_tokens": 14471266.0, - "step": 1620 - }, - { - "epoch": 1.2317629179331306, - "grad_norm": 1.8044531345367432, - "learning_rate": 3.4261504269231904e-06, - "loss": 0.4883342981338501, - "mean_token_accuracy": 0.8310165405273438, - "num_tokens": 14481679.0, - "step": 1621 - }, - { - "epoch": 1.2325227963525835, - "grad_norm": 2.7585411071777344, - "learning_rate": 3.4242047267064714e-06, - "loss": 0.45369645953178406, - "mean_token_accuracy": 0.8432134985923767, - "num_tokens": 14487299.0, - "step": 1622 - }, - { - "epoch": 1.2332826747720365, - "grad_norm": 2.687490701675415, - "learning_rate": 3.4222583778472997e-06, - "loss": 0.5627540349960327, - "mean_token_accuracy": 0.8186438083648682, - "num_tokens": 14494254.0, - "step": 1623 - }, - { - "epoch": 1.2340425531914894, - "grad_norm": 2.622443199157715, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.28697147965431213, - "mean_token_accuracy": 0.8861737847328186, - "num_tokens": 14498632.0, - "step": 1624 - }, - { - "epoch": 1.2348024316109423, - "grad_norm": 2.6943359375, - "learning_rate": 3.4183637396661372e-06, - "loss": 0.25273287296295166, - "mean_token_accuracy": 0.9104914665222168, - "num_tokens": 14502797.0, - "step": 1625 - }, - { - "epoch": 1.2355623100303952, - "grad_norm": 2.428189992904663, - "learning_rate": 3.4164154530775552e-06, - "loss": 0.4213451147079468, - "mean_token_accuracy": 0.851524293422699, - "num_tokens": 14508503.0, - "step": 1626 - }, - { - "epoch": 1.236322188449848, - "grad_norm": 2.1722824573516846, - "learning_rate": 3.4144665233133318e-06, - "loss": 0.35238856077194214, - "mean_token_accuracy": 0.8730837106704712, - "num_tokens": 14516126.0, - "step": 1627 - }, - { - "epoch": 1.237082066869301, - "grad_norm": 2.291365146636963, - "learning_rate": 3.4125169517413005e-06, - "loss": 0.43963465094566345, - "mean_token_accuracy": 0.8525444865226746, - "num_tokens": 14522507.0, - "step": 1628 - }, - { - "epoch": 1.237841945288754, - "grad_norm": 1.6181648969650269, - "learning_rate": 3.410566739729746e-06, - "loss": 0.2799680233001709, - "mean_token_accuracy": 0.8915654420852661, - "num_tokens": 14531025.0, - "step": 1629 - }, - { - "epoch": 1.2386018237082066, - "grad_norm": 1.4039218425750732, - "learning_rate": 3.408615888647402e-06, - "loss": 0.29756587743759155, - "mean_token_accuracy": 0.8951715230941772, - "num_tokens": 14543770.0, - "step": 1630 - }, - { - "epoch": 1.2393617021276595, - "grad_norm": 2.148325204849243, - "learning_rate": 3.4066643998634506e-06, - "loss": 0.3983418345451355, - "mean_token_accuracy": 0.8635951280593872, - "num_tokens": 14550896.0, - "step": 1631 - }, - { - "epoch": 1.2401215805471124, - "grad_norm": 1.5225859880447388, - "learning_rate": 3.4047122747475227e-06, - "loss": 0.3247569799423218, - "mean_token_accuracy": 0.8727027177810669, - "num_tokens": 14562181.0, - "step": 1632 - }, - { - "epoch": 1.2408814589665653, - "grad_norm": 3.99835467338562, - "learning_rate": 3.402759514669694e-06, - "loss": 0.4317352771759033, - "mean_token_accuracy": 0.8488142490386963, - "num_tokens": 14565521.0, - "step": 1633 - }, - { - "epoch": 1.2416413373860182, - "grad_norm": 1.7306902408599854, - "learning_rate": 3.4008061210004872e-06, - "loss": 0.389854371547699, - "mean_token_accuracy": 0.8553084135055542, - "num_tokens": 14574633.0, - "step": 1634 - }, - { - "epoch": 1.2424012158054711, - "grad_norm": 2.3614673614501953, - "learning_rate": 3.3988520951108683e-06, - "loss": 0.3150152564048767, - "mean_token_accuracy": 0.8865959644317627, - "num_tokens": 14580240.0, - "step": 1635 - }, - { - "epoch": 1.243161094224924, - "grad_norm": 1.5625747442245483, - "learning_rate": 3.3968974383722497e-06, - "loss": 0.43160033226013184, - "mean_token_accuracy": 0.840155839920044, - "num_tokens": 14594255.0, - "step": 1636 - }, - { - "epoch": 1.243920972644377, - "grad_norm": 1.871620535850525, - "learning_rate": 3.3949421521564825e-06, - "loss": 0.49550193548202515, - "mean_token_accuracy": 0.8315126299858093, - "num_tokens": 14605416.0, - "step": 1637 - }, - { - "epoch": 1.2446808510638299, - "grad_norm": 2.111304759979248, - "learning_rate": 3.392986237835863e-06, - "loss": 0.2794899046421051, - "mean_token_accuracy": 0.9049773216247559, - "num_tokens": 14611711.0, - "step": 1638 - }, - { - "epoch": 1.2454407294832828, - "grad_norm": 3.7479894161224365, - "learning_rate": 3.391029696783127e-06, - "loss": 0.469397634267807, - "mean_token_accuracy": 0.8352956771850586, - "num_tokens": 14615536.0, - "step": 1639 - }, - { - "epoch": 1.2462006079027357, - "grad_norm": 3.277726650238037, - "learning_rate": 3.389072530371451e-06, - "loss": 0.35431790351867676, - "mean_token_accuracy": 0.8822286128997803, - "num_tokens": 14619390.0, - "step": 1640 - }, - { - "epoch": 1.2469604863221884, - "grad_norm": 1.9583072662353516, - "learning_rate": 3.3871147399744482e-06, - "loss": 0.3708694577217102, - "mean_token_accuracy": 0.8720351457595825, - "num_tokens": 14626573.0, - "step": 1641 - }, - { - "epoch": 1.2477203647416413, - "grad_norm": 1.8734042644500732, - "learning_rate": 3.385156326966173e-06, - "loss": 0.48163774609565735, - "mean_token_accuracy": 0.8479621410369873, - "num_tokens": 14636382.0, - "step": 1642 - }, - { - "epoch": 1.2484802431610942, - "grad_norm": 2.0085532665252686, - "learning_rate": 3.383197292721114e-06, - "loss": 0.4893198311328888, - "mean_token_accuracy": 0.838238000869751, - "num_tokens": 14645083.0, - "step": 1643 - }, - { - "epoch": 1.249240121580547, - "grad_norm": 2.0874593257904053, - "learning_rate": 3.3812376386141966e-06, - "loss": 0.4610505700111389, - "mean_token_accuracy": 0.8441368341445923, - "num_tokens": 14654048.0, - "step": 1644 - }, - { - "epoch": 1.25, - "grad_norm": 1.6887420415878296, - "learning_rate": 3.379277366020782e-06, - "loss": 0.3628596067428589, - "mean_token_accuracy": 0.8838590383529663, - "num_tokens": 14662317.0, - "step": 1645 - }, - { - "epoch": 1.250759878419453, - "grad_norm": 2.389002561569214, - "learning_rate": 3.3773164763166653e-06, - "loss": 0.21903495490550995, - "mean_token_accuracy": 0.9249413013458252, - "num_tokens": 14666394.0, - "step": 1646 - }, - { - "epoch": 1.2515197568389058, - "grad_norm": 1.7091087102890015, - "learning_rate": 3.3753549708780736e-06, - "loss": 0.37802332639694214, - "mean_token_accuracy": 0.8644627332687378, - "num_tokens": 14676214.0, - "step": 1647 - }, - { - "epoch": 1.2522796352583587, - "grad_norm": 2.5717999935150146, - "learning_rate": 3.3733928510816677e-06, - "loss": 0.4236462116241455, - "mean_token_accuracy": 0.8519910573959351, - "num_tokens": 14681681.0, - "step": 1648 - }, - { - "epoch": 1.2530395136778116, - "grad_norm": 1.958856463432312, - "learning_rate": 3.3714301183045382e-06, - "loss": 0.3923419415950775, - "mean_token_accuracy": 0.8720202445983887, - "num_tokens": 14690419.0, - "step": 1649 - }, - { - "epoch": 1.2537993920972643, - "grad_norm": 1.5900038480758667, - "learning_rate": 3.369466773924207e-06, - "loss": 0.4182325601577759, - "mean_token_accuracy": 0.8515387177467346, - "num_tokens": 14699790.0, - "step": 1650 - }, - { - "epoch": 1.2545592705167175, - "grad_norm": 1.260547161102295, - "learning_rate": 3.3675028193186243e-06, - "loss": 0.3915718197822571, - "mean_token_accuracy": 0.8536830544471741, - "num_tokens": 14717502.0, - "step": 1651 - }, - { - "epoch": 1.2553191489361701, - "grad_norm": 1.8152283430099487, - "learning_rate": 3.365538255866169e-06, - "loss": 0.424524188041687, - "mean_token_accuracy": 0.8434420824050903, - "num_tokens": 14726591.0, - "step": 1652 - }, - { - "epoch": 1.256079027355623, - "grad_norm": 1.3357285261154175, - "learning_rate": 3.3635730849456484e-06, - "loss": 0.2949739396572113, - "mean_token_accuracy": 0.8868321180343628, - "num_tokens": 14739911.0, - "step": 1653 - }, - { - "epoch": 1.256838905775076, - "grad_norm": 1.1770358085632324, - "learning_rate": 3.3616073079362925e-06, - "loss": 0.29939576983451843, - "mean_token_accuracy": 0.8923654556274414, - "num_tokens": 14755521.0, - "step": 1654 - }, - { - "epoch": 1.2575987841945289, - "grad_norm": 2.059162139892578, - "learning_rate": 3.3596409262177633e-06, - "loss": 0.4562555253505707, - "mean_token_accuracy": 0.8585271239280701, - "num_tokens": 14764173.0, - "step": 1655 - }, - { - "epoch": 1.2583586626139818, - "grad_norm": 1.430752158164978, - "learning_rate": 3.357673941170139e-06, - "loss": 0.35301265120506287, - "mean_token_accuracy": 0.8920517563819885, - "num_tokens": 14775596.0, - "step": 1656 - }, - { - "epoch": 1.2591185410334347, - "grad_norm": 1.6066302061080933, - "learning_rate": 3.3557063541739283e-06, - "loss": 0.41129636764526367, - "mean_token_accuracy": 0.8512256145477295, - "num_tokens": 14786289.0, - "step": 1657 - }, - { - "epoch": 1.2598784194528876, - "grad_norm": 1.5471590757369995, - "learning_rate": 3.353738166610058e-06, - "loss": 0.3935067057609558, - "mean_token_accuracy": 0.8514131903648376, - "num_tokens": 14798672.0, - "step": 1658 - }, - { - "epoch": 1.2606382978723405, - "grad_norm": 1.3455181121826172, - "learning_rate": 3.35176937985988e-06, - "loss": 0.3486790657043457, - "mean_token_accuracy": 0.8644362688064575, - "num_tokens": 14811603.0, - "step": 1659 - }, - { - "epoch": 1.2613981762917934, - "grad_norm": 1.891432762145996, - "learning_rate": 3.349799995305162e-06, - "loss": 0.3325638175010681, - "mean_token_accuracy": 0.8844645023345947, - "num_tokens": 14819256.0, - "step": 1660 - }, - { - "epoch": 1.262158054711246, - "grad_norm": 2.600614309310913, - "learning_rate": 3.3478300143280946e-06, - "loss": 0.30310919880867004, - "mean_token_accuracy": 0.9103429317474365, - "num_tokens": 14823706.0, - "step": 1661 - }, - { - "epoch": 1.2629179331306992, - "grad_norm": 3.8636202812194824, - "learning_rate": 3.3458594383112868e-06, - "loss": 0.28377676010131836, - "mean_token_accuracy": 0.9047091007232666, - "num_tokens": 14826688.0, - "step": 1662 - }, - { - "epoch": 1.263677811550152, - "grad_norm": 2.3100268840789795, - "learning_rate": 3.343888268637765e-06, - "loss": 0.4723394513130188, - "mean_token_accuracy": 0.8306777477264404, - "num_tokens": 14835471.0, - "step": 1663 - }, - { - "epoch": 1.2644376899696048, - "grad_norm": 1.7582160234451294, - "learning_rate": 3.341916506690971e-06, - "loss": 0.48168784379959106, - "mean_token_accuracy": 0.8281306028366089, - "num_tokens": 14846513.0, - "step": 1664 - }, - { - "epoch": 1.2651975683890577, - "grad_norm": 2.166055917739868, - "learning_rate": 3.3399441538547638e-06, - "loss": 0.4626024067401886, - "mean_token_accuracy": 0.8377980589866638, - "num_tokens": 14853408.0, - "step": 1665 - }, - { - "epoch": 1.2659574468085106, - "grad_norm": 2.23038911819458, - "learning_rate": 3.337971211513417e-06, - "loss": 0.38434159755706787, - "mean_token_accuracy": 0.8708412647247314, - "num_tokens": 14859919.0, - "step": 1666 - }, - { - "epoch": 1.2667173252279635, - "grad_norm": 2.092505693435669, - "learning_rate": 3.3359976810516164e-06, - "loss": 0.35072219371795654, - "mean_token_accuracy": 0.8761640191078186, - "num_tokens": 14865624.0, - "step": 1667 - }, - { - "epoch": 1.2674772036474165, - "grad_norm": 1.8255130052566528, - "learning_rate": 3.3340235638544633e-06, - "loss": 0.4404270648956299, - "mean_token_accuracy": 0.836356520652771, - "num_tokens": 14874181.0, - "step": 1668 - }, - { - "epoch": 1.2682370820668694, - "grad_norm": 1.9889036417007446, - "learning_rate": 3.332048861307467e-06, - "loss": 0.4199368357658386, - "mean_token_accuracy": 0.8508217334747314, - "num_tokens": 14882275.0, - "step": 1669 - }, - { - "epoch": 1.2689969604863223, - "grad_norm": 4.050281047821045, - "learning_rate": 3.330073574796551e-06, - "loss": 0.4271625280380249, - "mean_token_accuracy": 0.8471108675003052, - "num_tokens": 14893633.0, - "step": 1670 - }, - { - "epoch": 1.2697568389057752, - "grad_norm": 1.998838186264038, - "learning_rate": 3.328097705708047e-06, - "loss": 0.34743767976760864, - "mean_token_accuracy": 0.8771528005599976, - "num_tokens": 14899859.0, - "step": 1671 - }, - { - "epoch": 1.2705167173252279, - "grad_norm": 1.7989062070846558, - "learning_rate": 3.3261212554286977e-06, - "loss": 0.5267184376716614, - "mean_token_accuracy": 0.8323302268981934, - "num_tokens": 14911131.0, - "step": 1672 - }, - { - "epoch": 1.2712765957446808, - "grad_norm": 1.312070369720459, - "learning_rate": 3.324144225345649e-06, - "loss": 0.4675425887107849, - "mean_token_accuracy": 0.8157106637954712, - "num_tokens": 14928955.0, - "step": 1673 - }, - { - "epoch": 1.2720364741641337, - "grad_norm": 2.0547919273376465, - "learning_rate": 3.3221666168464584e-06, - "loss": 0.33704331517219543, - "mean_token_accuracy": 0.8621441125869751, - "num_tokens": 14935536.0, - "step": 1674 - }, - { - "epoch": 1.2727963525835866, - "grad_norm": 2.810413122177124, - "learning_rate": 3.320188431319088e-06, - "loss": 0.4007563292980194, - "mean_token_accuracy": 0.8649672269821167, - "num_tokens": 14940219.0, - "step": 1675 - }, - { - "epoch": 1.2735562310030395, - "grad_norm": 1.3516674041748047, - "learning_rate": 3.318209670151904e-06, - "loss": 0.3457040786743164, - "mean_token_accuracy": 0.8698287010192871, - "num_tokens": 14952904.0, - "step": 1676 - }, - { - "epoch": 1.2743161094224924, - "grad_norm": 2.440643310546875, - "learning_rate": 3.3162303347336765e-06, - "loss": 0.5195086002349854, - "mean_token_accuracy": 0.8348199129104614, - "num_tokens": 14958623.0, - "step": 1677 - }, - { - "epoch": 1.2750759878419453, - "grad_norm": 1.3264343738555908, - "learning_rate": 3.3142504264535808e-06, - "loss": 0.2990425229072571, - "mean_token_accuracy": 0.8961933851242065, - "num_tokens": 14971494.0, - "step": 1678 - }, - { - "epoch": 1.2758358662613982, - "grad_norm": 1.3106894493103027, - "learning_rate": 3.3122699467011913e-06, - "loss": 0.291853666305542, - "mean_token_accuracy": 0.893449068069458, - "num_tokens": 14985239.0, - "step": 1679 - }, - { - "epoch": 1.2765957446808511, - "grad_norm": 2.5387396812438965, - "learning_rate": 3.3102888968664857e-06, - "loss": 0.4336916208267212, - "mean_token_accuracy": 0.8447890877723694, - "num_tokens": 14991453.0, - "step": 1680 - }, - { - "epoch": 1.2773556231003038, - "grad_norm": 2.7052135467529297, - "learning_rate": 3.308307278339842e-06, - "loss": 0.3279378116130829, - "mean_token_accuracy": 0.8935879468917847, - "num_tokens": 14995428.0, - "step": 1681 - }, - { - "epoch": 1.278115501519757, - "grad_norm": 1.6251261234283447, - "learning_rate": 3.306325092512034e-06, - "loss": 0.32066458463668823, - "mean_token_accuracy": 0.8909799456596375, - "num_tokens": 15004841.0, - "step": 1682 - }, - { - "epoch": 1.2788753799392096, - "grad_norm": 2.3014605045318604, - "learning_rate": 3.3043423407742374e-06, - "loss": 0.3523373603820801, - "mean_token_accuracy": 0.8810735940933228, - "num_tokens": 15010742.0, - "step": 1683 - }, - { - "epoch": 1.2796352583586625, - "grad_norm": 2.9563019275665283, - "learning_rate": 3.3023590245180237e-06, - "loss": 0.39715707302093506, - "mean_token_accuracy": 0.8779881000518799, - "num_tokens": 15015357.0, - "step": 1684 - }, - { - "epoch": 1.2803951367781155, - "grad_norm": 1.5787957906723022, - "learning_rate": 3.300375145135361e-06, - "loss": 0.44630166888237, - "mean_token_accuracy": 0.8400174975395203, - "num_tokens": 15031360.0, - "step": 1685 - }, - { - "epoch": 1.2811550151975684, - "grad_norm": 1.6753438711166382, - "learning_rate": 3.2983907040186112e-06, - "loss": 0.3235800862312317, - "mean_token_accuracy": 0.8938044309616089, - "num_tokens": 15040276.0, - "step": 1686 - }, - { - "epoch": 1.2819148936170213, - "grad_norm": 1.7331148386001587, - "learning_rate": 3.296405702560532e-06, - "loss": 0.39061424136161804, - "mean_token_accuracy": 0.8599754571914673, - "num_tokens": 15049725.0, - "step": 1687 - }, - { - "epoch": 1.2826747720364742, - "grad_norm": 2.2029430866241455, - "learning_rate": 3.294420142154274e-06, - "loss": 0.43598297238349915, - "mean_token_accuracy": 0.8663698434829712, - "num_tokens": 15058182.0, - "step": 1688 - }, - { - "epoch": 1.283434650455927, - "grad_norm": 2.943964958190918, - "learning_rate": 3.29243402419338e-06, - "loss": 0.405210942029953, - "mean_token_accuracy": 0.854996919631958, - "num_tokens": 15062920.0, - "step": 1689 - }, - { - "epoch": 1.28419452887538, - "grad_norm": 1.9343379735946655, - "learning_rate": 3.2904473500717826e-06, - "loss": 0.35011449456214905, - "mean_token_accuracy": 0.8745867013931274, - "num_tokens": 15070298.0, - "step": 1690 - }, - { - "epoch": 1.284954407294833, - "grad_norm": 2.559859037399292, - "learning_rate": 3.2884601211838087e-06, - "loss": 0.38816407322883606, - "mean_token_accuracy": 0.854763388633728, - "num_tokens": 15075667.0, - "step": 1691 - }, - { - "epoch": 1.2857142857142856, - "grad_norm": 1.4357839822769165, - "learning_rate": 3.2864723389241697e-06, - "loss": 0.4512745141983032, - "mean_token_accuracy": 0.8398592472076416, - "num_tokens": 15090291.0, - "step": 1692 - }, - { - "epoch": 1.2864741641337387, - "grad_norm": 1.7643728256225586, - "learning_rate": 3.284484004687969e-06, - "loss": 0.3536742627620697, - "mean_token_accuracy": 0.8726381063461304, - "num_tokens": 15099325.0, - "step": 1693 - }, - { - "epoch": 1.2872340425531914, - "grad_norm": 1.853173017501831, - "learning_rate": 3.2824951198706958e-06, - "loss": 0.36579740047454834, - "mean_token_accuracy": 0.8988048434257507, - "num_tokens": 15107090.0, - "step": 1694 - }, - { - "epoch": 1.2879939209726443, - "grad_norm": 1.6526862382888794, - "learning_rate": 3.280505685868226e-06, - "loss": 0.3853636682033539, - "mean_token_accuracy": 0.8743607997894287, - "num_tokens": 15117818.0, - "step": 1695 - }, - { - "epoch": 1.2887537993920972, - "grad_norm": 2.790398597717285, - "learning_rate": 3.278515704076821e-06, - "loss": 0.2707311511039734, - "mean_token_accuracy": 0.9034668803215027, - "num_tokens": 15121641.0, - "step": 1696 - }, - { - "epoch": 1.2895136778115501, - "grad_norm": 1.69557523727417, - "learning_rate": 3.276525175893126e-06, - "loss": 0.3707970082759857, - "mean_token_accuracy": 0.8617855906486511, - "num_tokens": 15130414.0, - "step": 1697 - }, - { - "epoch": 1.290273556231003, - "grad_norm": 1.1360478401184082, - "learning_rate": 3.274534102714172e-06, - "loss": 0.3368082344532013, - "mean_token_accuracy": 0.8781654834747314, - "num_tokens": 15148307.0, - "step": 1698 - }, - { - "epoch": 1.291033434650456, - "grad_norm": 1.5894653797149658, - "learning_rate": 3.272542485937369e-06, - "loss": 0.3870658278465271, - "mean_token_accuracy": 0.8830926418304443, - "num_tokens": 15161841.0, - "step": 1699 - }, - { - "epoch": 1.2917933130699089, - "grad_norm": 2.3735709190368652, - "learning_rate": 3.270550326960511e-06, - "loss": 0.3873991370201111, - "mean_token_accuracy": 0.8729057908058167, - "num_tokens": 15167733.0, - "step": 1700 - }, - { - "epoch": 1.2925531914893618, - "grad_norm": 1.3739598989486694, - "learning_rate": 3.268557627181772e-06, - "loss": 0.30831626057624817, - "mean_token_accuracy": 0.8695719242095947, - "num_tokens": 15180861.0, - "step": 1701 - }, - { - "epoch": 1.2933130699088147, - "grad_norm": 1.7526969909667969, - "learning_rate": 3.2665643879997054e-06, - "loss": 0.4716024398803711, - "mean_token_accuracy": 0.8303275108337402, - "num_tokens": 15191642.0, - "step": 1702 - }, - { - "epoch": 1.2940729483282674, - "grad_norm": 2.7866084575653076, - "learning_rate": 3.2645706108132426e-06, - "loss": 0.33337634801864624, - "mean_token_accuracy": 0.8790726065635681, - "num_tokens": 15196038.0, - "step": 1703 - }, - { - "epoch": 1.2948328267477205, - "grad_norm": 2.319765090942383, - "learning_rate": 3.2625762970216944e-06, - "loss": 0.3999716639518738, - "mean_token_accuracy": 0.8693568706512451, - "num_tokens": 15202075.0, - "step": 1704 - }, - { - "epoch": 1.2955927051671732, - "grad_norm": 3.18292498588562, - "learning_rate": 3.2605814480247454e-06, - "loss": 0.4579541087150574, - "mean_token_accuracy": 0.8516187071800232, - "num_tokens": 15206886.0, - "step": 1705 - }, - { - "epoch": 1.296352583586626, - "grad_norm": 2.1816933155059814, - "learning_rate": 3.258586065222459e-06, - "loss": 0.5198885202407837, - "mean_token_accuracy": 0.8170592784881592, - "num_tokens": 15214088.0, - "step": 1706 - }, - { - "epoch": 1.297112462006079, - "grad_norm": 1.9076340198516846, - "learning_rate": 3.2565901500152702e-06, - "loss": 0.49752360582351685, - "mean_token_accuracy": 0.8681992292404175, - "num_tokens": 15226046.0, - "step": 1707 - }, - { - "epoch": 1.297872340425532, - "grad_norm": 2.0223331451416016, - "learning_rate": 3.2545937038039904e-06, - "loss": 0.4515793025493622, - "mean_token_accuracy": 0.8429619073867798, - "num_tokens": 15234993.0, - "step": 1708 - }, - { - "epoch": 1.2986322188449848, - "grad_norm": 2.5089669227600098, - "learning_rate": 3.2525967279898017e-06, - "loss": 0.43628376722335815, - "mean_token_accuracy": 0.8493682146072388, - "num_tokens": 15240575.0, - "step": 1709 - }, - { - "epoch": 1.2993920972644377, - "grad_norm": 2.8347091674804688, - "learning_rate": 3.2505992239742582e-06, - "loss": 0.25112441182136536, - "mean_token_accuracy": 0.908825159072876, - "num_tokens": 15244085.0, - "step": 1710 - }, - { - "epoch": 1.3001519756838906, - "grad_norm": 2.3157572746276855, - "learning_rate": 3.2486011931592863e-06, - "loss": 0.482818067073822, - "mean_token_accuracy": 0.8305923938751221, - "num_tokens": 15250377.0, - "step": 1711 - }, - { - "epoch": 1.3009118541033435, - "grad_norm": 3.169052839279175, - "learning_rate": 3.2466026369471804e-06, - "loss": 0.3493242561817169, - "mean_token_accuracy": 0.86913001537323, - "num_tokens": 15255041.0, - "step": 1712 - }, - { - "epoch": 1.3016717325227964, - "grad_norm": 1.4475083351135254, - "learning_rate": 3.2446035567406033e-06, - "loss": 0.4177290201187134, - "mean_token_accuracy": 0.8497589826583862, - "num_tokens": 15266946.0, - "step": 1713 - }, - { - "epoch": 1.3024316109422491, - "grad_norm": 1.6473008394241333, - "learning_rate": 3.2426039539425875e-06, - "loss": 0.5272886753082275, - "mean_token_accuracy": 0.8440133333206177, - "num_tokens": 15279263.0, - "step": 1714 - }, - { - "epoch": 1.3031914893617023, - "grad_norm": 2.3996543884277344, - "learning_rate": 3.240603829956531e-06, - "loss": 0.4272066652774811, - "mean_token_accuracy": 0.8495640754699707, - "num_tokens": 15285213.0, - "step": 1715 - }, - { - "epoch": 1.303951367781155, - "grad_norm": 1.63034987449646, - "learning_rate": 3.238603186186198e-06, - "loss": 0.4034635126590729, - "mean_token_accuracy": 0.8638584613800049, - "num_tokens": 15295974.0, - "step": 1716 - }, - { - "epoch": 1.3047112462006079, - "grad_norm": 2.153608798980713, - "learning_rate": 3.2366020240357166e-06, - "loss": 0.30712565779685974, - "mean_token_accuracy": 0.8863866329193115, - "num_tokens": 15302220.0, - "step": 1717 - }, - { - "epoch": 1.3054711246200608, - "grad_norm": 2.9814558029174805, - "learning_rate": 3.2346003449095803e-06, - "loss": 0.3922840356826782, - "mean_token_accuracy": 0.868030309677124, - "num_tokens": 15306747.0, - "step": 1718 - }, - { - "epoch": 1.3062310030395137, - "grad_norm": 3.3417985439300537, - "learning_rate": 3.2325981502126434e-06, - "loss": 0.30750396847724915, - "mean_token_accuracy": 0.9065356850624084, - "num_tokens": 15310309.0, - "step": 1719 - }, - { - "epoch": 1.3069908814589666, - "grad_norm": 2.237682819366455, - "learning_rate": 3.2305954413501252e-06, - "loss": 0.35068294405937195, - "mean_token_accuracy": 0.8887614011764526, - "num_tokens": 15316463.0, - "step": 1720 - }, - { - "epoch": 1.3077507598784195, - "grad_norm": 1.9526605606079102, - "learning_rate": 3.228592219727602e-06, - "loss": 0.42061835527420044, - "mean_token_accuracy": 0.8456839323043823, - "num_tokens": 15323984.0, - "step": 1721 - }, - { - "epoch": 1.3085106382978724, - "grad_norm": 1.6454212665557861, - "learning_rate": 3.226588486751012e-06, - "loss": 0.5189976692199707, - "mean_token_accuracy": 0.8187375068664551, - "num_tokens": 15338807.0, - "step": 1722 - }, - { - "epoch": 1.3092705167173253, - "grad_norm": 1.4521609544754028, - "learning_rate": 3.2245842438266526e-06, - "loss": 0.329673171043396, - "mean_token_accuracy": 0.853867769241333, - "num_tokens": 15350400.0, - "step": 1723 - }, - { - "epoch": 1.3100303951367782, - "grad_norm": 1.8750989437103271, - "learning_rate": 3.222579492361179e-06, - "loss": 0.4635341167449951, - "mean_token_accuracy": 0.8393422365188599, - "num_tokens": 15360557.0, - "step": 1724 - }, - { - "epoch": 1.310790273556231, - "grad_norm": 1.2728849649429321, - "learning_rate": 3.220574233761603e-06, - "loss": 0.3255572021007538, - "mean_token_accuracy": 0.8989741802215576, - "num_tokens": 15376548.0, - "step": 1725 - }, - { - "epoch": 1.3115501519756838, - "grad_norm": 3.5155694484710693, - "learning_rate": 3.2185684694352913e-06, - "loss": 0.34204089641571045, - "mean_token_accuracy": 0.8781906366348267, - "num_tokens": 15380304.0, - "step": 1726 - }, - { - "epoch": 1.3123100303951367, - "grad_norm": 2.059800148010254, - "learning_rate": 3.216562200789968e-06, - "loss": 0.36288338899612427, - "mean_token_accuracy": 0.8595278263092041, - "num_tokens": 15387653.0, - "step": 1727 - }, - { - "epoch": 1.3130699088145896, - "grad_norm": 3.5388240814208984, - "learning_rate": 3.214555429233707e-06, - "loss": 0.5434849858283997, - "mean_token_accuracy": 0.8074631690979004, - "num_tokens": 15391662.0, - "step": 1728 - }, - { - "epoch": 1.3138297872340425, - "grad_norm": 2.8595592975616455, - "learning_rate": 3.2125481561749406e-06, - "loss": 0.5113687515258789, - "mean_token_accuracy": 0.8448649644851685, - "num_tokens": 15397536.0, - "step": 1729 - }, - { - "epoch": 1.3145896656534954, - "grad_norm": 2.50386905670166, - "learning_rate": 3.210540383022449e-06, - "loss": 0.5293697118759155, - "mean_token_accuracy": 0.8096445798873901, - "num_tokens": 15403478.0, - "step": 1730 - }, - { - "epoch": 1.3153495440729484, - "grad_norm": 1.880035400390625, - "learning_rate": 3.208532111185365e-06, - "loss": 0.5344835519790649, - "mean_token_accuracy": 0.8172965049743652, - "num_tokens": 15413812.0, - "step": 1731 - }, - { - "epoch": 1.3161094224924013, - "grad_norm": 1.3688768148422241, - "learning_rate": 3.2065233420731717e-06, - "loss": 0.2577427327632904, - "mean_token_accuracy": 0.9142681360244751, - "num_tokens": 15423583.0, - "step": 1732 - }, - { - "epoch": 1.3168693009118542, - "grad_norm": 1.7945705652236938, - "learning_rate": 3.2045140770956987e-06, - "loss": 0.3983926773071289, - "mean_token_accuracy": 0.8652000427246094, - "num_tokens": 15432473.0, - "step": 1733 - }, - { - "epoch": 1.3176291793313069, - "grad_norm": 1.8243350982666016, - "learning_rate": 3.2025043176631283e-06, - "loss": 0.48644185066223145, - "mean_token_accuracy": 0.8319193124771118, - "num_tokens": 15445463.0, - "step": 1734 - }, - { - "epoch": 1.31838905775076, - "grad_norm": 2.000094175338745, - "learning_rate": 3.2004940651859844e-06, - "loss": 0.43567317724227905, - "mean_token_accuracy": 0.8857482671737671, - "num_tokens": 15452382.0, - "step": 1735 - }, - { - "epoch": 1.3191489361702127, - "grad_norm": 2.379974365234375, - "learning_rate": 3.198483321075141e-06, - "loss": 0.5153506398200989, - "mean_token_accuracy": 0.8295865654945374, - "num_tokens": 15458740.0, - "step": 1736 - }, - { - "epoch": 1.3199088145896656, - "grad_norm": 1.6564184427261353, - "learning_rate": 3.196472086741815e-06, - "loss": 0.508430540561676, - "mean_token_accuracy": 0.8181540369987488, - "num_tokens": 15471844.0, - "step": 1737 - }, - { - "epoch": 1.3206686930091185, - "grad_norm": 2.006925344467163, - "learning_rate": 3.194460363597569e-06, - "loss": 0.34542378783226013, - "mean_token_accuracy": 0.8827437162399292, - "num_tokens": 15478414.0, - "step": 1738 - }, - { - "epoch": 1.3214285714285714, - "grad_norm": 3.589045763015747, - "learning_rate": 3.192448153054306e-06, - "loss": 0.4385780096054077, - "mean_token_accuracy": 0.8480287790298462, - "num_tokens": 15482063.0, - "step": 1739 - }, - { - "epoch": 1.3221884498480243, - "grad_norm": 1.9797427654266357, - "learning_rate": 3.190435456524275e-06, - "loss": 0.4330386519432068, - "mean_token_accuracy": 0.8458058834075928, - "num_tokens": 15489803.0, - "step": 1740 - }, - { - "epoch": 1.3229483282674772, - "grad_norm": 1.4777411222457886, - "learning_rate": 3.188422275420063e-06, - "loss": 0.3997895419597626, - "mean_token_accuracy": 0.8639512062072754, - "num_tokens": 15501103.0, - "step": 1741 - }, - { - "epoch": 1.3237082066869301, - "grad_norm": 2.882338523864746, - "learning_rate": 3.186408611154597e-06, - "loss": 0.2336438149213791, - "mean_token_accuracy": 0.9176726937294006, - "num_tokens": 15504854.0, - "step": 1742 - }, - { - "epoch": 1.324468085106383, - "grad_norm": 2.353503704071045, - "learning_rate": 3.184394465141146e-06, - "loss": 0.4107069671154022, - "mean_token_accuracy": 0.8677014112472534, - "num_tokens": 15510662.0, - "step": 1743 - }, - { - "epoch": 1.325227963525836, - "grad_norm": 2.6551976203918457, - "learning_rate": 3.1823798387933134e-06, - "loss": 0.3862302899360657, - "mean_token_accuracy": 0.8819445371627808, - "num_tokens": 15515681.0, - "step": 1744 - }, - { - "epoch": 1.3259878419452886, - "grad_norm": 1.478572964668274, - "learning_rate": 3.180364733525043e-06, - "loss": 0.43972986936569214, - "mean_token_accuracy": 0.832388162612915, - "num_tokens": 15529542.0, - "step": 1745 - }, - { - "epoch": 1.3267477203647418, - "grad_norm": 1.6003550291061401, - "learning_rate": 3.178349150750612e-06, - "loss": 0.3404902219772339, - "mean_token_accuracy": 0.8764007091522217, - "num_tokens": 15538865.0, - "step": 1746 - }, - { - "epoch": 1.3275075987841944, - "grad_norm": 2.130689859390259, - "learning_rate": 3.1763330918846347e-06, - "loss": 0.383136510848999, - "mean_token_accuracy": 0.8652247190475464, - "num_tokens": 15545567.0, - "step": 1747 - }, - { - "epoch": 1.3282674772036474, - "grad_norm": 2.395937442779541, - "learning_rate": 3.1743165583420586e-06, - "loss": 0.3870319128036499, - "mean_token_accuracy": 0.8618065118789673, - "num_tokens": 15551090.0, - "step": 1748 - }, - { - "epoch": 1.3290273556231003, - "grad_norm": 2.0841057300567627, - "learning_rate": 3.1722995515381644e-06, - "loss": 0.4838739335536957, - "mean_token_accuracy": 0.8548711538314819, - "num_tokens": 15558913.0, - "step": 1749 - }, - { - "epoch": 1.3297872340425532, - "grad_norm": 1.4237847328186035, - "learning_rate": 3.1702820728885657e-06, - "loss": 0.40350261330604553, - "mean_token_accuracy": 0.858984649181366, - "num_tokens": 15572045.0, - "step": 1750 - }, - { - "epoch": 1.330547112462006, - "grad_norm": 2.2641282081604004, - "learning_rate": 3.1682641238092064e-06, - "loss": 0.5117636919021606, - "mean_token_accuracy": 0.8078924417495728, - "num_tokens": 15579753.0, - "step": 1751 - }, - { - "epoch": 1.331306990881459, - "grad_norm": 1.0010309219360352, - "learning_rate": 3.1662457057163603e-06, - "loss": 0.3220978379249573, - "mean_token_accuracy": 0.8786559104919434, - "num_tokens": 15602823.0, - "step": 1752 - }, - { - "epoch": 1.332066869300912, - "grad_norm": 2.441230535507202, - "learning_rate": 3.164226820026632e-06, - "loss": 0.37529727816581726, - "mean_token_accuracy": 0.8886898756027222, - "num_tokens": 15608473.0, - "step": 1753 - }, - { - "epoch": 1.3328267477203648, - "grad_norm": 1.2960991859436035, - "learning_rate": 3.162207468156952e-06, - "loss": 0.3393767476081848, - "mean_token_accuracy": 0.8766993284225464, - "num_tokens": 15620893.0, - "step": 1754 - }, - { - "epoch": 1.3335866261398177, - "grad_norm": 2.0806996822357178, - "learning_rate": 3.16018765152458e-06, - "loss": 0.38034507632255554, - "mean_token_accuracy": 0.8854838609695435, - "num_tokens": 15627068.0, - "step": 1755 - }, - { - "epoch": 1.3343465045592704, - "grad_norm": 1.4316699504852295, - "learning_rate": 3.1581673715471007e-06, - "loss": 0.3665890693664551, - "mean_token_accuracy": 0.870919406414032, - "num_tokens": 15641070.0, - "step": 1756 - }, - { - "epoch": 1.3351063829787235, - "grad_norm": 1.3466622829437256, - "learning_rate": 3.1561466296424247e-06, - "loss": 0.37387198209762573, - "mean_token_accuracy": 0.8633951544761658, - "num_tokens": 15653777.0, - "step": 1757 - }, - { - "epoch": 1.3358662613981762, - "grad_norm": 1.8108628988265991, - "learning_rate": 3.154125427228786e-06, - "loss": 0.38428938388824463, - "mean_token_accuracy": 0.85402512550354, - "num_tokens": 15662494.0, - "step": 1758 - }, - { - "epoch": 1.3366261398176291, - "grad_norm": 1.3221700191497803, - "learning_rate": 3.152103765724743e-06, - "loss": 0.42825520038604736, - "mean_token_accuracy": 0.8435465097427368, - "num_tokens": 15677552.0, - "step": 1759 - }, - { - "epoch": 1.337386018237082, - "grad_norm": 2.6247692108154297, - "learning_rate": 3.150081646549174e-06, - "loss": 0.36186715960502625, - "mean_token_accuracy": 0.8767328262329102, - "num_tokens": 15682103.0, - "step": 1760 - }, - { - "epoch": 1.338145896656535, - "grad_norm": 2.1469814777374268, - "learning_rate": 3.1480590711212823e-06, - "loss": 0.3734385669231415, - "mean_token_accuracy": 0.8711104393005371, - "num_tokens": 15689182.0, - "step": 1761 - }, - { - "epoch": 1.3389057750759878, - "grad_norm": 2.1702585220336914, - "learning_rate": 3.1460360408605866e-06, - "loss": 0.2795315086841583, - "mean_token_accuracy": 0.8892190456390381, - "num_tokens": 15694272.0, - "step": 1762 - }, - { - "epoch": 1.3396656534954408, - "grad_norm": 1.918797254562378, - "learning_rate": 3.144012557186931e-06, - "loss": 0.4363473057746887, - "mean_token_accuracy": 0.8573931455612183, - "num_tokens": 15703532.0, - "step": 1763 - }, - { - "epoch": 1.3404255319148937, - "grad_norm": 2.5579960346221924, - "learning_rate": 3.14198862152047e-06, - "loss": 0.406247079372406, - "mean_token_accuracy": 0.8617593050003052, - "num_tokens": 15708652.0, - "step": 1764 - }, - { - "epoch": 1.3411854103343466, - "grad_norm": 2.3617870807647705, - "learning_rate": 3.1399642352816825e-06, - "loss": 0.2839522659778595, - "mean_token_accuracy": 0.8996064066886902, - "num_tokens": 15713598.0, - "step": 1765 - }, - { - "epoch": 1.3419452887537995, - "grad_norm": 1.248302936553955, - "learning_rate": 3.1379393998913594e-06, - "loss": 0.2922290861606598, - "mean_token_accuracy": 0.8948773145675659, - "num_tokens": 15726693.0, - "step": 1766 - }, - { - "epoch": 1.3427051671732522, - "grad_norm": 2.143599510192871, - "learning_rate": 3.135914116770609e-06, - "loss": 0.32176223397254944, - "mean_token_accuracy": 0.8808754682540894, - "num_tokens": 15731901.0, - "step": 1767 - }, - { - "epoch": 1.3434650455927053, - "grad_norm": 4.226369857788086, - "learning_rate": 3.1338883873408517e-06, - "loss": 0.4682556390762329, - "mean_token_accuracy": 0.8566025495529175, - "num_tokens": 15735029.0, - "step": 1768 - }, - { - "epoch": 1.344224924012158, - "grad_norm": 1.8695988655090332, - "learning_rate": 3.1318622130238237e-06, - "loss": 0.4297192394733429, - "mean_token_accuracy": 0.8419148921966553, - "num_tokens": 15744310.0, - "step": 1769 - }, - { - "epoch": 1.344984802431611, - "grad_norm": 2.4321305751800537, - "learning_rate": 3.1298355952415714e-06, - "loss": 0.36076444387435913, - "mean_token_accuracy": 0.8826035261154175, - "num_tokens": 15749337.0, - "step": 1770 - }, - { - "epoch": 1.3457446808510638, - "grad_norm": 1.5500011444091797, - "learning_rate": 3.127808535416454e-06, - "loss": 0.48664039373397827, - "mean_token_accuracy": 0.844344437122345, - "num_tokens": 15761096.0, - "step": 1771 - }, - { - "epoch": 1.3465045592705167, - "grad_norm": 2.1498289108276367, - "learning_rate": 3.1257810349711388e-06, - "loss": 0.4841752052307129, - "mean_token_accuracy": 0.8324567079544067, - "num_tokens": 15768646.0, - "step": 1772 - }, - { - "epoch": 1.3472644376899696, - "grad_norm": 1.2995187044143677, - "learning_rate": 3.1237530953286046e-06, - "loss": 0.492019385099411, - "mean_token_accuracy": 0.8285316228866577, - "num_tokens": 15788401.0, - "step": 1773 - }, - { - "epoch": 1.3480243161094225, - "grad_norm": 2.324819803237915, - "learning_rate": 3.121724717912138e-06, - "loss": 0.33166298270225525, - "mean_token_accuracy": 0.8856451511383057, - "num_tokens": 15794097.0, - "step": 1774 - }, - { - "epoch": 1.3487841945288754, - "grad_norm": 1.9611430168151855, - "learning_rate": 3.11969590414533e-06, - "loss": 0.3974284827709198, - "mean_token_accuracy": 0.8751305937767029, - "num_tokens": 15801065.0, - "step": 1775 - }, - { - "epoch": 1.3495440729483283, - "grad_norm": 1.7084417343139648, - "learning_rate": 3.1176666554520827e-06, - "loss": 0.38729435205459595, - "mean_token_accuracy": 0.8680770397186279, - "num_tokens": 15810353.0, - "step": 1776 - }, - { - "epoch": 1.3503039513677813, - "grad_norm": 1.7616240978240967, - "learning_rate": 3.1156369732566006e-06, - "loss": 0.4271578788757324, - "mean_token_accuracy": 0.843730092048645, - "num_tokens": 15821889.0, - "step": 1777 - }, - { - "epoch": 1.351063829787234, - "grad_norm": 2.030747413635254, - "learning_rate": 3.113606858983391e-06, - "loss": 0.361891508102417, - "mean_token_accuracy": 0.8522407412528992, - "num_tokens": 15830800.0, - "step": 1778 - }, - { - "epoch": 1.3518237082066868, - "grad_norm": 1.4842649698257446, - "learning_rate": 3.1115763140572686e-06, - "loss": 0.466334730386734, - "mean_token_accuracy": 0.8433995246887207, - "num_tokens": 15849422.0, - "step": 1779 - }, - { - "epoch": 1.3525835866261398, - "grad_norm": 1.6595379114151, - "learning_rate": 3.109545339903347e-06, - "loss": 0.4622533321380615, - "mean_token_accuracy": 0.8526314496994019, - "num_tokens": 15860431.0, - "step": 1780 - }, - { - "epoch": 1.3533434650455927, - "grad_norm": 2.1235809326171875, - "learning_rate": 3.107513937947041e-06, - "loss": 0.42694270610809326, - "mean_token_accuracy": 0.854864239692688, - "num_tokens": 15869044.0, - "step": 1781 - }, - { - "epoch": 1.3541033434650456, - "grad_norm": 1.5889263153076172, - "learning_rate": 3.1054821096140675e-06, - "loss": 0.41838499903678894, - "mean_token_accuracy": 0.8671513795852661, - "num_tokens": 15878598.0, - "step": 1782 - }, - { - "epoch": 1.3548632218844985, - "grad_norm": 2.2261741161346436, - "learning_rate": 3.1034498563304435e-06, - "loss": 0.4045066237449646, - "mean_token_accuracy": 0.843826949596405, - "num_tokens": 15885167.0, - "step": 1783 - }, - { - "epoch": 1.3556231003039514, - "grad_norm": 2.2569329738616943, - "learning_rate": 3.1014171795224794e-06, - "loss": 0.36677104234695435, - "mean_token_accuracy": 0.8747833967208862, - "num_tokens": 15891308.0, - "step": 1784 - }, - { - "epoch": 1.3563829787234043, - "grad_norm": 2.1027088165283203, - "learning_rate": 3.0993840806167884e-06, - "loss": 0.437946081161499, - "mean_token_accuracy": 0.8370785117149353, - "num_tokens": 15898952.0, - "step": 1785 - }, - { - "epoch": 1.3571428571428572, - "grad_norm": 1.8768929243087769, - "learning_rate": 3.0973505610402767e-06, - "loss": 0.4201734662055969, - "mean_token_accuracy": 0.8474810123443604, - "num_tokens": 15907340.0, - "step": 1786 - }, - { - "epoch": 1.35790273556231, - "grad_norm": 1.7216229438781738, - "learning_rate": 3.0953166222201474e-06, - "loss": 0.4225231409072876, - "mean_token_accuracy": 0.8437749147415161, - "num_tokens": 15917852.0, - "step": 1787 - }, - { - "epoch": 1.358662613981763, - "grad_norm": 2.6256966590881348, - "learning_rate": 3.093282265583895e-06, - "loss": 0.435439795255661, - "mean_token_accuracy": 0.8452040553092957, - "num_tokens": 15923739.0, - "step": 1788 - }, - { - "epoch": 1.3594224924012157, - "grad_norm": 2.90028977394104, - "learning_rate": 3.0912474925593124e-06, - "loss": 0.3730456829071045, - "mean_token_accuracy": 0.8766646385192871, - "num_tokens": 15927943.0, - "step": 1789 - }, - { - "epoch": 1.3601823708206686, - "grad_norm": 1.5966626405715942, - "learning_rate": 3.0892123045744787e-06, - "loss": 0.42150455713272095, - "mean_token_accuracy": 0.854656457901001, - "num_tokens": 15939922.0, - "step": 1790 - }, - { - "epoch": 1.3609422492401215, - "grad_norm": 1.8069748878479004, - "learning_rate": 3.0871767030577686e-06, - "loss": 0.4954872131347656, - "mean_token_accuracy": 0.8289790153503418, - "num_tokens": 15950095.0, - "step": 1791 - }, - { - "epoch": 1.3617021276595744, - "grad_norm": 2.0855250358581543, - "learning_rate": 3.085140689437846e-06, - "loss": 0.41999945044517517, - "mean_token_accuracy": 0.8517382144927979, - "num_tokens": 15957972.0, - "step": 1792 - }, - { - "epoch": 1.3624620060790273, - "grad_norm": 2.108659267425537, - "learning_rate": 3.0831042651436634e-06, - "loss": 0.3668023645877838, - "mean_token_accuracy": 0.8710855841636658, - "num_tokens": 15965614.0, - "step": 1793 - }, - { - "epoch": 1.3632218844984803, - "grad_norm": 1.3799632787704468, - "learning_rate": 3.0810674316044602e-06, - "loss": 0.351409375667572, - "mean_token_accuracy": 0.870837390422821, - "num_tokens": 15978854.0, - "step": 1794 - }, - { - "epoch": 1.3639817629179332, - "grad_norm": 1.540397047996521, - "learning_rate": 3.0790301902497664e-06, - "loss": 0.403600811958313, - "mean_token_accuracy": 0.8485002517700195, - "num_tokens": 15993324.0, - "step": 1795 - }, - { - "epoch": 1.364741641337386, - "grad_norm": 1.946882963180542, - "learning_rate": 3.076992542509396e-06, - "loss": 0.40118327736854553, - "mean_token_accuracy": 0.8607497811317444, - "num_tokens": 16001937.0, - "step": 1796 - }, - { - "epoch": 1.365501519756839, - "grad_norm": 2.0464305877685547, - "learning_rate": 3.0749544898134487e-06, - "loss": 0.31742292642593384, - "mean_token_accuracy": 0.8878391981124878, - "num_tokens": 16009277.0, - "step": 1797 - }, - { - "epoch": 1.3662613981762917, - "grad_norm": 2.091754913330078, - "learning_rate": 3.072916033592307e-06, - "loss": 0.31580421328544617, - "mean_token_accuracy": 0.8875244855880737, - "num_tokens": 16015756.0, - "step": 1798 - }, - { - "epoch": 1.3670212765957448, - "grad_norm": 3.4449212551116943, - "learning_rate": 3.0708771752766397e-06, - "loss": 0.4692591726779938, - "mean_token_accuracy": 0.8456202149391174, - "num_tokens": 16019912.0, - "step": 1799 - }, - { - "epoch": 1.3677811550151975, - "grad_norm": 1.600419521331787, - "learning_rate": 3.068837916297396e-06, - "loss": 0.40389442443847656, - "mean_token_accuracy": 0.8378961086273193, - "num_tokens": 16032637.0, - "step": 1800 - }, - { - "epoch": 1.3685410334346504, - "grad_norm": 1.5282686948776245, - "learning_rate": 3.0667982580858047e-06, - "loss": 0.379841685295105, - "mean_token_accuracy": 0.8752143383026123, - "num_tokens": 16045205.0, - "step": 1801 - }, - { - "epoch": 1.3693009118541033, - "grad_norm": 2.486079454421997, - "learning_rate": 3.0647582020733773e-06, - "loss": 0.41060030460357666, - "mean_token_accuracy": 0.8575131893157959, - "num_tokens": 16051189.0, - "step": 1802 - }, - { - "epoch": 1.3700607902735562, - "grad_norm": 1.9458621740341187, - "learning_rate": 3.062717749691904e-06, - "loss": 0.4442213773727417, - "mean_token_accuracy": 0.8451495170593262, - "num_tokens": 16059700.0, - "step": 1803 - }, - { - "epoch": 1.3708206686930091, - "grad_norm": 1.4333001375198364, - "learning_rate": 3.0606769023734535e-06, - "loss": 0.39132001996040344, - "mean_token_accuracy": 0.8609901666641235, - "num_tokens": 16072458.0, - "step": 1804 - }, - { - "epoch": 1.371580547112462, - "grad_norm": 1.490355372428894, - "learning_rate": 3.0586356615503693e-06, - "loss": 0.4108564257621765, - "mean_token_accuracy": 0.8871046304702759, - "num_tokens": 16083142.0, - "step": 1805 - }, - { - "epoch": 1.372340425531915, - "grad_norm": 1.7765129804611206, - "learning_rate": 3.056594028655274e-06, - "loss": 0.3850266635417938, - "mean_token_accuracy": 0.8923365473747253, - "num_tokens": 16092519.0, - "step": 1806 - }, - { - "epoch": 1.3731003039513678, - "grad_norm": 1.955661416053772, - "learning_rate": 3.0545520051210637e-06, - "loss": 0.4665378928184509, - "mean_token_accuracy": 0.837419867515564, - "num_tokens": 16100618.0, - "step": 1807 - }, - { - "epoch": 1.3738601823708207, - "grad_norm": 3.259265422821045, - "learning_rate": 3.052509592380909e-06, - "loss": 0.24722981452941895, - "mean_token_accuracy": 0.9106054306030273, - "num_tokens": 16103836.0, - "step": 1808 - }, - { - "epoch": 1.3746200607902734, - "grad_norm": 1.7995736598968506, - "learning_rate": 3.050466791868254e-06, - "loss": 0.4982220530509949, - "mean_token_accuracy": 0.8298169374465942, - "num_tokens": 16114727.0, - "step": 1809 - }, - { - "epoch": 1.3753799392097266, - "grad_norm": 1.9643093347549438, - "learning_rate": 3.048423605016815e-06, - "loss": 0.5076829195022583, - "mean_token_accuracy": 0.8303098678588867, - "num_tokens": 16129491.0, - "step": 1810 - }, - { - "epoch": 1.3761398176291793, - "grad_norm": 3.505594491958618, - "learning_rate": 3.0463800332605787e-06, - "loss": 0.27466052770614624, - "mean_token_accuracy": 0.9018045663833618, - "num_tokens": 16132640.0, - "step": 1811 - }, - { - "epoch": 1.3768996960486322, - "grad_norm": 1.798437237739563, - "learning_rate": 3.0443360780338034e-06, - "loss": 0.4004853069782257, - "mean_token_accuracy": 0.8569544553756714, - "num_tokens": 16143317.0, - "step": 1812 - }, - { - "epoch": 1.377659574468085, - "grad_norm": 2.276740789413452, - "learning_rate": 3.042291740771014e-06, - "loss": 0.3823797106742859, - "mean_token_accuracy": 0.8764113783836365, - "num_tokens": 16148898.0, - "step": 1813 - }, - { - "epoch": 1.378419452887538, - "grad_norm": 2.5051357746124268, - "learning_rate": 3.0402470229070057e-06, - "loss": 0.40365856885910034, - "mean_token_accuracy": 0.8809891939163208, - "num_tokens": 16153815.0, - "step": 1814 - }, - { - "epoch": 1.3791793313069909, - "grad_norm": 1.2379236221313477, - "learning_rate": 3.03820192587684e-06, - "loss": 0.3955119848251343, - "mean_token_accuracy": 0.8536627292633057, - "num_tokens": 16167783.0, - "step": 1815 - }, - { - "epoch": 1.3799392097264438, - "grad_norm": 2.2286343574523926, - "learning_rate": 3.036156451115846e-06, - "loss": 0.39647501707077026, - "mean_token_accuracy": 0.8621993064880371, - "num_tokens": 16174707.0, - "step": 1816 - }, - { - "epoch": 1.3806990881458967, - "grad_norm": 1.884639024734497, - "learning_rate": 3.034110600059616e-06, - "loss": 0.31612110137939453, - "mean_token_accuracy": 0.8942475318908691, - "num_tokens": 16181919.0, - "step": 1817 - }, - { - "epoch": 1.3814589665653496, - "grad_norm": 1.891312599182129, - "learning_rate": 3.0320643741440052e-06, - "loss": 0.46209126710891724, - "mean_token_accuracy": 0.8374713659286499, - "num_tokens": 16189276.0, - "step": 1818 - }, - { - "epoch": 1.3822188449848025, - "grad_norm": 2.507478713989258, - "learning_rate": 3.0300177748051375e-06, - "loss": 0.37601593136787415, - "mean_token_accuracy": 0.8633589148521423, - "num_tokens": 16194346.0, - "step": 1819 - }, - { - "epoch": 1.3829787234042552, - "grad_norm": 1.5046696662902832, - "learning_rate": 3.0279708034793907e-06, - "loss": 0.3284982144832611, - "mean_token_accuracy": 0.8792630434036255, - "num_tokens": 16205457.0, - "step": 1820 - }, - { - "epoch": 1.3837386018237083, - "grad_norm": 2.4244449138641357, - "learning_rate": 3.025923461603412e-06, - "loss": 0.40939009189605713, - "mean_token_accuracy": 0.8596426248550415, - "num_tokens": 16211866.0, - "step": 1821 - }, - { - "epoch": 1.384498480243161, - "grad_norm": 2.8656933307647705, - "learning_rate": 3.0238757506141013e-06, - "loss": 0.4397110044956207, - "mean_token_accuracy": 0.8597331047058105, - "num_tokens": 16216607.0, - "step": 1822 - }, - { - "epoch": 1.385258358662614, - "grad_norm": 2.0718610286712646, - "learning_rate": 3.0218276719486245e-06, - "loss": 0.49057573080062866, - "mean_token_accuracy": 0.8325331211090088, - "num_tokens": 16224014.0, - "step": 1823 - }, - { - "epoch": 1.3860182370820668, - "grad_norm": 1.054450273513794, - "learning_rate": 3.019779227044398e-06, - "loss": 0.3758106827735901, - "mean_token_accuracy": 0.8689473867416382, - "num_tokens": 16248627.0, - "step": 1824 - }, - { - "epoch": 1.3867781155015197, - "grad_norm": 2.1115148067474365, - "learning_rate": 3.0177304173391038e-06, - "loss": 0.502967119216919, - "mean_token_accuracy": 0.823198676109314, - "num_tokens": 16256255.0, - "step": 1825 - }, - { - "epoch": 1.3875379939209727, - "grad_norm": 2.207277297973633, - "learning_rate": 3.015681244270672e-06, - "loss": 0.3458971083164215, - "mean_token_accuracy": 0.8930196762084961, - "num_tokens": 16261823.0, - "step": 1826 - }, - { - "epoch": 1.3882978723404256, - "grad_norm": 1.289669156074524, - "learning_rate": 3.0136317092772923e-06, - "loss": 0.4422765374183655, - "mean_token_accuracy": 0.8358346819877625, - "num_tokens": 16280659.0, - "step": 1827 - }, - { - "epoch": 1.3890577507598785, - "grad_norm": 2.233865737915039, - "learning_rate": 3.0115818137974066e-06, - "loss": 0.3643006384372711, - "mean_token_accuracy": 0.8682862520217896, - "num_tokens": 16286356.0, - "step": 1828 - }, - { - "epoch": 1.3898176291793314, - "grad_norm": 1.0950042009353638, - "learning_rate": 3.0095315592697126e-06, - "loss": 0.34712421894073486, - "mean_token_accuracy": 0.8578766584396362, - "num_tokens": 16307298.0, - "step": 1829 - }, - { - "epoch": 1.3905775075987843, - "grad_norm": 1.1708037853240967, - "learning_rate": 3.007480947133155e-06, - "loss": 0.33152541518211365, - "mean_token_accuracy": 0.894973874092102, - "num_tokens": 16323232.0, - "step": 1830 - }, - { - "epoch": 1.391337386018237, - "grad_norm": 1.2226970195770264, - "learning_rate": 3.0054299788269343e-06, - "loss": 0.3915635943412781, - "mean_token_accuracy": 0.8575779795646667, - "num_tokens": 16339273.0, - "step": 1831 - }, - { - "epoch": 1.39209726443769, - "grad_norm": 1.2226042747497559, - "learning_rate": 3.0033786557904982e-06, - "loss": 0.45846253633499146, - "mean_token_accuracy": 0.8290432691574097, - "num_tokens": 16360145.0, - "step": 1832 - }, - { - "epoch": 1.3928571428571428, - "grad_norm": 2.0117406845092773, - "learning_rate": 3.001326979463545e-06, - "loss": 0.3837882876396179, - "mean_token_accuracy": 0.8941739797592163, - "num_tokens": 16366602.0, - "step": 1833 - }, - { - "epoch": 1.3936170212765957, - "grad_norm": 1.8419997692108154, - "learning_rate": 2.9992749512860177e-06, - "loss": 0.40777021646499634, - "mean_token_accuracy": 0.854655385017395, - "num_tokens": 16375611.0, - "step": 1834 - }, - { - "epoch": 1.3943768996960486, - "grad_norm": 1.9405122995376587, - "learning_rate": 2.9972225726981114e-06, - "loss": 0.46685922145843506, - "mean_token_accuracy": 0.8493201732635498, - "num_tokens": 16384878.0, - "step": 1835 - }, - { - "epoch": 1.3951367781155015, - "grad_norm": 1.2425674200057983, - "learning_rate": 2.995169845140264e-06, - "loss": 0.394692063331604, - "mean_token_accuracy": 0.851348876953125, - "num_tokens": 16404452.0, - "step": 1836 - }, - { - "epoch": 1.3958966565349544, - "grad_norm": 1.2215365171432495, - "learning_rate": 2.9931167700531575e-06, - "loss": 0.31412452459335327, - "mean_token_accuracy": 0.882760763168335, - "num_tokens": 16419358.0, - "step": 1837 - }, - { - "epoch": 1.3966565349544073, - "grad_norm": 1.912168025970459, - "learning_rate": 2.9910633488777198e-06, - "loss": 0.5065487623214722, - "mean_token_accuracy": 0.8524355292320251, - "num_tokens": 16430418.0, - "step": 1838 - }, - { - "epoch": 1.3974164133738602, - "grad_norm": 2.2173948287963867, - "learning_rate": 2.989009583055121e-06, - "loss": 0.4290938377380371, - "mean_token_accuracy": 0.8381836414337158, - "num_tokens": 16438267.0, - "step": 1839 - }, - { - "epoch": 1.3981762917933132, - "grad_norm": 1.8293484449386597, - "learning_rate": 2.9869554740267726e-06, - "loss": 0.41683733463287354, - "mean_token_accuracy": 0.8548779487609863, - "num_tokens": 16447382.0, - "step": 1840 - }, - { - "epoch": 1.398936170212766, - "grad_norm": 1.835015892982483, - "learning_rate": 2.9849010232343274e-06, - "loss": 0.5080599784851074, - "mean_token_accuracy": 0.8193596601486206, - "num_tokens": 16458541.0, - "step": 1841 - }, - { - "epoch": 1.3996960486322187, - "grad_norm": 2.031339645385742, - "learning_rate": 2.982846232119679e-06, - "loss": 0.5168882012367249, - "mean_token_accuracy": 0.8525956869125366, - "num_tokens": 16467747.0, - "step": 1842 - }, - { - "epoch": 1.4004559270516717, - "grad_norm": 1.5554167032241821, - "learning_rate": 2.9807911021249573e-06, - "loss": 0.35098958015441895, - "mean_token_accuracy": 0.888373851776123, - "num_tokens": 16479319.0, - "step": 1843 - }, - { - "epoch": 1.4012158054711246, - "grad_norm": 1.7183740139007568, - "learning_rate": 2.9787356346925327e-06, - "loss": 0.41263148188591003, - "mean_token_accuracy": 0.8478364944458008, - "num_tokens": 16489952.0, - "step": 1844 - }, - { - "epoch": 1.4019756838905775, - "grad_norm": 1.7743209600448608, - "learning_rate": 2.9766798312650112e-06, - "loss": 0.4211183190345764, - "mean_token_accuracy": 0.8641136884689331, - "num_tokens": 16498655.0, - "step": 1845 - }, - { - "epoch": 1.4027355623100304, - "grad_norm": 2.141300916671753, - "learning_rate": 2.9746236932852355e-06, - "loss": 0.49548980593681335, - "mean_token_accuracy": 0.8304252028465271, - "num_tokens": 16506348.0, - "step": 1846 - }, - { - "epoch": 1.4034954407294833, - "grad_norm": 2.341571807861328, - "learning_rate": 2.9725672221962804e-06, - "loss": 0.40804803371429443, - "mean_token_accuracy": 0.8545800447463989, - "num_tokens": 16513091.0, - "step": 1847 - }, - { - "epoch": 1.4042553191489362, - "grad_norm": 1.934428095817566, - "learning_rate": 2.9705104194414587e-06, - "loss": 0.30029812455177307, - "mean_token_accuracy": 0.9032052755355835, - "num_tokens": 16519455.0, - "step": 1848 - }, - { - "epoch": 1.405015197568389, - "grad_norm": 1.420804500579834, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.4384060502052307, - "mean_token_accuracy": 0.8465110063552856, - "num_tokens": 16533222.0, - "step": 1849 - }, - { - "epoch": 1.405775075987842, - "grad_norm": 2.1180737018585205, - "learning_rate": 2.9663958247086165e-06, - "loss": 0.3915565609931946, - "mean_token_accuracy": 0.8633890748023987, - "num_tokens": 16539489.0, - "step": 1850 - }, - { - "epoch": 1.4065349544072947, - "grad_norm": 1.408048152923584, - "learning_rate": 2.964338035618378e-06, - "loss": 0.46166157722473145, - "mean_token_accuracy": 0.8305013179779053, - "num_tokens": 16555785.0, - "step": 1851 - }, - { - "epoch": 1.4072948328267478, - "grad_norm": 1.3418530225753784, - "learning_rate": 2.9622799206378306e-06, - "loss": 0.5314373970031738, - "mean_token_accuracy": 0.81779944896698, - "num_tokens": 16578111.0, - "step": 1852 - }, - { - "epoch": 1.4080547112462005, - "grad_norm": 1.4634262323379517, - "learning_rate": 2.9602214812114414e-06, - "loss": 0.4859408140182495, - "mean_token_accuracy": 0.8261818885803223, - "num_tokens": 16591976.0, - "step": 1853 - }, - { - "epoch": 1.4088145896656534, - "grad_norm": 1.4840295314788818, - "learning_rate": 2.9581627187838997e-06, - "loss": 0.4079628586769104, - "mean_token_accuracy": 0.8549603223800659, - "num_tokens": 16603631.0, - "step": 1854 - }, - { - "epoch": 1.4095744680851063, - "grad_norm": 2.1474642753601074, - "learning_rate": 2.956103634800126e-06, - "loss": 0.32997995615005493, - "mean_token_accuracy": 0.8836915493011475, - "num_tokens": 16609875.0, - "step": 1855 - }, - { - "epoch": 1.4103343465045592, - "grad_norm": 2.627460241317749, - "learning_rate": 2.9540442307052643e-06, - "loss": 0.3229186236858368, - "mean_token_accuracy": 0.8852157592773438, - "num_tokens": 16614113.0, - "step": 1856 - }, - { - "epoch": 1.4110942249240122, - "grad_norm": 1.9569811820983887, - "learning_rate": 2.9519845079446824e-06, - "loss": 0.5057883858680725, - "mean_token_accuracy": 0.8585711717605591, - "num_tokens": 16624611.0, - "step": 1857 - }, - { - "epoch": 1.411854103343465, - "grad_norm": 2.0604090690612793, - "learning_rate": 2.949924467963975e-06, - "loss": 0.4681510329246521, - "mean_token_accuracy": 0.8390560150146484, - "num_tokens": 16632938.0, - "step": 1858 - }, - { - "epoch": 1.412613981762918, - "grad_norm": 2.5430450439453125, - "learning_rate": 2.9478641122089563e-06, - "loss": 0.3090999126434326, - "mean_token_accuracy": 0.8943990468978882, - "num_tokens": 16637135.0, - "step": 1859 - }, - { - "epoch": 1.4133738601823709, - "grad_norm": 1.3275387287139893, - "learning_rate": 2.945803442125663e-06, - "loss": 0.3592180013656616, - "mean_token_accuracy": 0.8678265810012817, - "num_tokens": 16650322.0, - "step": 1860 - }, - { - "epoch": 1.4141337386018238, - "grad_norm": 1.9070929288864136, - "learning_rate": 2.943742459160354e-06, - "loss": 0.5332518815994263, - "mean_token_accuracy": 0.8475706577301025, - "num_tokens": 16660240.0, - "step": 1861 - }, - { - "epoch": 1.4148936170212765, - "grad_norm": 2.8724546432495117, - "learning_rate": 2.9416811647595052e-06, - "loss": 0.5052884817123413, - "mean_token_accuracy": 0.8363175392150879, - "num_tokens": 16665481.0, - "step": 1862 - }, - { - "epoch": 1.4156534954407296, - "grad_norm": 4.203817844390869, - "learning_rate": 2.939619560369813e-06, - "loss": 0.546925961971283, - "mean_token_accuracy": 0.834044337272644, - "num_tokens": 16669615.0, - "step": 1863 - }, - { - "epoch": 1.4164133738601823, - "grad_norm": 1.6466281414031982, - "learning_rate": 2.9375576474381907e-06, - "loss": 0.3474533259868622, - "mean_token_accuracy": 0.8571163415908813, - "num_tokens": 16678893.0, - "step": 1864 - }, - { - "epoch": 1.4171732522796352, - "grad_norm": 1.8885842561721802, - "learning_rate": 2.9354954274117683e-06, - "loss": 0.3726021349430084, - "mean_token_accuracy": 0.8629094958305359, - "num_tokens": 16685939.0, - "step": 1865 - }, - { - "epoch": 1.417933130699088, - "grad_norm": 2.830599784851074, - "learning_rate": 2.9334329017378898e-06, - "loss": 0.4138668477535248, - "mean_token_accuracy": 0.8670746088027954, - "num_tokens": 16690012.0, - "step": 1866 - }, - { - "epoch": 1.418693009118541, - "grad_norm": 1.6838961839675903, - "learning_rate": 2.9313700718641167e-06, - "loss": 0.33954259753227234, - "mean_token_accuracy": 0.8660278916358948, - "num_tokens": 16700061.0, - "step": 1867 - }, - { - "epoch": 1.419452887537994, - "grad_norm": 2.8767011165618896, - "learning_rate": 2.9293069392382224e-06, - "loss": 0.4650302827358246, - "mean_token_accuracy": 0.8448452949523926, - "num_tokens": 16705072.0, - "step": 1868 - }, - { - "epoch": 1.4202127659574468, - "grad_norm": 1.5901305675506592, - "learning_rate": 2.927243505308192e-06, - "loss": 0.40838998556137085, - "mean_token_accuracy": 0.8560664653778076, - "num_tokens": 16714763.0, - "step": 1869 - }, - { - "epoch": 1.4209726443768997, - "grad_norm": 1.3293657302856445, - "learning_rate": 2.925179771522223e-06, - "loss": 0.34712862968444824, - "mean_token_accuracy": 0.8633697032928467, - "num_tokens": 16729575.0, - "step": 1870 - }, - { - "epoch": 1.4217325227963526, - "grad_norm": 1.7465964555740356, - "learning_rate": 2.9231157393287234e-06, - "loss": 0.48190903663635254, - "mean_token_accuracy": 0.8255834579467773, - "num_tokens": 16742529.0, - "step": 1871 - }, - { - "epoch": 1.4224924012158056, - "grad_norm": 1.865749716758728, - "learning_rate": 2.9210514101763116e-06, - "loss": 0.4912028908729553, - "mean_token_accuracy": 0.8309572339057922, - "num_tokens": 16753989.0, - "step": 1872 - }, - { - "epoch": 1.4232522796352582, - "grad_norm": 2.55780291557312, - "learning_rate": 2.9189867855138103e-06, - "loss": 0.4550635814666748, - "mean_token_accuracy": 0.8584091067314148, - "num_tokens": 16758906.0, - "step": 1873 - }, - { - "epoch": 1.4240121580547114, - "grad_norm": 1.867530107498169, - "learning_rate": 2.9169218667902562e-06, - "loss": 0.3524911105632782, - "mean_token_accuracy": 0.8715004920959473, - "num_tokens": 16765969.0, - "step": 1874 - }, - { - "epoch": 1.424772036474164, - "grad_norm": 1.8886862993240356, - "learning_rate": 2.9148566554548857e-06, - "loss": 0.37144535779953003, - "mean_token_accuracy": 0.8640961050987244, - "num_tokens": 16773935.0, - "step": 1875 - }, - { - "epoch": 1.425531914893617, - "grad_norm": 1.266065239906311, - "learning_rate": 2.912791152957145e-06, - "loss": 0.3341747522354126, - "mean_token_accuracy": 0.8929134607315063, - "num_tokens": 16787780.0, - "step": 1876 - }, - { - "epoch": 1.4262917933130699, - "grad_norm": 2.524888753890991, - "learning_rate": 2.9107253607466833e-06, - "loss": 0.33709171414375305, - "mean_token_accuracy": 0.8857531547546387, - "num_tokens": 16792753.0, - "step": 1877 - }, - { - "epoch": 1.4270516717325228, - "grad_norm": 1.9269018173217773, - "learning_rate": 2.908659280273354e-06, - "loss": 0.32599249482154846, - "mean_token_accuracy": 0.8777773380279541, - "num_tokens": 16799904.0, - "step": 1878 - }, - { - "epoch": 1.4278115501519757, - "grad_norm": 1.9844375848770142, - "learning_rate": 2.9065929129872097e-06, - "loss": 0.4086732268333435, - "mean_token_accuracy": 0.8505409955978394, - "num_tokens": 16807774.0, - "step": 1879 - }, - { - "epoch": 1.4285714285714286, - "grad_norm": 4.0958662033081055, - "learning_rate": 2.9045262603385073e-06, - "loss": 0.3838827610015869, - "mean_token_accuracy": 0.877601146697998, - "num_tokens": 16810908.0, - "step": 1880 - }, - { - "epoch": 1.4293313069908815, - "grad_norm": 1.7323768138885498, - "learning_rate": 2.902459323777704e-06, - "loss": 0.37459003925323486, - "mean_token_accuracy": 0.8655836582183838, - "num_tokens": 16819494.0, - "step": 1881 - }, - { - "epoch": 1.4300911854103344, - "grad_norm": 2.608043670654297, - "learning_rate": 2.900392104755455e-06, - "loss": 0.5798726677894592, - "mean_token_accuracy": 0.8382592797279358, - "num_tokens": 16827745.0, - "step": 1882 - }, - { - "epoch": 1.4308510638297873, - "grad_norm": 1.3262078762054443, - "learning_rate": 2.8983246047226137e-06, - "loss": 0.3724595904350281, - "mean_token_accuracy": 0.8651963472366333, - "num_tokens": 16844171.0, - "step": 1883 - }, - { - "epoch": 1.43161094224924, - "grad_norm": 1.7250545024871826, - "learning_rate": 2.8962568251302327e-06, - "loss": 0.3478979468345642, - "mean_token_accuracy": 0.8807886242866516, - "num_tokens": 16852838.0, - "step": 1884 - }, - { - "epoch": 1.4323708206686931, - "grad_norm": 2.114525318145752, - "learning_rate": 2.8941887674295573e-06, - "loss": 0.5156140327453613, - "mean_token_accuracy": 0.825178861618042, - "num_tokens": 16861087.0, - "step": 1885 - }, - { - "epoch": 1.4331306990881458, - "grad_norm": 2.400829792022705, - "learning_rate": 2.892120433072031e-06, - "loss": 0.2807392477989197, - "mean_token_accuracy": 0.8907361030578613, - "num_tokens": 16866557.0, - "step": 1886 - }, - { - "epoch": 1.4338905775075987, - "grad_norm": 2.490880012512207, - "learning_rate": 2.8900518235092908e-06, - "loss": 0.2615952491760254, - "mean_token_accuracy": 0.9152894020080566, - "num_tokens": 16871357.0, - "step": 1887 - }, - { - "epoch": 1.4346504559270516, - "grad_norm": 1.9058431386947632, - "learning_rate": 2.887982940193165e-06, - "loss": 0.43623363971710205, - "mean_token_accuracy": 0.84696364402771, - "num_tokens": 16879016.0, - "step": 1888 - }, - { - "epoch": 1.4354103343465046, - "grad_norm": 1.4520210027694702, - "learning_rate": 2.8859137845756785e-06, - "loss": 0.3961856961250305, - "mean_token_accuracy": 0.8518897294998169, - "num_tokens": 16892254.0, - "step": 1889 - }, - { - "epoch": 1.4361702127659575, - "grad_norm": 2.500274896621704, - "learning_rate": 2.8838443581090415e-06, - "loss": 0.41457289457321167, - "mean_token_accuracy": 0.8751448392868042, - "num_tokens": 16897156.0, - "step": 1890 - }, - { - "epoch": 1.4369300911854104, - "grad_norm": 2.9312057495117188, - "learning_rate": 2.8817746622456585e-06, - "loss": 0.45875269174575806, - "mean_token_accuracy": 0.8411039113998413, - "num_tokens": 16902291.0, - "step": 1891 - }, - { - "epoch": 1.4376899696048633, - "grad_norm": 2.367419481277466, - "learning_rate": 2.879704698438121e-06, - "loss": 0.3643629848957062, - "mean_token_accuracy": 0.8771071434020996, - "num_tokens": 16908128.0, - "step": 1892 - }, - { - "epoch": 1.4384498480243162, - "grad_norm": 1.9907705783843994, - "learning_rate": 2.8776344681392106e-06, - "loss": 0.3206835389137268, - "mean_token_accuracy": 0.879996657371521, - "num_tokens": 16914918.0, - "step": 1893 - }, - { - "epoch": 1.439209726443769, - "grad_norm": 3.536956310272217, - "learning_rate": 2.875563972801893e-06, - "loss": 0.3640141785144806, - "mean_token_accuracy": 0.8814959526062012, - "num_tokens": 16918187.0, - "step": 1894 - }, - { - "epoch": 1.4399696048632218, - "grad_norm": 1.3451156616210938, - "learning_rate": 2.8734932138793226e-06, - "loss": 0.3427346348762512, - "mean_token_accuracy": 0.8835382461547852, - "num_tokens": 16931135.0, - "step": 1895 - }, - { - "epoch": 1.4407294832826747, - "grad_norm": 2.0735955238342285, - "learning_rate": 2.871422192824837e-06, - "loss": 0.4265315532684326, - "mean_token_accuracy": 0.8452677726745605, - "num_tokens": 16937995.0, - "step": 1896 - }, - { - "epoch": 1.4414893617021276, - "grad_norm": 1.5124932527542114, - "learning_rate": 2.8693509110919597e-06, - "loss": 0.497121661901474, - "mean_token_accuracy": 0.815092921257019, - "num_tokens": 16952743.0, - "step": 1897 - }, - { - "epoch": 1.4422492401215805, - "grad_norm": 3.716669797897339, - "learning_rate": 2.867279370134395e-06, - "loss": 0.5452651381492615, - "mean_token_accuracy": 0.8150380849838257, - "num_tokens": 16956797.0, - "step": 1898 - }, - { - "epoch": 1.4430091185410334, - "grad_norm": 1.3571398258209229, - "learning_rate": 2.8652075714060296e-06, - "loss": 0.4249724745750427, - "mean_token_accuracy": 0.8675867915153503, - "num_tokens": 16974494.0, - "step": 1899 - }, - { - "epoch": 1.4437689969604863, - "grad_norm": 2.310673475265503, - "learning_rate": 2.863135516360932e-06, - "loss": 0.39368677139282227, - "mean_token_accuracy": 0.878392219543457, - "num_tokens": 16980612.0, - "step": 1900 - }, - { - "epoch": 1.4445288753799392, - "grad_norm": 1.9025533199310303, - "learning_rate": 2.8610632064533517e-06, - "loss": 0.4786127805709839, - "mean_token_accuracy": 0.8720556497573853, - "num_tokens": 16992262.0, - "step": 1901 - }, - { - "epoch": 1.4452887537993921, - "grad_norm": 2.528564453125, - "learning_rate": 2.8589906431377133e-06, - "loss": 0.4223094582557678, - "mean_token_accuracy": 0.8513246178627014, - "num_tokens": 16997717.0, - "step": 1902 - }, - { - "epoch": 1.446048632218845, - "grad_norm": 1.010425329208374, - "learning_rate": 2.8569178278686222e-06, - "loss": 0.3908255696296692, - "mean_token_accuracy": 0.8620463609695435, - "num_tokens": 17020903.0, - "step": 1903 - }, - { - "epoch": 1.4468085106382977, - "grad_norm": 1.5760232210159302, - "learning_rate": 2.8548447621008614e-06, - "loss": 0.4134044051170349, - "mean_token_accuracy": 0.8472093343734741, - "num_tokens": 17035250.0, - "step": 1904 - }, - { - "epoch": 1.4475683890577509, - "grad_norm": 2.0668535232543945, - "learning_rate": 2.8527714472893866e-06, - "loss": 0.44095730781555176, - "mean_token_accuracy": 0.881983757019043, - "num_tokens": 17042170.0, - "step": 1905 - }, - { - "epoch": 1.4483282674772036, - "grad_norm": 1.1620599031448364, - "learning_rate": 2.85069788488933e-06, - "loss": 0.3607163429260254, - "mean_token_accuracy": 0.8684282898902893, - "num_tokens": 17061937.0, - "step": 1906 - }, - { - "epoch": 1.4490881458966565, - "grad_norm": 2.1316568851470947, - "learning_rate": 2.8486240763559984e-06, - "loss": 0.3478124141693115, - "mean_token_accuracy": 0.8772403001785278, - "num_tokens": 17068628.0, - "step": 1907 - }, - { - "epoch": 1.4498480243161094, - "grad_norm": 2.4756391048431396, - "learning_rate": 2.8465500231448707e-06, - "loss": 0.46441152691841125, - "mean_token_accuracy": 0.8436450958251953, - "num_tokens": 17075495.0, - "step": 1908 - }, - { - "epoch": 1.4506079027355623, - "grad_norm": 2.249720573425293, - "learning_rate": 2.844475726711595e-06, - "loss": 0.41565513610839844, - "mean_token_accuracy": 0.8525094985961914, - "num_tokens": 17080940.0, - "step": 1909 - }, - { - "epoch": 1.4513677811550152, - "grad_norm": 2.3081841468811035, - "learning_rate": 2.8424011885119956e-06, - "loss": 0.49903199076652527, - "mean_token_accuracy": 0.8212426900863647, - "num_tokens": 17092024.0, - "step": 1910 - }, - { - "epoch": 1.452127659574468, - "grad_norm": 1.2929959297180176, - "learning_rate": 2.8403264100020613e-06, - "loss": 0.47038257122039795, - "mean_token_accuracy": 0.8319816589355469, - "num_tokens": 17108840.0, - "step": 1911 - }, - { - "epoch": 1.452887537993921, - "grad_norm": 1.6476463079452515, - "learning_rate": 2.8382513926379508e-06, - "loss": 0.42287829518318176, - "mean_token_accuracy": 0.8555682897567749, - "num_tokens": 17119704.0, - "step": 1912 - }, - { - "epoch": 1.453647416413374, - "grad_norm": 1.759998083114624, - "learning_rate": 2.836176137875993e-06, - "loss": 0.40904951095581055, - "mean_token_accuracy": 0.8698266744613647, - "num_tokens": 17130676.0, - "step": 1913 - }, - { - "epoch": 1.4544072948328268, - "grad_norm": 1.510909914970398, - "learning_rate": 2.8341006471726817e-06, - "loss": 0.47834792733192444, - "mean_token_accuracy": 0.8335825204849243, - "num_tokens": 17146304.0, - "step": 1914 - }, - { - "epoch": 1.4551671732522795, - "grad_norm": 3.538071632385254, - "learning_rate": 2.832024921984674e-06, - "loss": 0.34059035778045654, - "mean_token_accuracy": 0.8769031763076782, - "num_tokens": 17150458.0, - "step": 1915 - }, - { - "epoch": 1.4559270516717326, - "grad_norm": 2.3368659019470215, - "learning_rate": 2.8299489637687955e-06, - "loss": 0.43068382143974304, - "mean_token_accuracy": 0.845360517501831, - "num_tokens": 17157368.0, - "step": 1916 - }, - { - "epoch": 1.4566869300911853, - "grad_norm": 1.8720396757125854, - "learning_rate": 2.8278727739820334e-06, - "loss": 0.37013399600982666, - "mean_token_accuracy": 0.854241132736206, - "num_tokens": 17166325.0, - "step": 1917 - }, - { - "epoch": 1.4574468085106382, - "grad_norm": 1.6706892251968384, - "learning_rate": 2.825796354081537e-06, - "loss": 0.5397020578384399, - "mean_token_accuracy": 0.8309713006019592, - "num_tokens": 17178920.0, - "step": 1918 - }, - { - "epoch": 1.4582066869300911, - "grad_norm": 2.729210376739502, - "learning_rate": 2.8237197055246175e-06, - "loss": 0.25137859582901, - "mean_token_accuracy": 0.9148792028427124, - "num_tokens": 17183107.0, - "step": 1919 - }, - { - "epoch": 1.458966565349544, - "grad_norm": 3.023500680923462, - "learning_rate": 2.821642829768748e-06, - "loss": 0.43312495946884155, - "mean_token_accuracy": 0.8481811285018921, - "num_tokens": 17187853.0, - "step": 1920 - }, - { - "epoch": 1.459726443768997, - "grad_norm": 1.8108519315719604, - "learning_rate": 2.8195657282715595e-06, - "loss": 0.5101792216300964, - "mean_token_accuracy": 0.8315553069114685, - "num_tokens": 17199247.0, - "step": 1921 - }, - { - "epoch": 1.4604863221884499, - "grad_norm": 2.0262672901153564, - "learning_rate": 2.817488402490841e-06, - "loss": 0.4449934959411621, - "mean_token_accuracy": 0.8634527325630188, - "num_tokens": 17206348.0, - "step": 1922 - }, - { - "epoch": 1.4612462006079028, - "grad_norm": 2.6163926124572754, - "learning_rate": 2.8154108538845405e-06, - "loss": 0.43052345514297485, - "mean_token_accuracy": 0.8375401496887207, - "num_tokens": 17211702.0, - "step": 1923 - }, - { - "epoch": 1.4620060790273557, - "grad_norm": 2.0854408740997314, - "learning_rate": 2.813333083910761e-06, - "loss": 0.5011380910873413, - "mean_token_accuracy": 0.8359915018081665, - "num_tokens": 17219096.0, - "step": 1924 - }, - { - "epoch": 1.4627659574468086, - "grad_norm": 2.2081687450408936, - "learning_rate": 2.8112550940277615e-06, - "loss": 0.5239193439483643, - "mean_token_accuracy": 0.8499593734741211, - "num_tokens": 17229266.0, - "step": 1925 - }, - { - "epoch": 1.4635258358662613, - "grad_norm": 1.798343539237976, - "learning_rate": 2.809176885693956e-06, - "loss": 0.4515029191970825, - "mean_token_accuracy": 0.8400485515594482, - "num_tokens": 17239280.0, - "step": 1926 - }, - { - "epoch": 1.4642857142857144, - "grad_norm": 1.897887945175171, - "learning_rate": 2.807098460367911e-06, - "loss": 0.35935714840888977, - "mean_token_accuracy": 0.8776072263717651, - "num_tokens": 17247132.0, - "step": 1927 - }, - { - "epoch": 1.465045592705167, - "grad_norm": 2.705836296081543, - "learning_rate": 2.8050198195083445e-06, - "loss": 0.3728443682193756, - "mean_token_accuracy": 0.8649885654449463, - "num_tokens": 17251865.0, - "step": 1928 - }, - { - "epoch": 1.46580547112462, - "grad_norm": 1.841178059577942, - "learning_rate": 2.802940964574127e-06, - "loss": 0.40604841709136963, - "mean_token_accuracy": 0.8537783622741699, - "num_tokens": 17260163.0, - "step": 1929 - }, - { - "epoch": 1.466565349544073, - "grad_norm": 2.7393605709075928, - "learning_rate": 2.800861897024279e-06, - "loss": 0.39346879720687866, - "mean_token_accuracy": 0.8628787994384766, - "num_tokens": 17264876.0, - "step": 1930 - }, - { - "epoch": 1.4673252279635258, - "grad_norm": 1.84367835521698, - "learning_rate": 2.798782618317971e-06, - "loss": 0.37411895394325256, - "mean_token_accuracy": 0.8605265617370605, - "num_tokens": 17273049.0, - "step": 1931 - }, - { - "epoch": 1.4680851063829787, - "grad_norm": 1.6546733379364014, - "learning_rate": 2.796703129914519e-06, - "loss": 0.4997844099998474, - "mean_token_accuracy": 0.8267433643341064, - "num_tokens": 17285074.0, - "step": 1932 - }, - { - "epoch": 1.4688449848024316, - "grad_norm": 2.2749221324920654, - "learning_rate": 2.79462343327339e-06, - "loss": 0.35453367233276367, - "mean_token_accuracy": 0.8746850490570068, - "num_tokens": 17290273.0, - "step": 1933 - }, - { - "epoch": 1.4696048632218845, - "grad_norm": 1.7142518758773804, - "learning_rate": 2.7925435298541944e-06, - "loss": 0.345878541469574, - "mean_token_accuracy": 0.8600981831550598, - "num_tokens": 17301045.0, - "step": 1934 - }, - { - "epoch": 1.4703647416413375, - "grad_norm": 3.163342237472534, - "learning_rate": 2.7904634211166877e-06, - "loss": 0.4356975853443146, - "mean_token_accuracy": 0.8460350036621094, - "num_tokens": 17305108.0, - "step": 1935 - }, - { - "epoch": 1.4711246200607904, - "grad_norm": 1.6377612352371216, - "learning_rate": 2.7883831085207707e-06, - "loss": 0.4459729790687561, - "mean_token_accuracy": 0.8463394641876221, - "num_tokens": 17315479.0, - "step": 1936 - }, - { - "epoch": 1.471884498480243, - "grad_norm": 1.865268588066101, - "learning_rate": 2.7863025935264876e-06, - "loss": 0.394723117351532, - "mean_token_accuracy": 0.864177942276001, - "num_tokens": 17324795.0, - "step": 1937 - }, - { - "epoch": 1.4726443768996962, - "grad_norm": 1.241937518119812, - "learning_rate": 2.784221877594024e-06, - "loss": 0.2752220630645752, - "mean_token_accuracy": 0.8998259902000427, - "num_tokens": 17338000.0, - "step": 1938 - }, - { - "epoch": 1.4734042553191489, - "grad_norm": 1.8013651371002197, - "learning_rate": 2.7821409621837042e-06, - "loss": 0.4251005947589874, - "mean_token_accuracy": 0.8518919348716736, - "num_tokens": 17347351.0, - "step": 1939 - }, - { - "epoch": 1.4741641337386018, - "grad_norm": 1.2902207374572754, - "learning_rate": 2.7800598487559976e-06, - "loss": 0.3640727400779724, - "mean_token_accuracy": 0.8592870235443115, - "num_tokens": 17362335.0, - "step": 1940 - }, - { - "epoch": 1.4749240121580547, - "grad_norm": 2.5427513122558594, - "learning_rate": 2.777978538771508e-06, - "loss": 0.38166797161102295, - "mean_token_accuracy": 0.8653234839439392, - "num_tokens": 17367733.0, - "step": 1941 - }, - { - "epoch": 1.4756838905775076, - "grad_norm": 1.7793641090393066, - "learning_rate": 2.7758970336909795e-06, - "loss": 0.3113783895969391, - "mean_token_accuracy": 0.8812868595123291, - "num_tokens": 17375267.0, - "step": 1942 - }, - { - "epoch": 1.4764437689969605, - "grad_norm": 3.4031741619110107, - "learning_rate": 2.7738153349752923e-06, - "loss": 0.4800986647605896, - "mean_token_accuracy": 0.8336698412895203, - "num_tokens": 17379549.0, - "step": 1943 - }, - { - "epoch": 1.4772036474164134, - "grad_norm": 1.3451651334762573, - "learning_rate": 2.7717334440854634e-06, - "loss": 0.3115345239639282, - "mean_token_accuracy": 0.908623218536377, - "num_tokens": 17394455.0, - "step": 1944 - }, - { - "epoch": 1.4779635258358663, - "grad_norm": 1.980919599533081, - "learning_rate": 2.7696513624826422e-06, - "loss": 0.391154944896698, - "mean_token_accuracy": 0.8650267720222473, - "num_tokens": 17401931.0, - "step": 1945 - }, - { - "epoch": 1.4787234042553192, - "grad_norm": 1.0118765830993652, - "learning_rate": 2.7675690916281158e-06, - "loss": 0.3157956600189209, - "mean_token_accuracy": 0.8827471733093262, - "num_tokens": 17424144.0, - "step": 1946 - }, - { - "epoch": 1.4794832826747721, - "grad_norm": 1.579654335975647, - "learning_rate": 2.7654866329833e-06, - "loss": 0.4578486382961273, - "mean_token_accuracy": 0.8361750245094299, - "num_tokens": 17435769.0, - "step": 1947 - }, - { - "epoch": 1.4802431610942248, - "grad_norm": 1.7706717252731323, - "learning_rate": 2.763403988009746e-06, - "loss": 0.3564416170120239, - "mean_token_accuracy": 0.8689201474189758, - "num_tokens": 17444088.0, - "step": 1948 - }, - { - "epoch": 1.4810030395136777, - "grad_norm": 1.2264244556427002, - "learning_rate": 2.761321158169134e-06, - "loss": 0.30763837695121765, - "mean_token_accuracy": 0.8960219621658325, - "num_tokens": 17458096.0, - "step": 1949 - }, - { - "epoch": 1.4817629179331306, - "grad_norm": 1.214431881904602, - "learning_rate": 2.759238144923274e-06, - "loss": 0.49099457263946533, - "mean_token_accuracy": 0.8279136419296265, - "num_tokens": 17481062.0, - "step": 1950 - }, - { - "epoch": 1.4825227963525835, - "grad_norm": 1.593892216682434, - "learning_rate": 2.7571549497341044e-06, - "loss": 0.3745320737361908, - "mean_token_accuracy": 0.8690779209136963, - "num_tokens": 17490874.0, - "step": 1951 - }, - { - "epoch": 1.4832826747720365, - "grad_norm": 2.409924268722534, - "learning_rate": 2.755071574063692e-06, - "loss": 0.4310247600078583, - "mean_token_accuracy": 0.8521159291267395, - "num_tokens": 17496942.0, - "step": 1952 - }, - { - "epoch": 1.4840425531914894, - "grad_norm": 1.2557463645935059, - "learning_rate": 2.7529880193742297e-06, - "loss": 0.34304720163345337, - "mean_token_accuracy": 0.8748183250427246, - "num_tokens": 17514391.0, - "step": 1953 - }, - { - "epoch": 1.4848024316109423, - "grad_norm": 1.17310631275177, - "learning_rate": 2.7509042871280373e-06, - "loss": 0.3835817277431488, - "mean_token_accuracy": 0.8853274583816528, - "num_tokens": 17533289.0, - "step": 1954 - }, - { - "epoch": 1.4855623100303952, - "grad_norm": 1.5261479616165161, - "learning_rate": 2.748820378787558e-06, - "loss": 0.4799988865852356, - "mean_token_accuracy": 0.8252149820327759, - "num_tokens": 17544118.0, - "step": 1955 - }, - { - "epoch": 1.486322188449848, - "grad_norm": 2.030930757522583, - "learning_rate": 2.7467362958153585e-06, - "loss": 0.35690805315971375, - "mean_token_accuracy": 0.8959587216377258, - "num_tokens": 17550431.0, - "step": 1956 - }, - { - "epoch": 1.4870820668693008, - "grad_norm": 2.376520872116089, - "learning_rate": 2.7446520396741293e-06, - "loss": 0.262234091758728, - "mean_token_accuracy": 0.9054547548294067, - "num_tokens": 17554853.0, - "step": 1957 - }, - { - "epoch": 1.487841945288754, - "grad_norm": 1.6944479942321777, - "learning_rate": 2.742567611826681e-06, - "loss": 0.529259979724884, - "mean_token_accuracy": 0.8195339441299438, - "num_tokens": 17568016.0, - "step": 1958 - }, - { - "epoch": 1.4886018237082066, - "grad_norm": 2.833029270172119, - "learning_rate": 2.7404830137359445e-06, - "loss": 0.30229634046554565, - "mean_token_accuracy": 0.8933001756668091, - "num_tokens": 17572587.0, - "step": 1959 - }, - { - "epoch": 1.4893617021276595, - "grad_norm": 1.7040144205093384, - "learning_rate": 2.7383982468649715e-06, - "loss": 0.3166356682777405, - "mean_token_accuracy": 0.8871906399726868, - "num_tokens": 17580966.0, - "step": 1960 - }, - { - "epoch": 1.4901215805471124, - "grad_norm": 1.7539052963256836, - "learning_rate": 2.7363133126769326e-06, - "loss": 0.4231064021587372, - "mean_token_accuracy": 0.8708304166793823, - "num_tokens": 17590907.0, - "step": 1961 - }, - { - "epoch": 1.4908814589665653, - "grad_norm": 1.6198650598526, - "learning_rate": 2.7342282126351145e-06, - "loss": 0.4198967218399048, - "mean_token_accuracy": 0.8723280429840088, - "num_tokens": 17604291.0, - "step": 1962 - }, - { - "epoch": 1.4916413373860182, - "grad_norm": 1.8437711000442505, - "learning_rate": 2.73214294820292e-06, - "loss": 0.38923323154449463, - "mean_token_accuracy": 0.8697006106376648, - "num_tokens": 17612291.0, - "step": 1963 - }, - { - "epoch": 1.4924012158054711, - "grad_norm": 1.1129369735717773, - "learning_rate": 2.7300575208438684e-06, - "loss": 0.3107512593269348, - "mean_token_accuracy": 0.878618597984314, - "num_tokens": 17630073.0, - "step": 1964 - }, - { - "epoch": 1.493161094224924, - "grad_norm": 3.0210442543029785, - "learning_rate": 2.7279719320215924e-06, - "loss": 0.4630751609802246, - "mean_token_accuracy": 0.8567075729370117, - "num_tokens": 17634758.0, - "step": 1965 - }, - { - "epoch": 1.493920972644377, - "grad_norm": 2.8825972080230713, - "learning_rate": 2.725886183199839e-06, - "loss": 0.35351765155792236, - "mean_token_accuracy": 0.8711981773376465, - "num_tokens": 17639613.0, - "step": 1966 - }, - { - "epoch": 1.4946808510638299, - "grad_norm": 2.111238718032837, - "learning_rate": 2.723800275842468e-06, - "loss": 0.3529569208621979, - "mean_token_accuracy": 0.8679244518280029, - "num_tokens": 17645308.0, - "step": 1967 - }, - { - "epoch": 1.4954407294832825, - "grad_norm": 2.080509901046753, - "learning_rate": 2.7217142114134466e-06, - "loss": 0.43321219086647034, - "mean_token_accuracy": 0.8848220109939575, - "num_tokens": 17652292.0, - "step": 1968 - }, - { - "epoch": 1.4962006079027357, - "grad_norm": 2.8686363697052, - "learning_rate": 2.7196279913768587e-06, - "loss": 0.417035311460495, - "mean_token_accuracy": 0.8724601864814758, - "num_tokens": 17656908.0, - "step": 1969 - }, - { - "epoch": 1.4969604863221884, - "grad_norm": 3.294193744659424, - "learning_rate": 2.717541617196891e-06, - "loss": 0.3551934063434601, - "mean_token_accuracy": 0.8838565349578857, - "num_tokens": 17660590.0, - "step": 1970 - }, - { - "epoch": 1.4977203647416413, - "grad_norm": 1.766292929649353, - "learning_rate": 2.7154550903378425e-06, - "loss": 0.36521971225738525, - "mean_token_accuracy": 0.8810199499130249, - "num_tokens": 17668214.0, - "step": 1971 - }, - { - "epoch": 1.4984802431610942, - "grad_norm": 1.2127676010131836, - "learning_rate": 2.713368412264118e-06, - "loss": 0.35184425115585327, - "mean_token_accuracy": 0.8672580718994141, - "num_tokens": 17684736.0, - "step": 1972 - }, - { - "epoch": 1.499240121580547, - "grad_norm": 2.268256664276123, - "learning_rate": 2.711281584440228e-06, - "loss": 0.40115267038345337, - "mean_token_accuracy": 0.8517841100692749, - "num_tokens": 17691510.0, - "step": 1973 - }, - { - "epoch": 1.5, - "grad_norm": 2.7196054458618164, - "learning_rate": 2.70919460833079e-06, - "loss": 0.3819037675857544, - "mean_token_accuracy": 0.8765411376953125, - "num_tokens": 17696179.0, - "step": 1974 - }, - { - "epoch": 1.500759878419453, - "grad_norm": 2.969406843185425, - "learning_rate": 2.7071074854005206e-06, - "loss": 0.3922455608844757, - "mean_token_accuracy": 0.8796037435531616, - "num_tokens": 17700597.0, - "step": 1975 - }, - { - "epoch": 1.5015197568389058, - "grad_norm": 2.2965853214263916, - "learning_rate": 2.705020217114248e-06, - "loss": 0.5433666110038757, - "mean_token_accuracy": 0.809639036655426, - "num_tokens": 17708895.0, - "step": 1976 - }, - { - "epoch": 1.5022796352583585, - "grad_norm": 1.5584394931793213, - "learning_rate": 2.7029328049368942e-06, - "loss": 0.4736343324184418, - "mean_token_accuracy": 0.8197190761566162, - "num_tokens": 17725202.0, - "step": 1977 - }, - { - "epoch": 1.5030395136778116, - "grad_norm": 1.3903142213821411, - "learning_rate": 2.700845250333486e-06, - "loss": 0.4471571445465088, - "mean_token_accuracy": 0.839043140411377, - "num_tokens": 17742835.0, - "step": 1978 - }, - { - "epoch": 1.5037993920972643, - "grad_norm": 3.080716609954834, - "learning_rate": 2.69875755476915e-06, - "loss": 0.45760005712509155, - "mean_token_accuracy": 0.8366328477859497, - "num_tokens": 17747324.0, - "step": 1979 - }, - { - "epoch": 1.5045592705167175, - "grad_norm": 1.0150405168533325, - "learning_rate": 2.696669719709111e-06, - "loss": 0.33638954162597656, - "mean_token_accuracy": 0.8591676354408264, - "num_tokens": 17765565.0, - "step": 1980 - }, - { - "epoch": 1.5053191489361701, - "grad_norm": 2.402927875518799, - "learning_rate": 2.694581746618691e-06, - "loss": 0.4086601436138153, - "mean_token_accuracy": 0.8769911527633667, - "num_tokens": 17771275.0, - "step": 1981 - }, - { - "epoch": 1.506079027355623, - "grad_norm": 2.030583381652832, - "learning_rate": 2.6924936369633126e-06, - "loss": 0.5115457773208618, - "mean_token_accuracy": 0.8054746389389038, - "num_tokens": 17779999.0, - "step": 1982 - }, - { - "epoch": 1.506838905775076, - "grad_norm": 2.575199604034424, - "learning_rate": 2.6904053922084893e-06, - "loss": 0.363183856010437, - "mean_token_accuracy": 0.8716042637825012, - "num_tokens": 17785473.0, - "step": 1983 - }, - { - "epoch": 1.5075987841945289, - "grad_norm": 1.8497480154037476, - "learning_rate": 2.688317013819832e-06, - "loss": 0.4254384934902191, - "mean_token_accuracy": 0.8549597263336182, - "num_tokens": 17793812.0, - "step": 1984 - }, - { - "epoch": 1.5083586626139818, - "grad_norm": 1.7786511182785034, - "learning_rate": 2.686228503263045e-06, - "loss": 0.33400774002075195, - "mean_token_accuracy": 0.9027615189552307, - "num_tokens": 17801783.0, - "step": 1985 - }, - { - "epoch": 1.5091185410334347, - "grad_norm": 1.8365367650985718, - "learning_rate": 2.684139862003927e-06, - "loss": 0.35765063762664795, - "mean_token_accuracy": 0.8663736581802368, - "num_tokens": 17809562.0, - "step": 1986 - }, - { - "epoch": 1.5098784194528876, - "grad_norm": 1.8817477226257324, - "learning_rate": 2.682051091508365e-06, - "loss": 0.4627506732940674, - "mean_token_accuracy": 0.8358862400054932, - "num_tokens": 17819094.0, - "step": 1987 - }, - { - "epoch": 1.5106382978723403, - "grad_norm": 2.221547842025757, - "learning_rate": 2.679962193242338e-06, - "loss": 0.577020525932312, - "mean_token_accuracy": 0.80013108253479, - "num_tokens": 17826666.0, - "step": 1988 - }, - { - "epoch": 1.5113981762917934, - "grad_norm": 2.6618270874023438, - "learning_rate": 2.6778731686719177e-06, - "loss": 0.44632256031036377, - "mean_token_accuracy": 0.8611289262771606, - "num_tokens": 17833172.0, - "step": 1989 - }, - { - "epoch": 1.512158054711246, - "grad_norm": 2.9495689868927, - "learning_rate": 2.67578401926326e-06, - "loss": 0.3482511043548584, - "mean_token_accuracy": 0.8703314661979675, - "num_tokens": 17837220.0, - "step": 1990 - }, - { - "epoch": 1.5129179331306992, - "grad_norm": 2.0943644046783447, - "learning_rate": 2.6736947464826107e-06, - "loss": 0.2354314625263214, - "mean_token_accuracy": 0.9137634038925171, - "num_tokens": 17842712.0, - "step": 1991 - }, - { - "epoch": 1.513677811550152, - "grad_norm": 1.1303033828735352, - "learning_rate": 2.671605351796302e-06, - "loss": 0.3624761700630188, - "mean_token_accuracy": 0.8769594430923462, - "num_tokens": 17860902.0, - "step": 1992 - }, - { - "epoch": 1.5144376899696048, - "grad_norm": 2.8921146392822266, - "learning_rate": 2.6695158366707526e-06, - "loss": 0.2517220973968506, - "mean_token_accuracy": 0.8974182605743408, - "num_tokens": 17865160.0, - "step": 1993 - }, - { - "epoch": 1.5151975683890577, - "grad_norm": 2.320587158203125, - "learning_rate": 2.667426202572463e-06, - "loss": 0.4589889943599701, - "mean_token_accuracy": 0.8379613161087036, - "num_tokens": 17871994.0, - "step": 1994 - }, - { - "epoch": 1.5159574468085106, - "grad_norm": 1.1407674551010132, - "learning_rate": 2.665336450968019e-06, - "loss": 0.34412115812301636, - "mean_token_accuracy": 0.8776306509971619, - "num_tokens": 17889941.0, - "step": 1995 - }, - { - "epoch": 1.5167173252279635, - "grad_norm": 2.069814920425415, - "learning_rate": 2.6632465833240895e-06, - "loss": 0.47524404525756836, - "mean_token_accuracy": 0.830310046672821, - "num_tokens": 17898447.0, - "step": 1996 - }, - { - "epoch": 1.5174772036474165, - "grad_norm": 1.822415828704834, - "learning_rate": 2.661156601107424e-06, - "loss": 0.4541318416595459, - "mean_token_accuracy": 0.8856616020202637, - "num_tokens": 17908729.0, - "step": 1997 - }, - { - "epoch": 1.5182370820668694, - "grad_norm": 2.851428985595703, - "learning_rate": 2.659066505784852e-06, - "loss": 0.41761666536331177, - "mean_token_accuracy": 0.8710572719573975, - "num_tokens": 17913860.0, - "step": 1998 - }, - { - "epoch": 1.518996960486322, - "grad_norm": 1.8483710289001465, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.45517268776893616, - "mean_token_accuracy": 0.8411115407943726, - "num_tokens": 17923497.0, - "step": 1999 - }, - { - "epoch": 1.5197568389057752, - "grad_norm": 1.9044219255447388, - "learning_rate": 2.654885981689706e-06, - "loss": 0.42533189058303833, - "mean_token_accuracy": 0.8597894906997681, - "num_tokens": 17932670.0, - "step": 2000 - } - ], - "logging_steps": 1.0, - "max_steps": 3948, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.9547571235271475e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin deleted file mode 100644 index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000 --- a/checkpoint-2000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 -size 6225 diff --git a/checkpoint-3000/chat_template.jinja b/checkpoint-3000/chat_template.jinja deleted file mode 100644 index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000 --- a/checkpoint-3000/chat_template.jinja +++ /dev/null @@ -1,61 +0,0 @@ -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} - {%- endif %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} \ No newline at end of file diff --git a/checkpoint-3000/config.json b/checkpoint-3000/config.json deleted file mode 100644 index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000 --- a/checkpoint-3000/config.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": null, - "dtype": "float32", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 262144, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "pad_token_id": 151662, - "rms_norm_eps": 1e-06, - "rope_parameters": { - "rope_theta": 5000000, - "rope_type": "default" - }, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "5.5.3", - "use_cache": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/checkpoint-3000/generation_config.json b/checkpoint-3000/generation_config.json deleted file mode 100644 index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000 --- a/checkpoint-3000/generation_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "do_sample": true, - "eos_token_id": [ - 151645, - 151643 - ], - "pad_token_id": 151662, - "temperature": 0.7, - "top_k": 20, - "top_p": 0.8, - "transformers_version": "5.5.3" -} diff --git a/checkpoint-3000/model.safetensors b/checkpoint-3000/model.safetensors deleted file mode 100644 index 07e0e931ed749c8c0c6c086ebb969bd3c5167e3f..0000000000000000000000000000000000000000 --- a/checkpoint-3000/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a87a133eb5ec5af0878395bc45e179834b11224819f981211f70acdd015060b -size 17645743048 diff --git a/checkpoint-3000/optimizer.bin b/checkpoint-3000/optimizer.bin deleted file mode 100644 index 18574ad6580a2815e85a104eea5910c353aaf5dc..0000000000000000000000000000000000000000 --- a/checkpoint-3000/optimizer.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ff8e5977667fc938b297528391c931889487050b2acf34a78a42a820912cd38 -size 32180124005 diff --git a/checkpoint-3000/pytorch_model_fsdp.bin b/checkpoint-3000/pytorch_model_fsdp.bin deleted file mode 100644 index 798e41cb07595e0af0eea0bc21a9c2bdffb4914c..0000000000000000000000000000000000000000 --- a/checkpoint-3000/pytorch_model_fsdp.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3023a52ce183c0d2cddf839ebf937f5047e153db9c651eb9f295b9a386e6b589 -size 17645897996 diff --git a/checkpoint-3000/rng_state_0.pth b/checkpoint-3000/rng_state_0.pth deleted file mode 100644 index 5379ca97bc0c62d226d0fc37920d4937a7bb8b43..0000000000000000000000000000000000000000 --- a/checkpoint-3000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994 -size 14917 diff --git a/checkpoint-3000/rng_state_1.pth b/checkpoint-3000/rng_state_1.pth deleted file mode 100644 index 662ad0d5b30369c825f66c080779973608c5058e..0000000000000000000000000000000000000000 --- a/checkpoint-3000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce -size 14917 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt deleted file mode 100644 index 58a045115b7b529e69edb60002fbf90b0935a577..0000000000000000000000000000000000000000 --- a/checkpoint-3000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b07c9eca675fb8c47d0c01728c4ef879c66a752ffdace85e7e9feac32b48ac4b -size 1465 diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json deleted file mode 100644 index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000 --- a/checkpoint-3000/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 -size 11422650 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json deleted file mode 100644 index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000 --- a/checkpoint-3000/tokenizer_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "add_prefix_space": false, - "backend": "tokenizers", - "bos_token": null, - "clean_up_tokenization_spaces": false, - "eos_token": "<|im_end|>", - "errors": "replace", - "extra_special_tokens": [ - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>" - ], - "is_local": false, - "model_max_length": 1010000, - "pad_token": "<|fim_pad|>", - "split_special_tokens": false, - "tokenizer_class": "Qwen2Tokenizer", - "unk_token": null -} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json deleted file mode 100644 index 666130f045326cc7a4d60f3405606f5f0040b4a4..0000000000000000000000000000000000000000 --- a/checkpoint-3000/trainer_state.json +++ /dev/null @@ -1,27034 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.2796352583586628, - "eval_steps": 500, - "global_step": 3000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0007598784194528875, - "grad_norm": 11.767926216125488, - "learning_rate": 0.0, - "loss": 0.7937269806861877, - "mean_token_accuracy": 0.7822731137275696, - "num_tokens": 10507.0, - "step": 1 - }, - { - "epoch": 0.001519756838905775, - "grad_norm": 14.9199800491333, - "learning_rate": 2.5252525252525256e-08, - "loss": 0.7665389776229858, - "mean_token_accuracy": 0.8342233300209045, - "num_tokens": 14806.0, - "step": 2 - }, - { - "epoch": 0.0022796352583586625, - "grad_norm": 11.991217613220215, - "learning_rate": 5.050505050505051e-08, - "loss": 0.9597002267837524, - "mean_token_accuracy": 0.7054992318153381, - "num_tokens": 27170.0, - "step": 3 - }, - { - "epoch": 0.00303951367781155, - "grad_norm": 12.958333015441895, - "learning_rate": 7.575757575757576e-08, - "loss": 0.9971482753753662, - "mean_token_accuracy": 0.7261134386062622, - "num_tokens": 33729.0, - "step": 4 - }, - { - "epoch": 0.003799392097264438, - "grad_norm": 13.5665283203125, - "learning_rate": 1.0101010101010103e-07, - "loss": 0.9504883885383606, - "mean_token_accuracy": 0.745307445526123, - "num_tokens": 41174.0, - "step": 5 - }, - { - "epoch": 0.004559270516717325, - "grad_norm": 10.09444808959961, - "learning_rate": 1.2626262626262626e-07, - "loss": 0.759548008441925, - "mean_token_accuracy": 0.7842121124267578, - "num_tokens": 47943.0, - "step": 6 - }, - { - "epoch": 0.005319148936170213, - "grad_norm": 10.741650581359863, - "learning_rate": 1.5151515151515152e-07, - "loss": 0.8231598138809204, - "mean_token_accuracy": 0.7550969123840332, - "num_tokens": 56665.0, - "step": 7 - }, - { - "epoch": 0.0060790273556231, - "grad_norm": 12.250170707702637, - "learning_rate": 1.767676767676768e-07, - "loss": 0.8576581478118896, - "mean_token_accuracy": 0.7568671703338623, - "num_tokens": 67606.0, - "step": 8 - }, - { - "epoch": 0.006838905775075988, - "grad_norm": 12.828629493713379, - "learning_rate": 2.0202020202020205e-07, - "loss": 0.9886435866355896, - "mean_token_accuracy": 0.733400285243988, - "num_tokens": 74272.0, - "step": 9 - }, - { - "epoch": 0.007598784194528876, - "grad_norm": 15.966923713684082, - "learning_rate": 2.2727272727272729e-07, - "loss": 1.064985990524292, - "mean_token_accuracy": 0.7101132869720459, - "num_tokens": 80524.0, - "step": 10 - }, - { - "epoch": 0.008358662613981762, - "grad_norm": 10.864850044250488, - "learning_rate": 2.525252525252525e-07, - "loss": 0.8311550617218018, - "mean_token_accuracy": 0.7431639432907104, - "num_tokens": 96292.0, - "step": 11 - }, - { - "epoch": 0.00911854103343465, - "grad_norm": 16.438785552978516, - "learning_rate": 2.7777777777777776e-07, - "loss": 1.0579866170883179, - "mean_token_accuracy": 0.7222976684570312, - "num_tokens": 102992.0, - "step": 12 - }, - { - "epoch": 0.009878419452887538, - "grad_norm": 11.179214477539062, - "learning_rate": 3.0303030303030305e-07, - "loss": 0.9816144704818726, - "mean_token_accuracy": 0.7206371426582336, - "num_tokens": 113571.0, - "step": 13 - }, - { - "epoch": 0.010638297872340425, - "grad_norm": 12.780299186706543, - "learning_rate": 3.2828282828282834e-07, - "loss": 0.847449004650116, - "mean_token_accuracy": 0.7826199531555176, - "num_tokens": 119568.0, - "step": 14 - }, - { - "epoch": 0.011398176291793313, - "grad_norm": 14.800421714782715, - "learning_rate": 3.535353535353536e-07, - "loss": 0.9275516271591187, - "mean_token_accuracy": 0.7655045986175537, - "num_tokens": 126258.0, - "step": 15 - }, - { - "epoch": 0.0121580547112462, - "grad_norm": 11.267602920532227, - "learning_rate": 3.787878787878788e-07, - "loss": 0.8464037179946899, - "mean_token_accuracy": 0.7606508731842041, - "num_tokens": 136831.0, - "step": 16 - }, - { - "epoch": 0.012917933130699088, - "grad_norm": 12.891013145446777, - "learning_rate": 4.040404040404041e-07, - "loss": 0.9903074502944946, - "mean_token_accuracy": 0.7247487306594849, - "num_tokens": 150434.0, - "step": 17 - }, - { - "epoch": 0.013677811550151976, - "grad_norm": 11.13957691192627, - "learning_rate": 4.2929292929292934e-07, - "loss": 0.8287211656570435, - "mean_token_accuracy": 0.7621913552284241, - "num_tokens": 158516.0, - "step": 18 - }, - { - "epoch": 0.014437689969604863, - "grad_norm": 18.39569664001465, - "learning_rate": 4.5454545454545457e-07, - "loss": 1.150015115737915, - "mean_token_accuracy": 0.7349498271942139, - "num_tokens": 162214.0, - "step": 19 - }, - { - "epoch": 0.015197568389057751, - "grad_norm": 9.353750228881836, - "learning_rate": 4.797979797979798e-07, - "loss": 0.7228299379348755, - "mean_token_accuracy": 0.7969573736190796, - "num_tokens": 173035.0, - "step": 20 - }, - { - "epoch": 0.015957446808510637, - "grad_norm": 8.267163276672363, - "learning_rate": 5.05050505050505e-07, - "loss": 0.7358136177062988, - "mean_token_accuracy": 0.7903937101364136, - "num_tokens": 183568.0, - "step": 21 - }, - { - "epoch": 0.016717325227963525, - "grad_norm": 11.137128829956055, - "learning_rate": 5.303030303030304e-07, - "loss": 1.0075397491455078, - "mean_token_accuracy": 0.702807605266571, - "num_tokens": 192759.0, - "step": 22 - }, - { - "epoch": 0.017477203647416412, - "grad_norm": 10.734103202819824, - "learning_rate": 5.555555555555555e-07, - "loss": 0.8925919532775879, - "mean_token_accuracy": 0.7475671768188477, - "num_tokens": 201280.0, - "step": 23 - }, - { - "epoch": 0.0182370820668693, - "grad_norm": 11.945566177368164, - "learning_rate": 5.808080808080809e-07, - "loss": 0.7260514497756958, - "mean_token_accuracy": 0.7859152555465698, - "num_tokens": 218053.0, - "step": 24 - }, - { - "epoch": 0.018996960486322188, - "grad_norm": 18.610652923583984, - "learning_rate": 6.060606060606061e-07, - "loss": 0.8995465636253357, - "mean_token_accuracy": 0.7931990623474121, - "num_tokens": 220953.0, - "step": 25 - }, - { - "epoch": 0.019756838905775075, - "grad_norm": 10.51898193359375, - "learning_rate": 6.313131313131314e-07, - "loss": 0.9532671570777893, - "mean_token_accuracy": 0.7257645726203918, - "num_tokens": 231200.0, - "step": 26 - }, - { - "epoch": 0.020516717325227963, - "grad_norm": 9.581812858581543, - "learning_rate": 6.565656565656567e-07, - "loss": 0.9038010239601135, - "mean_token_accuracy": 0.7390379905700684, - "num_tokens": 237711.0, - "step": 27 - }, - { - "epoch": 0.02127659574468085, - "grad_norm": 12.297484397888184, - "learning_rate": 6.818181818181818e-07, - "loss": 1.048936367034912, - "mean_token_accuracy": 0.7175670862197876, - "num_tokens": 242503.0, - "step": 28 - }, - { - "epoch": 0.022036474164133738, - "grad_norm": 7.437953472137451, - "learning_rate": 7.070707070707071e-07, - "loss": 0.8308826684951782, - "mean_token_accuracy": 0.7415335774421692, - "num_tokens": 250842.0, - "step": 29 - }, - { - "epoch": 0.022796352583586626, - "grad_norm": 6.134475231170654, - "learning_rate": 7.323232323232324e-07, - "loss": 0.647913932800293, - "mean_token_accuracy": 0.8124054670333862, - "num_tokens": 267453.0, - "step": 30 - }, - { - "epoch": 0.023556231003039513, - "grad_norm": 6.678966045379639, - "learning_rate": 7.575757575757576e-07, - "loss": 0.7052810192108154, - "mean_token_accuracy": 0.7908754348754883, - "num_tokens": 284416.0, - "step": 31 - }, - { - "epoch": 0.0243161094224924, - "grad_norm": 7.42232084274292, - "learning_rate": 7.82828282828283e-07, - "loss": 1.022383213043213, - "mean_token_accuracy": 0.7053230404853821, - "num_tokens": 292073.0, - "step": 32 - }, - { - "epoch": 0.02507598784194529, - "grad_norm": 6.463219165802002, - "learning_rate": 8.080808080808082e-07, - "loss": 0.7603012323379517, - "mean_token_accuracy": 0.7728140354156494, - "num_tokens": 298550.0, - "step": 33 - }, - { - "epoch": 0.025835866261398176, - "grad_norm": 5.668411731719971, - "learning_rate": 8.333333333333333e-07, - "loss": 0.7707852721214294, - "mean_token_accuracy": 0.7827773094177246, - "num_tokens": 306683.0, - "step": 34 - }, - { - "epoch": 0.026595744680851064, - "grad_norm": 4.984964847564697, - "learning_rate": 8.585858585858587e-07, - "loss": 0.6317349672317505, - "mean_token_accuracy": 0.8106861114501953, - "num_tokens": 318842.0, - "step": 35 - }, - { - "epoch": 0.02735562310030395, - "grad_norm": 4.421732425689697, - "learning_rate": 8.838383838383839e-07, - "loss": 0.6228617429733276, - "mean_token_accuracy": 0.8023355603218079, - "num_tokens": 329850.0, - "step": 36 - }, - { - "epoch": 0.02811550151975684, - "grad_norm": 5.970808029174805, - "learning_rate": 9.090909090909091e-07, - "loss": 0.8443238139152527, - "mean_token_accuracy": 0.7462409734725952, - "num_tokens": 335844.0, - "step": 37 - }, - { - "epoch": 0.028875379939209727, - "grad_norm": 4.5389084815979, - "learning_rate": 9.343434343434345e-07, - "loss": 0.6976436376571655, - "mean_token_accuracy": 0.790410041809082, - "num_tokens": 348768.0, - "step": 38 - }, - { - "epoch": 0.029635258358662615, - "grad_norm": 4.116631507873535, - "learning_rate": 9.595959595959596e-07, - "loss": 0.6698519587516785, - "mean_token_accuracy": 0.7818127870559692, - "num_tokens": 355460.0, - "step": 39 - }, - { - "epoch": 0.030395136778115502, - "grad_norm": 3.3714773654937744, - "learning_rate": 9.84848484848485e-07, - "loss": 0.5723201036453247, - "mean_token_accuracy": 0.8100086450576782, - "num_tokens": 368507.0, - "step": 40 - }, - { - "epoch": 0.03115501519756839, - "grad_norm": 4.4438347816467285, - "learning_rate": 1.01010101010101e-06, - "loss": 0.7508786916732788, - "mean_token_accuracy": 0.7711942791938782, - "num_tokens": 376467.0, - "step": 41 - }, - { - "epoch": 0.031914893617021274, - "grad_norm": 5.609974384307861, - "learning_rate": 1.0353535353535354e-06, - "loss": 0.566256046295166, - "mean_token_accuracy": 0.8319284319877625, - "num_tokens": 381399.0, - "step": 42 - }, - { - "epoch": 0.03267477203647416, - "grad_norm": 5.124386787414551, - "learning_rate": 1.0606060606060608e-06, - "loss": 0.8151067495346069, - "mean_token_accuracy": 0.7537785768508911, - "num_tokens": 387389.0, - "step": 43 - }, - { - "epoch": 0.03343465045592705, - "grad_norm": 3.6318116188049316, - "learning_rate": 1.085858585858586e-06, - "loss": 0.5989949107170105, - "mean_token_accuracy": 0.8129256963729858, - "num_tokens": 395302.0, - "step": 44 - }, - { - "epoch": 0.03419452887537994, - "grad_norm": 2.694424629211426, - "learning_rate": 1.111111111111111e-06, - "loss": 0.5831396579742432, - "mean_token_accuracy": 0.8056820631027222, - "num_tokens": 409920.0, - "step": 45 - }, - { - "epoch": 0.034954407294832825, - "grad_norm": 2.2949178218841553, - "learning_rate": 1.1363636363636364e-06, - "loss": 0.472550630569458, - "mean_token_accuracy": 0.8343006372451782, - "num_tokens": 428323.0, - "step": 46 - }, - { - "epoch": 0.03571428571428571, - "grad_norm": 3.3930575847625732, - "learning_rate": 1.1616161616161617e-06, - "loss": 0.6246505379676819, - "mean_token_accuracy": 0.783149003982544, - "num_tokens": 435889.0, - "step": 47 - }, - { - "epoch": 0.0364741641337386, - "grad_norm": 3.692598819732666, - "learning_rate": 1.186868686868687e-06, - "loss": 0.46132946014404297, - "mean_token_accuracy": 0.8583089113235474, - "num_tokens": 441192.0, - "step": 48 - }, - { - "epoch": 0.03723404255319149, - "grad_norm": 6.571533203125, - "learning_rate": 1.2121212121212122e-06, - "loss": 0.9351121783256531, - "mean_token_accuracy": 0.7580878734588623, - "num_tokens": 444277.0, - "step": 49 - }, - { - "epoch": 0.037993920972644375, - "grad_norm": 5.029570579528809, - "learning_rate": 1.2373737373737375e-06, - "loss": 0.6921554803848267, - "mean_token_accuracy": 0.8131166100502014, - "num_tokens": 447646.0, - "step": 50 - }, - { - "epoch": 0.03875379939209726, - "grad_norm": 2.9174208641052246, - "learning_rate": 1.2626262626262629e-06, - "loss": 0.591706395149231, - "mean_token_accuracy": 0.8108617067337036, - "num_tokens": 461397.0, - "step": 51 - }, - { - "epoch": 0.03951367781155015, - "grad_norm": 4.315536022186279, - "learning_rate": 1.287878787878788e-06, - "loss": 0.6986310482025146, - "mean_token_accuracy": 0.7710754871368408, - "num_tokens": 472047.0, - "step": 52 - }, - { - "epoch": 0.04027355623100304, - "grad_norm": 2.6216275691986084, - "learning_rate": 1.3131313131313134e-06, - "loss": 0.5553690791130066, - "mean_token_accuracy": 0.8167896866798401, - "num_tokens": 482795.0, - "step": 53 - }, - { - "epoch": 0.041033434650455926, - "grad_norm": 3.0562477111816406, - "learning_rate": 1.3383838383838385e-06, - "loss": 0.6909202337265015, - "mean_token_accuracy": 0.7859863638877869, - "num_tokens": 494818.0, - "step": 54 - }, - { - "epoch": 0.04179331306990881, - "grad_norm": 2.1420412063598633, - "learning_rate": 1.3636363636363636e-06, - "loss": 0.5415265560150146, - "mean_token_accuracy": 0.818886399269104, - "num_tokens": 513695.0, - "step": 55 - }, - { - "epoch": 0.0425531914893617, - "grad_norm": 2.9610488414764404, - "learning_rate": 1.3888888888888892e-06, - "loss": 0.6602212190628052, - "mean_token_accuracy": 0.7830734252929688, - "num_tokens": 523784.0, - "step": 56 - }, - { - "epoch": 0.04331306990881459, - "grad_norm": 2.511972665786743, - "learning_rate": 1.4141414141414143e-06, - "loss": 0.5717809796333313, - "mean_token_accuracy": 0.8053616285324097, - "num_tokens": 546308.0, - "step": 57 - }, - { - "epoch": 0.044072948328267476, - "grad_norm": 3.52642822265625, - "learning_rate": 1.4393939393939396e-06, - "loss": 0.6242594718933105, - "mean_token_accuracy": 0.8162082433700562, - "num_tokens": 552019.0, - "step": 58 - }, - { - "epoch": 0.044832826747720364, - "grad_norm": 3.02362322807312, - "learning_rate": 1.4646464646464648e-06, - "loss": 0.6634255647659302, - "mean_token_accuracy": 0.7682032585144043, - "num_tokens": 560009.0, - "step": 59 - }, - { - "epoch": 0.04559270516717325, - "grad_norm": 2.3910107612609863, - "learning_rate": 1.48989898989899e-06, - "loss": 0.5519146919250488, - "mean_token_accuracy": 0.8270269632339478, - "num_tokens": 571005.0, - "step": 60 - }, - { - "epoch": 0.04635258358662614, - "grad_norm": 4.28154993057251, - "learning_rate": 1.5151515151515152e-06, - "loss": 0.7437789440155029, - "mean_token_accuracy": 0.7782418131828308, - "num_tokens": 574950.0, - "step": 61 - }, - { - "epoch": 0.04711246200607903, - "grad_norm": 3.4078686237335205, - "learning_rate": 1.5404040404040404e-06, - "loss": 0.6345915198326111, - "mean_token_accuracy": 0.7903392314910889, - "num_tokens": 581657.0, - "step": 62 - }, - { - "epoch": 0.047872340425531915, - "grad_norm": 2.6834158897399902, - "learning_rate": 1.565656565656566e-06, - "loss": 0.5981127023696899, - "mean_token_accuracy": 0.7911489605903625, - "num_tokens": 591267.0, - "step": 63 - }, - { - "epoch": 0.0486322188449848, - "grad_norm": 2.1054461002349854, - "learning_rate": 1.590909090909091e-06, - "loss": 0.5523523688316345, - "mean_token_accuracy": 0.8194501399993896, - "num_tokens": 606787.0, - "step": 64 - }, - { - "epoch": 0.04939209726443769, - "grad_norm": 3.322596788406372, - "learning_rate": 1.6161616161616164e-06, - "loss": 0.48417025804519653, - "mean_token_accuracy": 0.8293706178665161, - "num_tokens": 611068.0, - "step": 65 - }, - { - "epoch": 0.05015197568389058, - "grad_norm": 2.302450180053711, - "learning_rate": 1.6414141414141415e-06, - "loss": 0.6498389840126038, - "mean_token_accuracy": 0.7728497385978699, - "num_tokens": 624452.0, - "step": 66 - }, - { - "epoch": 0.050911854103343465, - "grad_norm": 2.680191993713379, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.6347037553787231, - "mean_token_accuracy": 0.8108306527137756, - "num_tokens": 638049.0, - "step": 67 - }, - { - "epoch": 0.05167173252279635, - "grad_norm": 3.0297021865844727, - "learning_rate": 1.6919191919191922e-06, - "loss": 0.5344363451004028, - "mean_token_accuracy": 0.8113535046577454, - "num_tokens": 643892.0, - "step": 68 - }, - { - "epoch": 0.05243161094224924, - "grad_norm": 2.9283676147460938, - "learning_rate": 1.7171717171717173e-06, - "loss": 0.6999260187149048, - "mean_token_accuracy": 0.7782022356987, - "num_tokens": 654418.0, - "step": 69 - }, - { - "epoch": 0.05319148936170213, - "grad_norm": 3.4098572731018066, - "learning_rate": 1.7424242424242427e-06, - "loss": 0.6508946418762207, - "mean_token_accuracy": 0.7942900657653809, - "num_tokens": 659837.0, - "step": 70 - }, - { - "epoch": 0.053951367781155016, - "grad_norm": 2.6756019592285156, - "learning_rate": 1.7676767676767678e-06, - "loss": 0.603486180305481, - "mean_token_accuracy": 0.8015457391738892, - "num_tokens": 668361.0, - "step": 71 - }, - { - "epoch": 0.0547112462006079, - "grad_norm": 2.2630293369293213, - "learning_rate": 1.792929292929293e-06, - "loss": 0.6608274579048157, - "mean_token_accuracy": 0.7753809690475464, - "num_tokens": 679025.0, - "step": 72 - }, - { - "epoch": 0.05547112462006079, - "grad_norm": 2.123962879180908, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.4525482654571533, - "mean_token_accuracy": 0.8425612449645996, - "num_tokens": 688574.0, - "step": 73 - }, - { - "epoch": 0.05623100303951368, - "grad_norm": 7.90519905090332, - "learning_rate": 1.8434343434343434e-06, - "loss": 0.6507195830345154, - "mean_token_accuracy": 0.7714964151382446, - "num_tokens": 694534.0, - "step": 74 - }, - { - "epoch": 0.056990881458966566, - "grad_norm": 2.372203826904297, - "learning_rate": 1.868686868686869e-06, - "loss": 0.4458143413066864, - "mean_token_accuracy": 0.7991449236869812, - "num_tokens": 703114.0, - "step": 75 - }, - { - "epoch": 0.057750759878419454, - "grad_norm": 2.918677568435669, - "learning_rate": 1.8939393939393941e-06, - "loss": 0.5614339113235474, - "mean_token_accuracy": 0.8211464881896973, - "num_tokens": 709038.0, - "step": 76 - }, - { - "epoch": 0.05851063829787234, - "grad_norm": 1.6106709241867065, - "learning_rate": 1.9191919191919192e-06, - "loss": 0.5802098512649536, - "mean_token_accuracy": 0.8055065870285034, - "num_tokens": 730482.0, - "step": 77 - }, - { - "epoch": 0.05927051671732523, - "grad_norm": 2.8069989681243896, - "learning_rate": 1.944444444444445e-06, - "loss": 0.5709059238433838, - "mean_token_accuracy": 0.8024872541427612, - "num_tokens": 751817.0, - "step": 78 - }, - { - "epoch": 0.06003039513677812, - "grad_norm": 2.641667127609253, - "learning_rate": 1.96969696969697e-06, - "loss": 0.6480152606964111, - "mean_token_accuracy": 0.7912271618843079, - "num_tokens": 759236.0, - "step": 79 - }, - { - "epoch": 0.060790273556231005, - "grad_norm": 2.6034350395202637, - "learning_rate": 1.994949494949495e-06, - "loss": 0.5535176396369934, - "mean_token_accuracy": 0.7980542778968811, - "num_tokens": 766496.0, - "step": 80 - }, - { - "epoch": 0.06155015197568389, - "grad_norm": 1.7095069885253906, - "learning_rate": 2.02020202020202e-06, - "loss": 0.4545496106147766, - "mean_token_accuracy": 0.8229660391807556, - "num_tokens": 780124.0, - "step": 81 - }, - { - "epoch": 0.06231003039513678, - "grad_norm": 3.788830518722534, - "learning_rate": 2.0454545454545457e-06, - "loss": 0.6679391264915466, - "mean_token_accuracy": 0.7942397594451904, - "num_tokens": 784555.0, - "step": 82 - }, - { - "epoch": 0.06306990881458967, - "grad_norm": 2.009831666946411, - "learning_rate": 2.070707070707071e-06, - "loss": 0.5067101120948792, - "mean_token_accuracy": 0.8276634216308594, - "num_tokens": 797459.0, - "step": 83 - }, - { - "epoch": 0.06382978723404255, - "grad_norm": 2.201627731323242, - "learning_rate": 2.095959595959596e-06, - "loss": 0.5012127161026001, - "mean_token_accuracy": 0.8432504534721375, - "num_tokens": 810817.0, - "step": 84 - }, - { - "epoch": 0.06458966565349544, - "grad_norm": 2.492568016052246, - "learning_rate": 2.1212121212121216e-06, - "loss": 0.6142797470092773, - "mean_token_accuracy": 0.8338661193847656, - "num_tokens": 818191.0, - "step": 85 - }, - { - "epoch": 0.06534954407294832, - "grad_norm": 2.8360862731933594, - "learning_rate": 2.1464646464646467e-06, - "loss": 0.5569300651550293, - "mean_token_accuracy": 0.8121030330657959, - "num_tokens": 825325.0, - "step": 86 - }, - { - "epoch": 0.06610942249240122, - "grad_norm": 2.407548427581787, - "learning_rate": 2.171717171717172e-06, - "loss": 0.6442930102348328, - "mean_token_accuracy": 0.792514443397522, - "num_tokens": 834439.0, - "step": 87 - }, - { - "epoch": 0.0668693009118541, - "grad_norm": 2.340728759765625, - "learning_rate": 2.196969696969697e-06, - "loss": 0.6494365930557251, - "mean_token_accuracy": 0.7746615409851074, - "num_tokens": 843078.0, - "step": 88 - }, - { - "epoch": 0.067629179331307, - "grad_norm": 1.7703697681427002, - "learning_rate": 2.222222222222222e-06, - "loss": 0.598991870880127, - "mean_token_accuracy": 0.7992157340049744, - "num_tokens": 860171.0, - "step": 89 - }, - { - "epoch": 0.06838905775075987, - "grad_norm": 2.5779271125793457, - "learning_rate": 2.2474747474747476e-06, - "loss": 0.5693082809448242, - "mean_token_accuracy": 0.8093700408935547, - "num_tokens": 866669.0, - "step": 90 - }, - { - "epoch": 0.06914893617021277, - "grad_norm": 2.014092206954956, - "learning_rate": 2.2727272727272728e-06, - "loss": 0.5346695780754089, - "mean_token_accuracy": 0.8165590763092041, - "num_tokens": 876698.0, - "step": 91 - }, - { - "epoch": 0.06990881458966565, - "grad_norm": 1.7555919885635376, - "learning_rate": 2.2979797979797983e-06, - "loss": 0.5321458578109741, - "mean_token_accuracy": 0.8166656494140625, - "num_tokens": 889488.0, - "step": 92 - }, - { - "epoch": 0.07066869300911854, - "grad_norm": 1.8631824254989624, - "learning_rate": 2.3232323232323234e-06, - "loss": 0.5246532559394836, - "mean_token_accuracy": 0.8088107705116272, - "num_tokens": 901322.0, - "step": 93 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 3.2332139015197754, - "learning_rate": 2.348484848484849e-06, - "loss": 0.5141711235046387, - "mean_token_accuracy": 0.8382217884063721, - "num_tokens": 905792.0, - "step": 94 - }, - { - "epoch": 0.07218844984802432, - "grad_norm": 1.7806555032730103, - "learning_rate": 2.373737373737374e-06, - "loss": 0.5233149528503418, - "mean_token_accuracy": 0.8101529479026794, - "num_tokens": 917320.0, - "step": 95 - }, - { - "epoch": 0.0729483282674772, - "grad_norm": 1.8169859647750854, - "learning_rate": 2.3989898989898993e-06, - "loss": 0.578881561756134, - "mean_token_accuracy": 0.8044873476028442, - "num_tokens": 931062.0, - "step": 96 - }, - { - "epoch": 0.0737082066869301, - "grad_norm": 4.677402496337891, - "learning_rate": 2.4242424242424244e-06, - "loss": 0.7842556238174438, - "mean_token_accuracy": 0.7579764127731323, - "num_tokens": 934712.0, - "step": 97 - }, - { - "epoch": 0.07446808510638298, - "grad_norm": 2.6987264156341553, - "learning_rate": 2.4494949494949495e-06, - "loss": 0.5669287443161011, - "mean_token_accuracy": 0.8186933994293213, - "num_tokens": 941058.0, - "step": 98 - }, - { - "epoch": 0.07522796352583587, - "grad_norm": 1.6906023025512695, - "learning_rate": 2.474747474747475e-06, - "loss": 0.4976363778114319, - "mean_token_accuracy": 0.8198553323745728, - "num_tokens": 956509.0, - "step": 99 - }, - { - "epoch": 0.07598784194528875, - "grad_norm": 2.7256152629852295, - "learning_rate": 2.5e-06, - "loss": 0.7138420343399048, - "mean_token_accuracy": 0.7752805948257446, - "num_tokens": 963920.0, - "step": 100 - }, - { - "epoch": 0.07674772036474165, - "grad_norm": 2.174870491027832, - "learning_rate": 2.5252525252525258e-06, - "loss": 0.6733541488647461, - "mean_token_accuracy": 0.7745175361633301, - "num_tokens": 975268.0, - "step": 101 - }, - { - "epoch": 0.07750759878419453, - "grad_norm": 1.5587213039398193, - "learning_rate": 2.5505050505050505e-06, - "loss": 0.44223445653915405, - "mean_token_accuracy": 0.8278359174728394, - "num_tokens": 991837.0, - "step": 102 - }, - { - "epoch": 0.07826747720364742, - "grad_norm": 2.181840658187866, - "learning_rate": 2.575757575757576e-06, - "loss": 0.625128448009491, - "mean_token_accuracy": 0.7941786050796509, - "num_tokens": 1004325.0, - "step": 103 - }, - { - "epoch": 0.0790273556231003, - "grad_norm": 1.4986687898635864, - "learning_rate": 2.601010101010101e-06, - "loss": 0.39262527227401733, - "mean_token_accuracy": 0.8412648439407349, - "num_tokens": 1018331.0, - "step": 104 - }, - { - "epoch": 0.0797872340425532, - "grad_norm": 2.3416061401367188, - "learning_rate": 2.6262626262626267e-06, - "loss": 0.5495132803916931, - "mean_token_accuracy": 0.8193322420120239, - "num_tokens": 1026090.0, - "step": 105 - }, - { - "epoch": 0.08054711246200608, - "grad_norm": 3.8168859481811523, - "learning_rate": 2.6515151515151514e-06, - "loss": 0.4898706376552582, - "mean_token_accuracy": 0.8467956185340881, - "num_tokens": 1029955.0, - "step": 106 - }, - { - "epoch": 0.08130699088145897, - "grad_norm": 4.113908767700195, - "learning_rate": 2.676767676767677e-06, - "loss": 0.6189584732055664, - "mean_token_accuracy": 0.8019394278526306, - "num_tokens": 1033598.0, - "step": 107 - }, - { - "epoch": 0.08206686930091185, - "grad_norm": 2.50003981590271, - "learning_rate": 2.7020202020202025e-06, - "loss": 0.6479471921920776, - "mean_token_accuracy": 0.7790026664733887, - "num_tokens": 1042533.0, - "step": 108 - }, - { - "epoch": 0.08282674772036475, - "grad_norm": 1.408934473991394, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.3909248113632202, - "mean_token_accuracy": 0.8477586507797241, - "num_tokens": 1061755.0, - "step": 109 - }, - { - "epoch": 0.08358662613981763, - "grad_norm": 3.360633611679077, - "learning_rate": 2.7525252525252528e-06, - "loss": 0.6952459812164307, - "mean_token_accuracy": 0.777535080909729, - "num_tokens": 1067316.0, - "step": 110 - }, - { - "epoch": 0.08434650455927052, - "grad_norm": 1.8631696701049805, - "learning_rate": 2.7777777777777783e-06, - "loss": 0.5420593023300171, - "mean_token_accuracy": 0.8157662749290466, - "num_tokens": 1079930.0, - "step": 111 - }, - { - "epoch": 0.0851063829787234, - "grad_norm": 2.4308314323425293, - "learning_rate": 2.803030303030303e-06, - "loss": 0.5863882303237915, - "mean_token_accuracy": 0.8206346035003662, - "num_tokens": 1088069.0, - "step": 112 - }, - { - "epoch": 0.0858662613981763, - "grad_norm": 2.922808885574341, - "learning_rate": 2.8282828282828286e-06, - "loss": 0.5217319130897522, - "mean_token_accuracy": 0.8253234028816223, - "num_tokens": 1093607.0, - "step": 113 - }, - { - "epoch": 0.08662613981762918, - "grad_norm": 2.3596107959747314, - "learning_rate": 2.8535353535353537e-06, - "loss": 0.5070714950561523, - "mean_token_accuracy": 0.8258323669433594, - "num_tokens": 1100405.0, - "step": 114 - }, - { - "epoch": 0.08738601823708207, - "grad_norm": 3.0853066444396973, - "learning_rate": 2.8787878787878793e-06, - "loss": 0.591964840888977, - "mean_token_accuracy": 0.8047322630882263, - "num_tokens": 1107535.0, - "step": 115 - }, - { - "epoch": 0.08814589665653495, - "grad_norm": 1.9251092672348022, - "learning_rate": 2.904040404040404e-06, - "loss": 0.5226191878318787, - "mean_token_accuracy": 0.8022720217704773, - "num_tokens": 1118716.0, - "step": 116 - }, - { - "epoch": 0.08890577507598785, - "grad_norm": 1.9692988395690918, - "learning_rate": 2.9292929292929295e-06, - "loss": 0.5462069511413574, - "mean_token_accuracy": 0.8157015442848206, - "num_tokens": 1131917.0, - "step": 117 - }, - { - "epoch": 0.08966565349544073, - "grad_norm": 1.4738909006118774, - "learning_rate": 2.954545454545455e-06, - "loss": 0.4564219117164612, - "mean_token_accuracy": 0.849632978439331, - "num_tokens": 1148534.0, - "step": 118 - }, - { - "epoch": 0.09042553191489362, - "grad_norm": 2.72646164894104, - "learning_rate": 2.97979797979798e-06, - "loss": 0.6654808521270752, - "mean_token_accuracy": 0.7752684354782104, - "num_tokens": 1155438.0, - "step": 119 - }, - { - "epoch": 0.0911854103343465, - "grad_norm": 2.7843852043151855, - "learning_rate": 3.0050505050505054e-06, - "loss": 0.5354680418968201, - "mean_token_accuracy": 0.8196378946304321, - "num_tokens": 1161815.0, - "step": 120 - }, - { - "epoch": 0.0919452887537994, - "grad_norm": 2.8052573204040527, - "learning_rate": 3.0303030303030305e-06, - "loss": 0.6366757154464722, - "mean_token_accuracy": 0.7967483997344971, - "num_tokens": 1168295.0, - "step": 121 - }, - { - "epoch": 0.09270516717325228, - "grad_norm": 2.7462735176086426, - "learning_rate": 3.055555555555556e-06, - "loss": 0.59470534324646, - "mean_token_accuracy": 0.8023771047592163, - "num_tokens": 1174502.0, - "step": 122 - }, - { - "epoch": 0.09346504559270517, - "grad_norm": 2.2743821144104004, - "learning_rate": 3.0808080808080807e-06, - "loss": 0.5720560550689697, - "mean_token_accuracy": 0.8162771463394165, - "num_tokens": 1183615.0, - "step": 123 - }, - { - "epoch": 0.09422492401215805, - "grad_norm": 1.8669533729553223, - "learning_rate": 3.1060606060606063e-06, - "loss": 0.4655378758907318, - "mean_token_accuracy": 0.8360732793807983, - "num_tokens": 1193761.0, - "step": 124 - }, - { - "epoch": 0.09498480243161095, - "grad_norm": 1.7666901350021362, - "learning_rate": 3.131313131313132e-06, - "loss": 0.5524153709411621, - "mean_token_accuracy": 0.8252713680267334, - "num_tokens": 1207870.0, - "step": 125 - }, - { - "epoch": 0.09574468085106383, - "grad_norm": 2.4720070362091064, - "learning_rate": 3.1565656565656566e-06, - "loss": 0.5003011226654053, - "mean_token_accuracy": 0.8491042852401733, - "num_tokens": 1214603.0, - "step": 126 - }, - { - "epoch": 0.09650455927051672, - "grad_norm": 1.6500422954559326, - "learning_rate": 3.181818181818182e-06, - "loss": 0.5137069225311279, - "mean_token_accuracy": 0.8273531198501587, - "num_tokens": 1228717.0, - "step": 127 - }, - { - "epoch": 0.0972644376899696, - "grad_norm": 3.402543067932129, - "learning_rate": 3.2070707070707072e-06, - "loss": 0.708167552947998, - "mean_token_accuracy": 0.7705385684967041, - "num_tokens": 1234361.0, - "step": 128 - }, - { - "epoch": 0.0980243161094225, - "grad_norm": 2.547285795211792, - "learning_rate": 3.232323232323233e-06, - "loss": 0.6020137071609497, - "mean_token_accuracy": 0.7981340289115906, - "num_tokens": 1244169.0, - "step": 129 - }, - { - "epoch": 0.09878419452887538, - "grad_norm": 2.0578792095184326, - "learning_rate": 3.257575757575758e-06, - "loss": 0.4425000250339508, - "mean_token_accuracy": 0.8567807674407959, - "num_tokens": 1252709.0, - "step": 130 - }, - { - "epoch": 0.09954407294832827, - "grad_norm": 1.672614336013794, - "learning_rate": 3.282828282828283e-06, - "loss": 0.4860966205596924, - "mean_token_accuracy": 0.8393139243125916, - "num_tokens": 1265766.0, - "step": 131 - }, - { - "epoch": 0.10030395136778116, - "grad_norm": 3.2560198307037354, - "learning_rate": 3.3080808080808086e-06, - "loss": 0.624736487865448, - "mean_token_accuracy": 0.7875322699546814, - "num_tokens": 1270779.0, - "step": 132 - }, - { - "epoch": 0.10106382978723404, - "grad_norm": 2.4468185901641846, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.5062227249145508, - "mean_token_accuracy": 0.8217229843139648, - "num_tokens": 1277113.0, - "step": 133 - }, - { - "epoch": 0.10182370820668693, - "grad_norm": 2.6371328830718994, - "learning_rate": 3.358585858585859e-06, - "loss": 0.477113276720047, - "mean_token_accuracy": 0.8605583906173706, - "num_tokens": 1282514.0, - "step": 134 - }, - { - "epoch": 0.10258358662613981, - "grad_norm": 2.48421311378479, - "learning_rate": 3.3838383838383844e-06, - "loss": 0.40855684876441956, - "mean_token_accuracy": 0.864548921585083, - "num_tokens": 1287859.0, - "step": 135 - }, - { - "epoch": 0.1033434650455927, - "grad_norm": 1.993099331855774, - "learning_rate": 3.409090909090909e-06, - "loss": 0.5913145542144775, - "mean_token_accuracy": 0.8248485922813416, - "num_tokens": 1301074.0, - "step": 136 - }, - { - "epoch": 0.10410334346504559, - "grad_norm": 3.5947680473327637, - "learning_rate": 3.4343434343434347e-06, - "loss": 0.5028599500656128, - "mean_token_accuracy": 0.8367215394973755, - "num_tokens": 1305219.0, - "step": 137 - }, - { - "epoch": 0.10486322188449848, - "grad_norm": 2.5778582096099854, - "learning_rate": 3.45959595959596e-06, - "loss": 0.5297672748565674, - "mean_token_accuracy": 0.8232187032699585, - "num_tokens": 1312482.0, - "step": 138 - }, - { - "epoch": 0.10562310030395136, - "grad_norm": 1.8961588144302368, - "learning_rate": 3.4848484848484854e-06, - "loss": 0.39954107999801636, - "mean_token_accuracy": 0.8605833053588867, - "num_tokens": 1323404.0, - "step": 139 - }, - { - "epoch": 0.10638297872340426, - "grad_norm": 1.9687960147857666, - "learning_rate": 3.51010101010101e-06, - "loss": 0.48791587352752686, - "mean_token_accuracy": 0.8200347423553467, - "num_tokens": 1333027.0, - "step": 140 - }, - { - "epoch": 0.10714285714285714, - "grad_norm": 2.520242691040039, - "learning_rate": 3.5353535353535356e-06, - "loss": 0.6106002330780029, - "mean_token_accuracy": 0.790692150592804, - "num_tokens": 1340999.0, - "step": 141 - }, - { - "epoch": 0.10790273556231003, - "grad_norm": 3.751617431640625, - "learning_rate": 3.560606060606061e-06, - "loss": 0.48141729831695557, - "mean_token_accuracy": 0.8421382904052734, - "num_tokens": 1344687.0, - "step": 142 - }, - { - "epoch": 0.10866261398176291, - "grad_norm": 2.7101709842681885, - "learning_rate": 3.585858585858586e-06, - "loss": 0.5375241637229919, - "mean_token_accuracy": 0.8061438202857971, - "num_tokens": 1350192.0, - "step": 143 - }, - { - "epoch": 0.1094224924012158, - "grad_norm": 2.583484411239624, - "learning_rate": 3.6111111111111115e-06, - "loss": 0.6492470502853394, - "mean_token_accuracy": 0.7863001823425293, - "num_tokens": 1358148.0, - "step": 144 - }, - { - "epoch": 0.11018237082066869, - "grad_norm": 1.792561650276184, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.48480600118637085, - "mean_token_accuracy": 0.8358709812164307, - "num_tokens": 1369519.0, - "step": 145 - }, - { - "epoch": 0.11094224924012158, - "grad_norm": 2.6480472087860107, - "learning_rate": 3.661616161616162e-06, - "loss": 0.5268933176994324, - "mean_token_accuracy": 0.8214013576507568, - "num_tokens": 1375862.0, - "step": 146 - }, - { - "epoch": 0.11170212765957446, - "grad_norm": 2.3174469470977783, - "learning_rate": 3.686868686868687e-06, - "loss": 0.42517897486686707, - "mean_token_accuracy": 0.8523461222648621, - "num_tokens": 1381546.0, - "step": 147 - }, - { - "epoch": 0.11246200607902736, - "grad_norm": 3.0090949535369873, - "learning_rate": 3.7121212121212124e-06, - "loss": 0.4042336940765381, - "mean_token_accuracy": 0.8670448064804077, - "num_tokens": 1385896.0, - "step": 148 - }, - { - "epoch": 0.11322188449848024, - "grad_norm": 2.4928104877471924, - "learning_rate": 3.737373737373738e-06, - "loss": 0.6498878598213196, - "mean_token_accuracy": 0.7967068552970886, - "num_tokens": 1394169.0, - "step": 149 - }, - { - "epoch": 0.11398176291793313, - "grad_norm": 1.5984913110733032, - "learning_rate": 3.7626262626262627e-06, - "loss": 0.546096920967102, - "mean_token_accuracy": 0.8035850524902344, - "num_tokens": 1408785.0, - "step": 150 - }, - { - "epoch": 0.11474164133738601, - "grad_norm": 2.3663532733917236, - "learning_rate": 3.7878787878787882e-06, - "loss": 0.6111721992492676, - "mean_token_accuracy": 0.8015355467796326, - "num_tokens": 1417510.0, - "step": 151 - }, - { - "epoch": 0.11550151975683891, - "grad_norm": 2.518932819366455, - "learning_rate": 3.8131313131313138e-06, - "loss": 0.5274964570999146, - "mean_token_accuracy": 0.8155480623245239, - "num_tokens": 1424186.0, - "step": 152 - }, - { - "epoch": 0.11626139817629179, - "grad_norm": 2.14353609085083, - "learning_rate": 3.8383838383838385e-06, - "loss": 0.5283297896385193, - "mean_token_accuracy": 0.8275758028030396, - "num_tokens": 1432630.0, - "step": 153 - }, - { - "epoch": 0.11702127659574468, - "grad_norm": 1.8243604898452759, - "learning_rate": 3.863636363636364e-06, - "loss": 0.41854870319366455, - "mean_token_accuracy": 0.8222295045852661, - "num_tokens": 1442691.0, - "step": 154 - }, - { - "epoch": 0.11778115501519756, - "grad_norm": 2.088212251663208, - "learning_rate": 3.88888888888889e-06, - "loss": 0.6062943339347839, - "mean_token_accuracy": 0.8009427785873413, - "num_tokens": 1456890.0, - "step": 155 - }, - { - "epoch": 0.11854103343465046, - "grad_norm": 1.3469511270523071, - "learning_rate": 3.914141414141415e-06, - "loss": 0.4390433728694916, - "mean_token_accuracy": 0.8436295986175537, - "num_tokens": 1475349.0, - "step": 156 - }, - { - "epoch": 0.11930091185410334, - "grad_norm": 3.247023105621338, - "learning_rate": 3.93939393939394e-06, - "loss": 0.6490433216094971, - "mean_token_accuracy": 0.8037861585617065, - "num_tokens": 1479952.0, - "step": 157 - }, - { - "epoch": 0.12006079027355623, - "grad_norm": 2.6610445976257324, - "learning_rate": 3.964646464646465e-06, - "loss": 0.6221826076507568, - "mean_token_accuracy": 0.7848749160766602, - "num_tokens": 1487306.0, - "step": 158 - }, - { - "epoch": 0.12082066869300911, - "grad_norm": 2.3060810565948486, - "learning_rate": 3.98989898989899e-06, - "loss": 0.5052388310432434, - "mean_token_accuracy": 0.8281195759773254, - "num_tokens": 1495367.0, - "step": 159 - }, - { - "epoch": 0.12158054711246201, - "grad_norm": 2.504448652267456, - "learning_rate": 4.015151515151515e-06, - "loss": 0.5005477666854858, - "mean_token_accuracy": 0.8408058881759644, - "num_tokens": 1502069.0, - "step": 160 - }, - { - "epoch": 0.12234042553191489, - "grad_norm": 3.993938446044922, - "learning_rate": 4.04040404040404e-06, - "loss": 0.5569638013839722, - "mean_token_accuracy": 0.8095242977142334, - "num_tokens": 1510224.0, - "step": 161 - }, - { - "epoch": 0.12310030395136778, - "grad_norm": 2.2287683486938477, - "learning_rate": 4.065656565656566e-06, - "loss": 0.524042546749115, - "mean_token_accuracy": 0.8102203607559204, - "num_tokens": 1518364.0, - "step": 162 - }, - { - "epoch": 0.12386018237082067, - "grad_norm": 1.9531738758087158, - "learning_rate": 4.0909090909090915e-06, - "loss": 0.45794573426246643, - "mean_token_accuracy": 0.8560376167297363, - "num_tokens": 1528097.0, - "step": 163 - }, - { - "epoch": 0.12462006079027356, - "grad_norm": 1.5841206312179565, - "learning_rate": 4.116161616161617e-06, - "loss": 0.5420972108840942, - "mean_token_accuracy": 0.8092726469039917, - "num_tokens": 1544119.0, - "step": 164 - }, - { - "epoch": 0.12537993920972645, - "grad_norm": 1.7536218166351318, - "learning_rate": 4.141414141414142e-06, - "loss": 0.554668664932251, - "mean_token_accuracy": 0.8193825483322144, - "num_tokens": 1559140.0, - "step": 165 - }, - { - "epoch": 0.12613981762917933, - "grad_norm": 3.545454740524292, - "learning_rate": 4.166666666666667e-06, - "loss": 0.580947995185852, - "mean_token_accuracy": 0.8286383152008057, - "num_tokens": 1563625.0, - "step": 166 - }, - { - "epoch": 0.12689969604863222, - "grad_norm": 1.6608915328979492, - "learning_rate": 4.191919191919192e-06, - "loss": 0.5523324012756348, - "mean_token_accuracy": 0.8155215978622437, - "num_tokens": 1574945.0, - "step": 167 - }, - { - "epoch": 0.1276595744680851, - "grad_norm": 1.4832708835601807, - "learning_rate": 4.217171717171717e-06, - "loss": 0.5133191347122192, - "mean_token_accuracy": 0.8367571830749512, - "num_tokens": 1595865.0, - "step": 168 - }, - { - "epoch": 0.128419452887538, - "grad_norm": 1.7807520627975464, - "learning_rate": 4.242424242424243e-06, - "loss": 0.5131410360336304, - "mean_token_accuracy": 0.8129367232322693, - "num_tokens": 1608723.0, - "step": 169 - }, - { - "epoch": 0.12917933130699089, - "grad_norm": 2.707569122314453, - "learning_rate": 4.267676767676767e-06, - "loss": 0.6129013299942017, - "mean_token_accuracy": 0.7926048040390015, - "num_tokens": 1616136.0, - "step": 170 - }, - { - "epoch": 0.12993920972644377, - "grad_norm": 2.5831644535064697, - "learning_rate": 4.292929292929293e-06, - "loss": 0.6264227628707886, - "mean_token_accuracy": 0.8074911236763, - "num_tokens": 1624228.0, - "step": 171 - }, - { - "epoch": 0.13069908814589665, - "grad_norm": 3.1124250888824463, - "learning_rate": 4.3181818181818185e-06, - "loss": 0.41763827204704285, - "mean_token_accuracy": 0.8565453290939331, - "num_tokens": 1628098.0, - "step": 172 - }, - { - "epoch": 0.13145896656534956, - "grad_norm": 2.3214211463928223, - "learning_rate": 4.343434343434344e-06, - "loss": 0.421974778175354, - "mean_token_accuracy": 0.8391546010971069, - "num_tokens": 1634950.0, - "step": 173 - }, - { - "epoch": 0.13221884498480244, - "grad_norm": 2.1010327339172363, - "learning_rate": 4.368686868686869e-06, - "loss": 0.5307331681251526, - "mean_token_accuracy": 0.8139588236808777, - "num_tokens": 1644132.0, - "step": 174 - }, - { - "epoch": 0.13297872340425532, - "grad_norm": 2.533612012863159, - "learning_rate": 4.393939393939394e-06, - "loss": 0.5626664161682129, - "mean_token_accuracy": 0.8029808402061462, - "num_tokens": 1651637.0, - "step": 175 - }, - { - "epoch": 0.1337386018237082, - "grad_norm": 1.669508457183838, - "learning_rate": 4.41919191919192e-06, - "loss": 0.5351508259773254, - "mean_token_accuracy": 0.8281655311584473, - "num_tokens": 1666776.0, - "step": 176 - }, - { - "epoch": 0.1344984802431611, - "grad_norm": 1.7579659223556519, - "learning_rate": 4.444444444444444e-06, - "loss": 0.5235031247138977, - "mean_token_accuracy": 0.8143284320831299, - "num_tokens": 1679241.0, - "step": 177 - }, - { - "epoch": 0.135258358662614, - "grad_norm": 3.123563528060913, - "learning_rate": 4.46969696969697e-06, - "loss": 0.43051332235336304, - "mean_token_accuracy": 0.8518186211585999, - "num_tokens": 1683317.0, - "step": 178 - }, - { - "epoch": 0.13601823708206687, - "grad_norm": 2.2411575317382812, - "learning_rate": 4.494949494949495e-06, - "loss": 0.5471380949020386, - "mean_token_accuracy": 0.8267596960067749, - "num_tokens": 1691366.0, - "step": 179 - }, - { - "epoch": 0.13677811550151975, - "grad_norm": 2.621973991394043, - "learning_rate": 4.520202020202021e-06, - "loss": 0.5685839653015137, - "mean_token_accuracy": 0.8260642290115356, - "num_tokens": 1698148.0, - "step": 180 - }, - { - "epoch": 0.13753799392097266, - "grad_norm": 2.1553852558135986, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.5703883171081543, - "mean_token_accuracy": 0.8219090700149536, - "num_tokens": 1707225.0, - "step": 181 - }, - { - "epoch": 0.13829787234042554, - "grad_norm": 5.1767897605896, - "learning_rate": 4.5707070707070715e-06, - "loss": 0.32704639434814453, - "mean_token_accuracy": 0.8754568099975586, - "num_tokens": 1712748.0, - "step": 182 - }, - { - "epoch": 0.13905775075987842, - "grad_norm": 2.609168291091919, - "learning_rate": 4.595959595959597e-06, - "loss": 0.5939987301826477, - "mean_token_accuracy": 0.8034975528717041, - "num_tokens": 1719932.0, - "step": 183 - }, - { - "epoch": 0.1398176291793313, - "grad_norm": 2.2059099674224854, - "learning_rate": 4.621212121212122e-06, - "loss": 0.5310720205307007, - "mean_token_accuracy": 0.8177368640899658, - "num_tokens": 1727640.0, - "step": 184 - }, - { - "epoch": 0.1405775075987842, - "grad_norm": 2.6367759704589844, - "learning_rate": 4.646464646464647e-06, - "loss": 0.522086501121521, - "mean_token_accuracy": 0.826233983039856, - "num_tokens": 1733609.0, - "step": 185 - }, - { - "epoch": 0.1413373860182371, - "grad_norm": 3.326732873916626, - "learning_rate": 4.671717171717172e-06, - "loss": 0.4127829074859619, - "mean_token_accuracy": 0.8551101684570312, - "num_tokens": 1737256.0, - "step": 186 - }, - { - "epoch": 0.14209726443768997, - "grad_norm": 1.828412413597107, - "learning_rate": 4.696969696969698e-06, - "loss": 0.5444269180297852, - "mean_token_accuracy": 0.8350818157196045, - "num_tokens": 1750196.0, - "step": 187 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 3.209203004837036, - "learning_rate": 4.722222222222222e-06, - "loss": 0.5087994933128357, - "mean_token_accuracy": 0.8349015712738037, - "num_tokens": 1754836.0, - "step": 188 - }, - { - "epoch": 0.14361702127659576, - "grad_norm": 1.7339166402816772, - "learning_rate": 4.747474747474748e-06, - "loss": 0.5151352286338806, - "mean_token_accuracy": 0.8321266174316406, - "num_tokens": 1766015.0, - "step": 189 - }, - { - "epoch": 0.14437689969604864, - "grad_norm": 2.699068069458008, - "learning_rate": 4.772727272727273e-06, - "loss": 0.4406203031539917, - "mean_token_accuracy": 0.8425000905990601, - "num_tokens": 1771684.0, - "step": 190 - }, - { - "epoch": 0.14513677811550152, - "grad_norm": 2.8117282390594482, - "learning_rate": 4.7979797979797985e-06, - "loss": 0.40428489446640015, - "mean_token_accuracy": 0.8654326796531677, - "num_tokens": 1776301.0, - "step": 191 - }, - { - "epoch": 0.1458966565349544, - "grad_norm": 2.9204647541046143, - "learning_rate": 4.823232323232324e-06, - "loss": 0.4191770553588867, - "mean_token_accuracy": 0.8574687242507935, - "num_tokens": 1781678.0, - "step": 192 - }, - { - "epoch": 0.1466565349544073, - "grad_norm": 2.1648988723754883, - "learning_rate": 4.848484848484849e-06, - "loss": 0.5839012861251831, - "mean_token_accuracy": 0.8053664565086365, - "num_tokens": 1792516.0, - "step": 193 - }, - { - "epoch": 0.1474164133738602, - "grad_norm": 2.3221631050109863, - "learning_rate": 4.873737373737374e-06, - "loss": 0.5037894248962402, - "mean_token_accuracy": 0.8427227139472961, - "num_tokens": 1800192.0, - "step": 194 - }, - { - "epoch": 0.14817629179331307, - "grad_norm": 2.4536430835723877, - "learning_rate": 4.898989898989899e-06, - "loss": 0.42326074838638306, - "mean_token_accuracy": 0.8510633111000061, - "num_tokens": 1806159.0, - "step": 195 - }, - { - "epoch": 0.14893617021276595, - "grad_norm": 2.4875805377960205, - "learning_rate": 4.924242424242425e-06, - "loss": 0.539531409740448, - "mean_token_accuracy": 0.8060250282287598, - "num_tokens": 1813392.0, - "step": 196 - }, - { - "epoch": 0.14969604863221886, - "grad_norm": 2.1664798259735107, - "learning_rate": 4.94949494949495e-06, - "loss": 0.42502015829086304, - "mean_token_accuracy": 0.8503251075744629, - "num_tokens": 1821424.0, - "step": 197 - }, - { - "epoch": 0.15045592705167174, - "grad_norm": 2.568808078765869, - "learning_rate": 4.974747474747475e-06, - "loss": 0.5025098323822021, - "mean_token_accuracy": 0.8182311058044434, - "num_tokens": 1827225.0, - "step": 198 - }, - { - "epoch": 0.15121580547112462, - "grad_norm": 1.9116802215576172, - "learning_rate": 5e-06, - "loss": 0.4907258450984955, - "mean_token_accuracy": 0.8310189843177795, - "num_tokens": 1836297.0, - "step": 199 - }, - { - "epoch": 0.1519756838905775, - "grad_norm": 3.150765895843506, - "learning_rate": 4.999999122701883e-06, - "loss": 0.390616774559021, - "mean_token_accuracy": 0.8626647591590881, - "num_tokens": 1839984.0, - "step": 200 - }, - { - "epoch": 0.15273556231003038, - "grad_norm": 3.2229044437408447, - "learning_rate": 4.999996490808146e-06, - "loss": 0.48009657859802246, - "mean_token_accuracy": 0.825214147567749, - "num_tokens": 1844610.0, - "step": 201 - }, - { - "epoch": 0.1534954407294833, - "grad_norm": 1.4473289251327515, - "learning_rate": 4.9999921043206356e-06, - "loss": 0.40135183930397034, - "mean_token_accuracy": 0.8537827730178833, - "num_tokens": 1859573.0, - "step": 202 - }, - { - "epoch": 0.15425531914893617, - "grad_norm": 4.072319507598877, - "learning_rate": 4.999985963242432e-06, - "loss": 0.6158689260482788, - "mean_token_accuracy": 0.8075432777404785, - "num_tokens": 1863147.0, - "step": 203 - }, - { - "epoch": 0.15501519756838905, - "grad_norm": 3.15741229057312, - "learning_rate": 4.999978067577844e-06, - "loss": 0.4603108763694763, - "mean_token_accuracy": 0.8418779373168945, - "num_tokens": 1867201.0, - "step": 204 - }, - { - "epoch": 0.15577507598784193, - "grad_norm": 2.1925418376922607, - "learning_rate": 4.999968417332415e-06, - "loss": 0.5552488565444946, - "mean_token_accuracy": 0.8216016292572021, - "num_tokens": 1874837.0, - "step": 205 - }, - { - "epoch": 0.15653495440729484, - "grad_norm": 2.2518117427825928, - "learning_rate": 4.999957012512916e-06, - "loss": 0.4912569522857666, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 1881842.0, - "step": 206 - }, - { - "epoch": 0.15729483282674772, - "grad_norm": 1.8223762512207031, - "learning_rate": 4.999943853127351e-06, - "loss": 0.47709137201309204, - "mean_token_accuracy": 0.8311659097671509, - "num_tokens": 1890805.0, - "step": 207 - }, - { - "epoch": 0.1580547112462006, - "grad_norm": 2.066499948501587, - "learning_rate": 4.999928939184958e-06, - "loss": 0.44794657826423645, - "mean_token_accuracy": 0.8513424396514893, - "num_tokens": 1898264.0, - "step": 208 - }, - { - "epoch": 0.15881458966565348, - "grad_norm": 3.53865909576416, - "learning_rate": 4.999912270696202e-06, - "loss": 0.5978270769119263, - "mean_token_accuracy": 0.8080137968063354, - "num_tokens": 1902435.0, - "step": 209 - }, - { - "epoch": 0.1595744680851064, - "grad_norm": 2.0760679244995117, - "learning_rate": 4.999893847672783e-06, - "loss": 0.5930601358413696, - "mean_token_accuracy": 0.8028650283813477, - "num_tokens": 1912252.0, - "step": 210 - }, - { - "epoch": 0.16033434650455927, - "grad_norm": 2.21551513671875, - "learning_rate": 4.99987367012763e-06, - "loss": 0.6336753964424133, - "mean_token_accuracy": 0.7902286648750305, - "num_tokens": 1922095.0, - "step": 211 - }, - { - "epoch": 0.16109422492401215, - "grad_norm": 1.7654480934143066, - "learning_rate": 4.999851738074904e-06, - "loss": 0.6373403668403625, - "mean_token_accuracy": 0.7802424430847168, - "num_tokens": 1938962.0, - "step": 212 - }, - { - "epoch": 0.16185410334346503, - "grad_norm": 2.852834701538086, - "learning_rate": 4.9998280515300006e-06, - "loss": 0.6418683528900146, - "mean_token_accuracy": 0.7895716428756714, - "num_tokens": 1944668.0, - "step": 213 - }, - { - "epoch": 0.16261398176291794, - "grad_norm": 3.4737212657928467, - "learning_rate": 4.999802610509541e-06, - "loss": 0.6323273181915283, - "mean_token_accuracy": 0.7982614636421204, - "num_tokens": 1949142.0, - "step": 214 - }, - { - "epoch": 0.16337386018237082, - "grad_norm": 3.0802664756774902, - "learning_rate": 4.999775415031381e-06, - "loss": 0.5929068326950073, - "mean_token_accuracy": 0.8112219572067261, - "num_tokens": 1954141.0, - "step": 215 - }, - { - "epoch": 0.1641337386018237, - "grad_norm": 2.9808855056762695, - "learning_rate": 4.999746465114609e-06, - "loss": 0.5556406378746033, - "mean_token_accuracy": 0.8117628693580627, - "num_tokens": 1959406.0, - "step": 216 - }, - { - "epoch": 0.16489361702127658, - "grad_norm": 1.7346166372299194, - "learning_rate": 4.999715760779541e-06, - "loss": 0.5122925043106079, - "mean_token_accuracy": 0.8040724992752075, - "num_tokens": 1971921.0, - "step": 217 - }, - { - "epoch": 0.1656534954407295, - "grad_norm": 1.4183907508850098, - "learning_rate": 4.999683302047729e-06, - "loss": 0.46471893787384033, - "mean_token_accuracy": 0.8381330966949463, - "num_tokens": 1988863.0, - "step": 218 - }, - { - "epoch": 0.16641337386018237, - "grad_norm": 1.6797802448272705, - "learning_rate": 4.999649088941951e-06, - "loss": 0.38348832726478577, - "mean_token_accuracy": 0.8344278931617737, - "num_tokens": 2000003.0, - "step": 219 - }, - { - "epoch": 0.16717325227963525, - "grad_norm": 3.036963939666748, - "learning_rate": 4.999613121486222e-06, - "loss": 0.6062780618667603, - "mean_token_accuracy": 0.8217900991439819, - "num_tokens": 2004813.0, - "step": 220 - }, - { - "epoch": 0.16793313069908813, - "grad_norm": 2.0343217849731445, - "learning_rate": 4.999575399705782e-06, - "loss": 0.5052450895309448, - "mean_token_accuracy": 0.8368623852729797, - "num_tokens": 2013565.0, - "step": 221 - }, - { - "epoch": 0.16869300911854104, - "grad_norm": 2.1162009239196777, - "learning_rate": 4.9995359236271094e-06, - "loss": 0.5169756412506104, - "mean_token_accuracy": 0.8339958190917969, - "num_tokens": 2025763.0, - "step": 222 - }, - { - "epoch": 0.16945288753799392, - "grad_norm": 2.055333375930786, - "learning_rate": 4.9994946932779076e-06, - "loss": 0.6327048540115356, - "mean_token_accuracy": 0.8078711032867432, - "num_tokens": 2037005.0, - "step": 223 - }, - { - "epoch": 0.1702127659574468, - "grad_norm": 3.334620475769043, - "learning_rate": 4.999451708687114e-06, - "loss": 0.5688358545303345, - "mean_token_accuracy": 0.8015589714050293, - "num_tokens": 2041473.0, - "step": 224 - }, - { - "epoch": 0.17097264437689969, - "grad_norm": 2.3734676837921143, - "learning_rate": 4.999406969884897e-06, - "loss": 0.5673821568489075, - "mean_token_accuracy": 0.8054057359695435, - "num_tokens": 2049397.0, - "step": 225 - }, - { - "epoch": 0.1717325227963526, - "grad_norm": 1.807358980178833, - "learning_rate": 4.999360476902656e-06, - "loss": 0.4376158118247986, - "mean_token_accuracy": 0.8456039428710938, - "num_tokens": 2058721.0, - "step": 226 - }, - { - "epoch": 0.17249240121580547, - "grad_norm": 3.231638193130493, - "learning_rate": 4.999312229773022e-06, - "loss": 0.5592809915542603, - "mean_token_accuracy": 0.8170154094696045, - "num_tokens": 2063455.0, - "step": 227 - }, - { - "epoch": 0.17325227963525835, - "grad_norm": 2.2717151641845703, - "learning_rate": 4.999262228529855e-06, - "loss": 0.6144396066665649, - "mean_token_accuracy": 0.7948470115661621, - "num_tokens": 2071686.0, - "step": 228 - }, - { - "epoch": 0.17401215805471124, - "grad_norm": 1.4171342849731445, - "learning_rate": 4.99921047320825e-06, - "loss": 0.43680912256240845, - "mean_token_accuracy": 0.84850013256073, - "num_tokens": 2086999.0, - "step": 229 - }, - { - "epoch": 0.17477203647416414, - "grad_norm": 3.162736654281616, - "learning_rate": 4.99915696384453e-06, - "loss": 0.6025407910346985, - "mean_token_accuracy": 0.8042335510253906, - "num_tokens": 2092001.0, - "step": 230 - }, - { - "epoch": 0.17553191489361702, - "grad_norm": 1.8672804832458496, - "learning_rate": 4.99910170047625e-06, - "loss": 0.5843087434768677, - "mean_token_accuracy": 0.8016980886459351, - "num_tokens": 2103372.0, - "step": 231 - }, - { - "epoch": 0.1762917933130699, - "grad_norm": 2.967587471008301, - "learning_rate": 4.999044683142196e-06, - "loss": 0.5123642086982727, - "mean_token_accuracy": 0.8216149806976318, - "num_tokens": 2108008.0, - "step": 232 - }, - { - "epoch": 0.1770516717325228, - "grad_norm": 1.9651981592178345, - "learning_rate": 4.998985911882383e-06, - "loss": 0.5868178606033325, - "mean_token_accuracy": 0.7904198169708252, - "num_tokens": 2119009.0, - "step": 233 - }, - { - "epoch": 0.1778115501519757, - "grad_norm": 2.7785449028015137, - "learning_rate": 4.998925386738063e-06, - "loss": 0.5075510144233704, - "mean_token_accuracy": 0.8280210494995117, - "num_tokens": 2124915.0, - "step": 234 - }, - { - "epoch": 0.17857142857142858, - "grad_norm": 2.957470417022705, - "learning_rate": 4.998863107751711e-06, - "loss": 0.5351958274841309, - "mean_token_accuracy": 0.846825122833252, - "num_tokens": 2129905.0, - "step": 235 - }, - { - "epoch": 0.17933130699088146, - "grad_norm": 3.207671880722046, - "learning_rate": 4.99879907496704e-06, - "loss": 0.6209091544151306, - "mean_token_accuracy": 0.789960503578186, - "num_tokens": 2135027.0, - "step": 236 - }, - { - "epoch": 0.18009118541033434, - "grad_norm": 2.018953800201416, - "learning_rate": 4.998733288428987e-06, - "loss": 0.601510763168335, - "mean_token_accuracy": 0.8136930465698242, - "num_tokens": 2147016.0, - "step": 237 - }, - { - "epoch": 0.18085106382978725, - "grad_norm": 2.437281847000122, - "learning_rate": 4.998665748183727e-06, - "loss": 0.5813639163970947, - "mean_token_accuracy": 0.8116716146469116, - "num_tokens": 2155386.0, - "step": 238 - }, - { - "epoch": 0.18161094224924013, - "grad_norm": 1.5708180665969849, - "learning_rate": 4.998596454278661e-06, - "loss": 0.5252395272254944, - "mean_token_accuracy": 0.8193864822387695, - "num_tokens": 2170295.0, - "step": 239 - }, - { - "epoch": 0.182370820668693, - "grad_norm": 1.9921495914459229, - "learning_rate": 4.998525406762422e-06, - "loss": 0.5335029363632202, - "mean_token_accuracy": 0.8120872974395752, - "num_tokens": 2180012.0, - "step": 240 - }, - { - "epoch": 0.1831306990881459, - "grad_norm": 2.6562681198120117, - "learning_rate": 4.998452605684874e-06, - "loss": 0.48021435737609863, - "mean_token_accuracy": 0.8388714790344238, - "num_tokens": 2185607.0, - "step": 241 - }, - { - "epoch": 0.1838905775075988, - "grad_norm": 2.2535853385925293, - "learning_rate": 4.998378051097111e-06, - "loss": 0.5747300386428833, - "mean_token_accuracy": 0.8004639148712158, - "num_tokens": 2194105.0, - "step": 242 - }, - { - "epoch": 0.18465045592705168, - "grad_norm": 1.6151788234710693, - "learning_rate": 4.998301743051459e-06, - "loss": 0.6190565824508667, - "mean_token_accuracy": 0.7816627621650696, - "num_tokens": 2210629.0, - "step": 243 - }, - { - "epoch": 0.18541033434650456, - "grad_norm": 2.1088173389434814, - "learning_rate": 4.9982236816014735e-06, - "loss": 0.4715560972690582, - "mean_token_accuracy": 0.8485721349716187, - "num_tokens": 2218958.0, - "step": 244 - }, - { - "epoch": 0.18617021276595744, - "grad_norm": 2.6168735027313232, - "learning_rate": 4.998143866801941e-06, - "loss": 0.6077103018760681, - "mean_token_accuracy": 0.8057924509048462, - "num_tokens": 2226368.0, - "step": 245 - }, - { - "epoch": 0.18693009118541035, - "grad_norm": 2.5988616943359375, - "learning_rate": 4.99806229870888e-06, - "loss": 0.5021637678146362, - "mean_token_accuracy": 0.8361666202545166, - "num_tokens": 2232485.0, - "step": 246 - }, - { - "epoch": 0.18768996960486323, - "grad_norm": 2.015887498855591, - "learning_rate": 4.9979789773795365e-06, - "loss": 0.4309737980365753, - "mean_token_accuracy": 0.8508044481277466, - "num_tokens": 2240819.0, - "step": 247 - }, - { - "epoch": 0.1884498480243161, - "grad_norm": 2.3115265369415283, - "learning_rate": 4.997893902872389e-06, - "loss": 0.5776500701904297, - "mean_token_accuracy": 0.8079549074172974, - "num_tokens": 2249460.0, - "step": 248 - }, - { - "epoch": 0.189209726443769, - "grad_norm": 1.7387021780014038, - "learning_rate": 4.997807075247147e-06, - "loss": 0.430944561958313, - "mean_token_accuracy": 0.8483544588088989, - "num_tokens": 2259124.0, - "step": 249 - }, - { - "epoch": 0.1899696048632219, - "grad_norm": 1.6378381252288818, - "learning_rate": 4.997718494564747e-06, - "loss": 0.4123363792896271, - "mean_token_accuracy": 0.8557409644126892, - "num_tokens": 2269899.0, - "step": 250 - }, - { - "epoch": 0.19072948328267478, - "grad_norm": 1.336282730102539, - "learning_rate": 4.997628160887361e-06, - "loss": 0.502329409122467, - "mean_token_accuracy": 0.8186938166618347, - "num_tokens": 2292821.0, - "step": 251 - }, - { - "epoch": 0.19148936170212766, - "grad_norm": 3.3335583209991455, - "learning_rate": 4.997536074278388e-06, - "loss": 0.584446907043457, - "mean_token_accuracy": 0.8062717318534851, - "num_tokens": 2297175.0, - "step": 252 - }, - { - "epoch": 0.19224924012158054, - "grad_norm": 2.246727228164673, - "learning_rate": 4.9974422348024565e-06, - "loss": 0.5683060884475708, - "mean_token_accuracy": 0.8193703293800354, - "num_tokens": 2305456.0, - "step": 253 - }, - { - "epoch": 0.19300911854103345, - "grad_norm": 2.3520865440368652, - "learning_rate": 4.997346642525429e-06, - "loss": 0.4724946618080139, - "mean_token_accuracy": 0.8426719307899475, - "num_tokens": 2312241.0, - "step": 254 - }, - { - "epoch": 0.19376899696048633, - "grad_norm": 2.7115702629089355, - "learning_rate": 4.9972492975143936e-06, - "loss": 0.5019032955169678, - "mean_token_accuracy": 0.8253573179244995, - "num_tokens": 2318094.0, - "step": 255 - }, - { - "epoch": 0.1945288753799392, - "grad_norm": 1.705528974533081, - "learning_rate": 4.997150199837671e-06, - "loss": 0.45588475465774536, - "mean_token_accuracy": 0.836666464805603, - "num_tokens": 2329025.0, - "step": 256 - }, - { - "epoch": 0.1952887537993921, - "grad_norm": 2.161400318145752, - "learning_rate": 4.997049349564814e-06, - "loss": 0.5170183777809143, - "mean_token_accuracy": 0.8287534117698669, - "num_tokens": 2337448.0, - "step": 257 - }, - { - "epoch": 0.196048632218845, - "grad_norm": 2.629669189453125, - "learning_rate": 4.996946746766602e-06, - "loss": 0.44650501012802124, - "mean_token_accuracy": 0.850114107131958, - "num_tokens": 2343207.0, - "step": 258 - }, - { - "epoch": 0.19680851063829788, - "grad_norm": 1.6735503673553467, - "learning_rate": 4.996842391515045e-06, - "loss": 0.5247820019721985, - "mean_token_accuracy": 0.8285071849822998, - "num_tokens": 2356801.0, - "step": 259 - }, - { - "epoch": 0.19756838905775076, - "grad_norm": 1.2753115892410278, - "learning_rate": 4.996736283883382e-06, - "loss": 0.41870927810668945, - "mean_token_accuracy": 0.8448047637939453, - "num_tokens": 2377306.0, - "step": 260 - }, - { - "epoch": 0.19832826747720364, - "grad_norm": 2.6947314739227295, - "learning_rate": 4.9966284239460875e-06, - "loss": 0.5059205889701843, - "mean_token_accuracy": 0.8430814743041992, - "num_tokens": 2383352.0, - "step": 261 - }, - { - "epoch": 0.19908814589665655, - "grad_norm": 2.0509963035583496, - "learning_rate": 4.996518811778858e-06, - "loss": 0.4565388560295105, - "mean_token_accuracy": 0.8453130722045898, - "num_tokens": 2391149.0, - "step": 262 - }, - { - "epoch": 0.19984802431610943, - "grad_norm": 2.1856348514556885, - "learning_rate": 4.996407447458626e-06, - "loss": 0.531380832195282, - "mean_token_accuracy": 0.8387004137039185, - "num_tokens": 2399875.0, - "step": 263 - }, - { - "epoch": 0.2006079027355623, - "grad_norm": 2.7348573207855225, - "learning_rate": 4.99629433106355e-06, - "loss": 0.5242817401885986, - "mean_token_accuracy": 0.8177423477172852, - "num_tokens": 2406586.0, - "step": 264 - }, - { - "epoch": 0.2013677811550152, - "grad_norm": 1.76587975025177, - "learning_rate": 4.99617946267302e-06, - "loss": 0.49298471212387085, - "mean_token_accuracy": 0.8271149396896362, - "num_tokens": 2418683.0, - "step": 265 - }, - { - "epoch": 0.20212765957446807, - "grad_norm": 2.8129730224609375, - "learning_rate": 4.996062842367655e-06, - "loss": 0.46420302987098694, - "mean_token_accuracy": 0.8453244566917419, - "num_tokens": 2422929.0, - "step": 266 - }, - { - "epoch": 0.20288753799392098, - "grad_norm": 2.575744152069092, - "learning_rate": 4.9959444702293025e-06, - "loss": 0.43208545446395874, - "mean_token_accuracy": 0.8494843244552612, - "num_tokens": 2429567.0, - "step": 267 - }, - { - "epoch": 0.20364741641337386, - "grad_norm": 2.7586750984191895, - "learning_rate": 4.995824346341041e-06, - "loss": 0.4390473961830139, - "mean_token_accuracy": 0.8348895311355591, - "num_tokens": 2434700.0, - "step": 268 - }, - { - "epoch": 0.20440729483282674, - "grad_norm": 1.972145438194275, - "learning_rate": 4.99570247078718e-06, - "loss": 0.6219544410705566, - "mean_token_accuracy": 0.7939999103546143, - "num_tokens": 2447007.0, - "step": 269 - }, - { - "epoch": 0.20516717325227962, - "grad_norm": 2.2963485717773438, - "learning_rate": 4.995578843653255e-06, - "loss": 0.5008970499038696, - "mean_token_accuracy": 0.8255308866500854, - "num_tokens": 2453936.0, - "step": 270 - }, - { - "epoch": 0.20592705167173253, - "grad_norm": 1.8897721767425537, - "learning_rate": 4.995453465026033e-06, - "loss": 0.5436089038848877, - "mean_token_accuracy": 0.819086492061615, - "num_tokens": 2464494.0, - "step": 271 - }, - { - "epoch": 0.2066869300911854, - "grad_norm": 2.319728374481201, - "learning_rate": 4.995326334993508e-06, - "loss": 0.5136368870735168, - "mean_token_accuracy": 0.820817232131958, - "num_tokens": 2470938.0, - "step": 272 - }, - { - "epoch": 0.2074468085106383, - "grad_norm": 2.230414390563965, - "learning_rate": 4.9951974536449055e-06, - "loss": 0.5272846817970276, - "mean_token_accuracy": 0.8203279972076416, - "num_tokens": 2478629.0, - "step": 273 - }, - { - "epoch": 0.20820668693009117, - "grad_norm": 3.401937484741211, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.4389592111110687, - "mean_token_accuracy": 0.8647899031639099, - "num_tokens": 2482193.0, - "step": 274 - }, - { - "epoch": 0.20896656534954408, - "grad_norm": 2.1278507709503174, - "learning_rate": 4.994934437362513e-06, - "loss": 0.598863422870636, - "mean_token_accuracy": 0.7945119738578796, - "num_tokens": 2492465.0, - "step": 275 - }, - { - "epoch": 0.20972644376899696, - "grad_norm": 1.9259960651397705, - "learning_rate": 4.994800302613318e-06, - "loss": 0.49520939588546753, - "mean_token_accuracy": 0.8371536135673523, - "num_tokens": 2500825.0, - "step": 276 - }, - { - "epoch": 0.21048632218844984, - "grad_norm": 2.346418857574463, - "learning_rate": 4.994664416917236e-06, - "loss": 0.5412614345550537, - "mean_token_accuracy": 0.810661792755127, - "num_tokens": 2509513.0, - "step": 277 - }, - { - "epoch": 0.21124620060790272, - "grad_norm": 1.3092039823532104, - "learning_rate": 4.994526780369636e-06, - "loss": 0.46305379271507263, - "mean_token_accuracy": 0.8358527421951294, - "num_tokens": 2531405.0, - "step": 278 - }, - { - "epoch": 0.21200607902735563, - "grad_norm": 2.924611806869507, - "learning_rate": 4.9943873930671175e-06, - "loss": 0.6134544610977173, - "mean_token_accuracy": 0.7947378754615784, - "num_tokens": 2536744.0, - "step": 279 - }, - { - "epoch": 0.2127659574468085, - "grad_norm": 2.8290598392486572, - "learning_rate": 4.994246255107506e-06, - "loss": 0.465520441532135, - "mean_token_accuracy": 0.8440108299255371, - "num_tokens": 2541184.0, - "step": 280 - }, - { - "epoch": 0.2135258358662614, - "grad_norm": 3.8081259727478027, - "learning_rate": 4.994103366589859e-06, - "loss": 0.43394139409065247, - "mean_token_accuracy": 0.8579148054122925, - "num_tokens": 2545395.0, - "step": 281 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 1.7994529008865356, - "learning_rate": 4.993958727614462e-06, - "loss": 0.5076484680175781, - "mean_token_accuracy": 0.8270803093910217, - "num_tokens": 2556541.0, - "step": 282 - }, - { - "epoch": 0.21504559270516718, - "grad_norm": 2.5582659244537354, - "learning_rate": 4.993812338282826e-06, - "loss": 0.4453684389591217, - "mean_token_accuracy": 0.8488293886184692, - "num_tokens": 2562949.0, - "step": 283 - }, - { - "epoch": 0.21580547112462006, - "grad_norm": 1.6448938846588135, - "learning_rate": 4.993664198697694e-06, - "loss": 0.461971640586853, - "mean_token_accuracy": 0.824763298034668, - "num_tokens": 2576407.0, - "step": 284 - }, - { - "epoch": 0.21656534954407294, - "grad_norm": 2.1264469623565674, - "learning_rate": 4.993514308963037e-06, - "loss": 0.6241602897644043, - "mean_token_accuracy": 0.7916014790534973, - "num_tokens": 2585695.0, - "step": 285 - }, - { - "epoch": 0.21732522796352582, - "grad_norm": 3.629991292953491, - "learning_rate": 4.993362669184051e-06, - "loss": 0.610355019569397, - "mean_token_accuracy": 0.7847568988800049, - "num_tokens": 2589778.0, - "step": 286 - }, - { - "epoch": 0.21808510638297873, - "grad_norm": 1.9070756435394287, - "learning_rate": 4.993209279467164e-06, - "loss": 0.5513623952865601, - "mean_token_accuracy": 0.7911607027053833, - "num_tokens": 2600920.0, - "step": 287 - }, - { - "epoch": 0.2188449848024316, - "grad_norm": 1.761062741279602, - "learning_rate": 4.993054139920031e-06, - "loss": 0.4579957127571106, - "mean_token_accuracy": 0.8189530372619629, - "num_tokens": 2611856.0, - "step": 288 - }, - { - "epoch": 0.2196048632218845, - "grad_norm": 1.7264713048934937, - "learning_rate": 4.992897250651535e-06, - "loss": 0.5871305465698242, - "mean_token_accuracy": 0.7918527126312256, - "num_tokens": 2624730.0, - "step": 289 - }, - { - "epoch": 0.22036474164133737, - "grad_norm": 1.7455977201461792, - "learning_rate": 4.992738611771787e-06, - "loss": 0.5475119948387146, - "mean_token_accuracy": 0.8226917386054993, - "num_tokens": 2635705.0, - "step": 290 - }, - { - "epoch": 0.22112462006079028, - "grad_norm": 2.095095157623291, - "learning_rate": 4.992578223392124e-06, - "loss": 0.5952225923538208, - "mean_token_accuracy": 0.8078469038009644, - "num_tokens": 2643954.0, - "step": 291 - }, - { - "epoch": 0.22188449848024316, - "grad_norm": 2.994664192199707, - "learning_rate": 4.992416085625115e-06, - "loss": 0.5432442426681519, - "mean_token_accuracy": 0.8329008221626282, - "num_tokens": 2648800.0, - "step": 292 - }, - { - "epoch": 0.22264437689969604, - "grad_norm": 2.796790361404419, - "learning_rate": 4.992252198584554e-06, - "loss": 0.5168961882591248, - "mean_token_accuracy": 0.8393474817276001, - "num_tokens": 2653546.0, - "step": 293 - }, - { - "epoch": 0.22340425531914893, - "grad_norm": 1.8610522747039795, - "learning_rate": 4.992086562385462e-06, - "loss": 0.5728024244308472, - "mean_token_accuracy": 0.797406792640686, - "num_tokens": 2667483.0, - "step": 294 - }, - { - "epoch": 0.22416413373860183, - "grad_norm": 1.695472002029419, - "learning_rate": 4.9919191771440905e-06, - "loss": 0.5460028648376465, - "mean_token_accuracy": 0.8123016357421875, - "num_tokens": 2683574.0, - "step": 295 - }, - { - "epoch": 0.22492401215805471, - "grad_norm": 2.8627376556396484, - "learning_rate": 4.9917500429779165e-06, - "loss": 0.5566985011100769, - "mean_token_accuracy": 0.815531313419342, - "num_tokens": 2688985.0, - "step": 296 - }, - { - "epoch": 0.2256838905775076, - "grad_norm": 2.73323655128479, - "learning_rate": 4.991579160005644e-06, - "loss": 0.48197102546691895, - "mean_token_accuracy": 0.8471829295158386, - "num_tokens": 2694799.0, - "step": 297 - }, - { - "epoch": 0.22644376899696048, - "grad_norm": 1.8436161279678345, - "learning_rate": 4.991406528347206e-06, - "loss": 0.4528339207172394, - "mean_token_accuracy": 0.8603188395500183, - "num_tokens": 2707321.0, - "step": 298 - }, - { - "epoch": 0.22720364741641338, - "grad_norm": 2.6231515407562256, - "learning_rate": 4.9912321481237616e-06, - "loss": 0.5916541814804077, - "mean_token_accuracy": 0.8050242066383362, - "num_tokens": 2714233.0, - "step": 299 - }, - { - "epoch": 0.22796352583586627, - "grad_norm": 3.08776593208313, - "learning_rate": 4.991056019457697e-06, - "loss": 0.4860580563545227, - "mean_token_accuracy": 0.8464088439941406, - "num_tokens": 2718443.0, - "step": 300 - }, - { - "epoch": 0.22872340425531915, - "grad_norm": 2.2537803649902344, - "learning_rate": 4.990878142472628e-06, - "loss": 0.5158311128616333, - "mean_token_accuracy": 0.824694812297821, - "num_tokens": 2726158.0, - "step": 301 - }, - { - "epoch": 0.22948328267477203, - "grad_norm": 2.1122705936431885, - "learning_rate": 4.990698517293394e-06, - "loss": 0.495265394449234, - "mean_token_accuracy": 0.8343238830566406, - "num_tokens": 2735022.0, - "step": 302 - }, - { - "epoch": 0.23024316109422494, - "grad_norm": 3.5503528118133545, - "learning_rate": 4.9905171440460645e-06, - "loss": 0.46063232421875, - "mean_token_accuracy": 0.8420047760009766, - "num_tokens": 2738550.0, - "step": 303 - }, - { - "epoch": 0.23100303951367782, - "grad_norm": 3.9858486652374268, - "learning_rate": 4.990334022857932e-06, - "loss": 0.5832710266113281, - "mean_token_accuracy": 0.8144199848175049, - "num_tokens": 2741720.0, - "step": 304 - }, - { - "epoch": 0.2317629179331307, - "grad_norm": 2.407231330871582, - "learning_rate": 4.990149153857519e-06, - "loss": 0.4692630171775818, - "mean_token_accuracy": 0.8429223299026489, - "num_tokens": 2748693.0, - "step": 305 - }, - { - "epoch": 0.23252279635258358, - "grad_norm": 1.6996397972106934, - "learning_rate": 4.989962537174573e-06, - "loss": 0.49143946170806885, - "mean_token_accuracy": 0.8340128064155579, - "num_tokens": 2761254.0, - "step": 306 - }, - { - "epoch": 0.23328267477203649, - "grad_norm": 3.746432065963745, - "learning_rate": 4.989774172940071e-06, - "loss": 0.6282026767730713, - "mean_token_accuracy": 0.775698184967041, - "num_tokens": 2765115.0, - "step": 307 - }, - { - "epoch": 0.23404255319148937, - "grad_norm": 2.212872266769409, - "learning_rate": 4.989584061286211e-06, - "loss": 0.5193763971328735, - "mean_token_accuracy": 0.8168246746063232, - "num_tokens": 2772345.0, - "step": 308 - }, - { - "epoch": 0.23480243161094225, - "grad_norm": 1.752297282218933, - "learning_rate": 4.989392202346423e-06, - "loss": 0.4437984824180603, - "mean_token_accuracy": 0.8451256155967712, - "num_tokens": 2783072.0, - "step": 309 - }, - { - "epoch": 0.23556231003039513, - "grad_norm": 2.386019706726074, - "learning_rate": 4.989198596255361e-06, - "loss": 0.4090752899646759, - "mean_token_accuracy": 0.8480085134506226, - "num_tokens": 2788757.0, - "step": 310 - }, - { - "epoch": 0.23632218844984804, - "grad_norm": 3.9981489181518555, - "learning_rate": 4.989003243148904e-06, - "loss": 0.5149132013320923, - "mean_token_accuracy": 0.8179056644439697, - "num_tokens": 2792096.0, - "step": 311 - }, - { - "epoch": 0.23708206686930092, - "grad_norm": 1.8723100423812866, - "learning_rate": 4.988806143164159e-06, - "loss": 0.4531487822532654, - "mean_token_accuracy": 0.8400167226791382, - "num_tokens": 2802210.0, - "step": 312 - }, - { - "epoch": 0.2378419452887538, - "grad_norm": 2.3415136337280273, - "learning_rate": 4.988607296439459e-06, - "loss": 0.5974439978599548, - "mean_token_accuracy": 0.8035976886749268, - "num_tokens": 2810088.0, - "step": 313 - }, - { - "epoch": 0.23860182370820668, - "grad_norm": 1.5317577123641968, - "learning_rate": 4.98840670311436e-06, - "loss": 0.49247145652770996, - "mean_token_accuracy": 0.8292540311813354, - "num_tokens": 2824005.0, - "step": 314 - }, - { - "epoch": 0.2393617021276596, - "grad_norm": 2.170772075653076, - "learning_rate": 4.988204363329648e-06, - "loss": 0.6359974145889282, - "mean_token_accuracy": 0.7785564661026001, - "num_tokens": 2834680.0, - "step": 315 - }, - { - "epoch": 0.24012158054711247, - "grad_norm": 3.2655932903289795, - "learning_rate": 4.988000277227334e-06, - "loss": 0.5080196857452393, - "mean_token_accuracy": 0.8295877575874329, - "num_tokens": 2838735.0, - "step": 316 - }, - { - "epoch": 0.24088145896656535, - "grad_norm": 3.406589984893799, - "learning_rate": 4.987794444950651e-06, - "loss": 0.3939085006713867, - "mean_token_accuracy": 0.8700719475746155, - "num_tokens": 2842127.0, - "step": 317 - }, - { - "epoch": 0.24164133738601823, - "grad_norm": 1.8211106061935425, - "learning_rate": 4.987586866644061e-06, - "loss": 0.5270540118217468, - "mean_token_accuracy": 0.826683521270752, - "num_tokens": 2853656.0, - "step": 318 - }, - { - "epoch": 0.24240121580547114, - "grad_norm": 1.8429969549179077, - "learning_rate": 4.9873775424532515e-06, - "loss": 0.4705049991607666, - "mean_token_accuracy": 0.8355701565742493, - "num_tokens": 2863513.0, - "step": 319 - }, - { - "epoch": 0.24316109422492402, - "grad_norm": 2.2425320148468018, - "learning_rate": 4.9871664725251314e-06, - "loss": 0.485736608505249, - "mean_token_accuracy": 0.835182785987854, - "num_tokens": 2871556.0, - "step": 320 - }, - { - "epoch": 0.2439209726443769, - "grad_norm": 1.6202056407928467, - "learning_rate": 4.986953657007841e-06, - "loss": 0.4437887370586395, - "mean_token_accuracy": 0.8282591700553894, - "num_tokens": 2884335.0, - "step": 321 - }, - { - "epoch": 0.24468085106382978, - "grad_norm": 1.1027268171310425, - "learning_rate": 4.98673909605074e-06, - "loss": 0.3770800828933716, - "mean_token_accuracy": 0.8325437307357788, - "num_tokens": 2904286.0, - "step": 322 - }, - { - "epoch": 0.2454407294832827, - "grad_norm": 2.3239076137542725, - "learning_rate": 4.986522789804417e-06, - "loss": 0.5387254953384399, - "mean_token_accuracy": 0.806242823600769, - "num_tokens": 2910975.0, - "step": 323 - }, - { - "epoch": 0.24620060790273557, - "grad_norm": 2.243482828140259, - "learning_rate": 4.986304738420684e-06, - "loss": 0.4396553039550781, - "mean_token_accuracy": 0.8561904430389404, - "num_tokens": 2917087.0, - "step": 324 - }, - { - "epoch": 0.24696048632218845, - "grad_norm": 2.537264347076416, - "learning_rate": 4.986084942052577e-06, - "loss": 0.395110160112381, - "mean_token_accuracy": 0.8636915683746338, - "num_tokens": 2921887.0, - "step": 325 - }, - { - "epoch": 0.24772036474164133, - "grad_norm": 2.319399118423462, - "learning_rate": 4.9858634008543574e-06, - "loss": 0.581517219543457, - "mean_token_accuracy": 0.8157487511634827, - "num_tokens": 2928996.0, - "step": 326 - }, - { - "epoch": 0.24848024316109424, - "grad_norm": 1.9787474870681763, - "learning_rate": 4.985640114981513e-06, - "loss": 0.5084106922149658, - "mean_token_accuracy": 0.835221529006958, - "num_tokens": 2940302.0, - "step": 327 - }, - { - "epoch": 0.24924012158054712, - "grad_norm": 2.4783265590667725, - "learning_rate": 4.985415084590752e-06, - "loss": 0.6062222719192505, - "mean_token_accuracy": 0.7885516285896301, - "num_tokens": 2946386.0, - "step": 328 - }, - { - "epoch": 0.25, - "grad_norm": 2.4081411361694336, - "learning_rate": 4.985188309840012e-06, - "loss": 0.5079880356788635, - "mean_token_accuracy": 0.8313904404640198, - "num_tokens": 2952323.0, - "step": 329 - }, - { - "epoch": 0.2507598784194529, - "grad_norm": 2.64993953704834, - "learning_rate": 4.984959790888451e-06, - "loss": 0.5461447834968567, - "mean_token_accuracy": 0.8125468492507935, - "num_tokens": 2958119.0, - "step": 330 - }, - { - "epoch": 0.25151975683890576, - "grad_norm": 2.549734115600586, - "learning_rate": 4.984729527896451e-06, - "loss": 0.5998573303222656, - "mean_token_accuracy": 0.8076666593551636, - "num_tokens": 2964947.0, - "step": 331 - }, - { - "epoch": 0.25227963525835867, - "grad_norm": 3.2185161113739014, - "learning_rate": 4.984497521025622e-06, - "loss": 0.4232945442199707, - "mean_token_accuracy": 0.8543803095817566, - "num_tokens": 2968598.0, - "step": 332 - }, - { - "epoch": 0.2530395136778115, - "grad_norm": 2.588994264602661, - "learning_rate": 4.984263770438793e-06, - "loss": 0.460967481136322, - "mean_token_accuracy": 0.8416207432746887, - "num_tokens": 2974510.0, - "step": 333 - }, - { - "epoch": 0.25379939209726443, - "grad_norm": 2.1373162269592285, - "learning_rate": 4.984028276300021e-06, - "loss": 0.49382102489471436, - "mean_token_accuracy": 0.8388048410415649, - "num_tokens": 2981632.0, - "step": 334 - }, - { - "epoch": 0.25455927051671734, - "grad_norm": 2.2524826526641846, - "learning_rate": 4.983791038774585e-06, - "loss": 0.4947671890258789, - "mean_token_accuracy": 0.8066365122795105, - "num_tokens": 2988736.0, - "step": 335 - }, - { - "epoch": 0.2553191489361702, - "grad_norm": 1.7244199514389038, - "learning_rate": 4.983552058028985e-06, - "loss": 0.48096776008605957, - "mean_token_accuracy": 0.830735445022583, - "num_tokens": 3003576.0, - "step": 336 - }, - { - "epoch": 0.2560790273556231, - "grad_norm": 3.0628933906555176, - "learning_rate": 4.9833113342309495e-06, - "loss": 0.6027032136917114, - "mean_token_accuracy": 0.8008694648742676, - "num_tokens": 3009549.0, - "step": 337 - }, - { - "epoch": 0.256838905775076, - "grad_norm": 2.438674211502075, - "learning_rate": 4.983068867549427e-06, - "loss": 0.517090916633606, - "mean_token_accuracy": 0.827893853187561, - "num_tokens": 3015236.0, - "step": 338 - }, - { - "epoch": 0.25759878419452886, - "grad_norm": 2.131535053253174, - "learning_rate": 4.982824658154589e-06, - "loss": 0.6656812429428101, - "mean_token_accuracy": 0.7772425413131714, - "num_tokens": 3028142.0, - "step": 339 - }, - { - "epoch": 0.25835866261398177, - "grad_norm": 2.3206584453582764, - "learning_rate": 4.9825787062178315e-06, - "loss": 0.5757625699043274, - "mean_token_accuracy": 0.8073873519897461, - "num_tokens": 3040996.0, - "step": 340 - }, - { - "epoch": 0.2591185410334346, - "grad_norm": 1.3905521631240845, - "learning_rate": 4.982331011911774e-06, - "loss": 0.4193805456161499, - "mean_token_accuracy": 0.8399466872215271, - "num_tokens": 3061931.0, - "step": 341 - }, - { - "epoch": 0.25987841945288753, - "grad_norm": 2.184173345565796, - "learning_rate": 4.982081575410256e-06, - "loss": 0.4751223921775818, - "mean_token_accuracy": 0.8409271240234375, - "num_tokens": 3069081.0, - "step": 342 - }, - { - "epoch": 0.26063829787234044, - "grad_norm": 3.538764238357544, - "learning_rate": 4.9818303968883445e-06, - "loss": 0.8119601011276245, - "mean_token_accuracy": 0.7442739009857178, - "num_tokens": 3073628.0, - "step": 343 - }, - { - "epoch": 0.2613981762917933, - "grad_norm": 1.8063762187957764, - "learning_rate": 4.981577476522323e-06, - "loss": 0.5615730881690979, - "mean_token_accuracy": 0.8207751512527466, - "num_tokens": 3086596.0, - "step": 344 - }, - { - "epoch": 0.2621580547112462, - "grad_norm": 2.4346961975097656, - "learning_rate": 4.981322814489703e-06, - "loss": 0.5266709327697754, - "mean_token_accuracy": 0.8211277723312378, - "num_tokens": 3092631.0, - "step": 345 - }, - { - "epoch": 0.2629179331306991, - "grad_norm": 1.91289484500885, - "learning_rate": 4.981066410969215e-06, - "loss": 0.5047177672386169, - "mean_token_accuracy": 0.8356877565383911, - "num_tokens": 3101102.0, - "step": 346 - }, - { - "epoch": 0.26367781155015196, - "grad_norm": 2.1495707035064697, - "learning_rate": 4.980808266140813e-06, - "loss": 0.47876280546188354, - "mean_token_accuracy": 0.8364313244819641, - "num_tokens": 3107998.0, - "step": 347 - }, - { - "epoch": 0.26443768996960487, - "grad_norm": 2.5961992740631104, - "learning_rate": 4.9805483801856744e-06, - "loss": 0.5512958765029907, - "mean_token_accuracy": 0.8181467652320862, - "num_tokens": 3113848.0, - "step": 348 - }, - { - "epoch": 0.2651975683890577, - "grad_norm": 3.2828900814056396, - "learning_rate": 4.980286753286196e-06, - "loss": 0.4217945635318756, - "mean_token_accuracy": 0.8617103099822998, - "num_tokens": 3117652.0, - "step": 349 - }, - { - "epoch": 0.26595744680851063, - "grad_norm": 1.425554871559143, - "learning_rate": 4.980023385625996e-06, - "loss": 0.4042487144470215, - "mean_token_accuracy": 0.8492785692214966, - "num_tokens": 3132336.0, - "step": 350 - }, - { - "epoch": 0.26671732522796354, - "grad_norm": 2.933504104614258, - "learning_rate": 4.979758277389919e-06, - "loss": 0.5406704545021057, - "mean_token_accuracy": 0.8035423755645752, - "num_tokens": 3137544.0, - "step": 351 - }, - { - "epoch": 0.2674772036474164, - "grad_norm": 1.9958966970443726, - "learning_rate": 4.9794914287640264e-06, - "loss": 0.5857555270195007, - "mean_token_accuracy": 0.7965140342712402, - "num_tokens": 3149705.0, - "step": 352 - }, - { - "epoch": 0.2682370820668693, - "grad_norm": 2.467694044113159, - "learning_rate": 4.979222839935602e-06, - "loss": 0.6404043436050415, - "mean_token_accuracy": 0.7823755741119385, - "num_tokens": 3158353.0, - "step": 353 - }, - { - "epoch": 0.2689969604863222, - "grad_norm": 2.0102720260620117, - "learning_rate": 4.9789525110931545e-06, - "loss": 0.5681496858596802, - "mean_token_accuracy": 0.8108169436454773, - "num_tokens": 3167121.0, - "step": 354 - }, - { - "epoch": 0.26975683890577506, - "grad_norm": 2.6017866134643555, - "learning_rate": 4.978680442426409e-06, - "loss": 0.6309828162193298, - "mean_token_accuracy": 0.7742617130279541, - "num_tokens": 3175012.0, - "step": 355 - }, - { - "epoch": 0.270516717325228, - "grad_norm": 1.8799268007278442, - "learning_rate": 4.978406634126315e-06, - "loss": 0.524029016494751, - "mean_token_accuracy": 0.8317689895629883, - "num_tokens": 3185331.0, - "step": 356 - }, - { - "epoch": 0.2712765957446808, - "grad_norm": 1.508332371711731, - "learning_rate": 4.978131086385041e-06, - "loss": 0.46656402945518494, - "mean_token_accuracy": 0.8339117765426636, - "num_tokens": 3198813.0, - "step": 357 - }, - { - "epoch": 0.27203647416413373, - "grad_norm": 3.595707654953003, - "learning_rate": 4.977853799395976e-06, - "loss": 0.5101234912872314, - "mean_token_accuracy": 0.8251723051071167, - "num_tokens": 3206557.0, - "step": 358 - }, - { - "epoch": 0.27279635258358664, - "grad_norm": 3.5317916870117188, - "learning_rate": 4.977574773353732e-06, - "loss": 0.5684665441513062, - "mean_token_accuracy": 0.8124493360519409, - "num_tokens": 3210912.0, - "step": 359 - }, - { - "epoch": 0.2735562310030395, - "grad_norm": 2.8606204986572266, - "learning_rate": 4.97729400845414e-06, - "loss": 0.4746384620666504, - "mean_token_accuracy": 0.8195606470108032, - "num_tokens": 3215365.0, - "step": 360 - }, - { - "epoch": 0.2743161094224924, - "grad_norm": 1.8214033842086792, - "learning_rate": 4.977011504894253e-06, - "loss": 0.4842769503593445, - "mean_token_accuracy": 0.82928866147995, - "num_tokens": 3224037.0, - "step": 361 - }, - { - "epoch": 0.2750759878419453, - "grad_norm": 1.628746509552002, - "learning_rate": 4.97672726287234e-06, - "loss": 0.4397493302822113, - "mean_token_accuracy": 0.8606528043746948, - "num_tokens": 3235589.0, - "step": 362 - }, - { - "epoch": 0.27583586626139817, - "grad_norm": 3.557973861694336, - "learning_rate": 4.976441282587894e-06, - "loss": 0.5732032060623169, - "mean_token_accuracy": 0.8041545748710632, - "num_tokens": 3239958.0, - "step": 363 - }, - { - "epoch": 0.2765957446808511, - "grad_norm": 1.3467901945114136, - "learning_rate": 4.9761535642416284e-06, - "loss": 0.4525323510169983, - "mean_token_accuracy": 0.8281061053276062, - "num_tokens": 3257703.0, - "step": 364 - }, - { - "epoch": 0.2773556231003039, - "grad_norm": 2.2649986743927, - "learning_rate": 4.9758641080354745e-06, - "loss": 0.5074734687805176, - "mean_token_accuracy": 0.8447474241256714, - "num_tokens": 3264334.0, - "step": 365 - }, - { - "epoch": 0.27811550151975684, - "grad_norm": 2.8667566776275635, - "learning_rate": 4.975572914172581e-06, - "loss": 0.5759559869766235, - "mean_token_accuracy": 0.7976793050765991, - "num_tokens": 3269314.0, - "step": 366 - }, - { - "epoch": 0.27887537993920974, - "grad_norm": 2.2514986991882324, - "learning_rate": 4.975279982857324e-06, - "loss": 0.5786465406417847, - "mean_token_accuracy": 0.8058781623840332, - "num_tokens": 3277324.0, - "step": 367 - }, - { - "epoch": 0.2796352583586626, - "grad_norm": 1.3826723098754883, - "learning_rate": 4.97498531429529e-06, - "loss": 0.40801727771759033, - "mean_token_accuracy": 0.8601310849189758, - "num_tokens": 3290530.0, - "step": 368 - }, - { - "epoch": 0.2803951367781155, - "grad_norm": 2.084092617034912, - "learning_rate": 4.97468890869329e-06, - "loss": 0.47076648473739624, - "mean_token_accuracy": 0.8310186862945557, - "num_tokens": 3298325.0, - "step": 369 - }, - { - "epoch": 0.2811550151975684, - "grad_norm": 1.3467998504638672, - "learning_rate": 4.974390766259353e-06, - "loss": 0.44668465852737427, - "mean_token_accuracy": 0.8275353908538818, - "num_tokens": 3314302.0, - "step": 370 - }, - { - "epoch": 0.28191489361702127, - "grad_norm": 2.5921075344085693, - "learning_rate": 4.974090887202726e-06, - "loss": 0.5343953967094421, - "mean_token_accuracy": 0.8110706806182861, - "num_tokens": 3320963.0, - "step": 371 - }, - { - "epoch": 0.2826747720364742, - "grad_norm": 2.042781352996826, - "learning_rate": 4.973789271733877e-06, - "loss": 0.6293343305587769, - "mean_token_accuracy": 0.7800243496894836, - "num_tokens": 3332742.0, - "step": 372 - }, - { - "epoch": 0.28343465045592703, - "grad_norm": 4.822193145751953, - "learning_rate": 4.973485920064491e-06, - "loss": 0.6256728768348694, - "mean_token_accuracy": 0.7962433099746704, - "num_tokens": 3335872.0, - "step": 373 - }, - { - "epoch": 0.28419452887537994, - "grad_norm": 1.260988473892212, - "learning_rate": 4.973180832407471e-06, - "loss": 0.38731223344802856, - "mean_token_accuracy": 0.8385066986083984, - "num_tokens": 3351884.0, - "step": 374 - }, - { - "epoch": 0.28495440729483285, - "grad_norm": 2.669966697692871, - "learning_rate": 4.97287400897694e-06, - "loss": 0.5594710111618042, - "mean_token_accuracy": 0.8097212314605713, - "num_tokens": 3358197.0, - "step": 375 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 3.0344486236572266, - "learning_rate": 4.972565449988238e-06, - "loss": 0.34449583292007446, - "mean_token_accuracy": 0.8813316822052002, - "num_tokens": 3362133.0, - "step": 376 - }, - { - "epoch": 0.2864741641337386, - "grad_norm": 2.562251091003418, - "learning_rate": 4.972255155657925e-06, - "loss": 0.5331522822380066, - "mean_token_accuracy": 0.8212941288948059, - "num_tokens": 3370346.0, - "step": 377 - }, - { - "epoch": 0.2872340425531915, - "grad_norm": 2.7083740234375, - "learning_rate": 4.9719431262037755e-06, - "loss": 0.5403046011924744, - "mean_token_accuracy": 0.8108335733413696, - "num_tokens": 3375588.0, - "step": 378 - }, - { - "epoch": 0.28799392097264437, - "grad_norm": 1.396430492401123, - "learning_rate": 4.971629361844785e-06, - "loss": 0.4041529893875122, - "mean_token_accuracy": 0.8588063716888428, - "num_tokens": 3390749.0, - "step": 379 - }, - { - "epoch": 0.2887537993920973, - "grad_norm": 1.9872784614562988, - "learning_rate": 4.971313862801166e-06, - "loss": 0.4336993098258972, - "mean_token_accuracy": 0.8511303663253784, - "num_tokens": 3399064.0, - "step": 380 - }, - { - "epoch": 0.28951367781155013, - "grad_norm": 1.9652575254440308, - "learning_rate": 4.9709966292943455e-06, - "loss": 0.4578358232975006, - "mean_token_accuracy": 0.8229440450668335, - "num_tokens": 3407229.0, - "step": 381 - }, - { - "epoch": 0.29027355623100304, - "grad_norm": 1.6626898050308228, - "learning_rate": 4.970677661546972e-06, - "loss": 0.5427594184875488, - "mean_token_accuracy": 0.815427303314209, - "num_tokens": 3422321.0, - "step": 382 - }, - { - "epoch": 0.29103343465045595, - "grad_norm": 3.5265562534332275, - "learning_rate": 4.970356959782909e-06, - "loss": 0.6661460995674133, - "mean_token_accuracy": 0.7856965065002441, - "num_tokens": 3427442.0, - "step": 383 - }, - { - "epoch": 0.2917933130699088, - "grad_norm": 1.667205572128296, - "learning_rate": 4.970034524227239e-06, - "loss": 0.36256325244903564, - "mean_token_accuracy": 0.8711205720901489, - "num_tokens": 3436662.0, - "step": 384 - }, - { - "epoch": 0.2925531914893617, - "grad_norm": 1.3389486074447632, - "learning_rate": 4.969710355106256e-06, - "loss": 0.4282698631286621, - "mean_token_accuracy": 0.838951587677002, - "num_tokens": 3450060.0, - "step": 385 - }, - { - "epoch": 0.2933130699088146, - "grad_norm": 2.5163397789001465, - "learning_rate": 4.969384452647477e-06, - "loss": 0.5176984071731567, - "mean_token_accuracy": 0.8235267996788025, - "num_tokens": 3456990.0, - "step": 386 - }, - { - "epoch": 0.29407294832826747, - "grad_norm": 1.7588495016098022, - "learning_rate": 4.969056817079633e-06, - "loss": 0.49710947275161743, - "mean_token_accuracy": 0.818520724773407, - "num_tokens": 3468098.0, - "step": 387 - }, - { - "epoch": 0.2948328267477204, - "grad_norm": 2.6381046772003174, - "learning_rate": 4.968727448632669e-06, - "loss": 0.4425308108329773, - "mean_token_accuracy": 0.8451643586158752, - "num_tokens": 3472899.0, - "step": 388 - }, - { - "epoch": 0.29559270516717323, - "grad_norm": 1.6345038414001465, - "learning_rate": 4.968396347537751e-06, - "loss": 0.4177059829235077, - "mean_token_accuracy": 0.8498886227607727, - "num_tokens": 3484826.0, - "step": 389 - }, - { - "epoch": 0.29635258358662614, - "grad_norm": 3.0466468334198, - "learning_rate": 4.968063514027258e-06, - "loss": 0.4274463951587677, - "mean_token_accuracy": 0.8387278318405151, - "num_tokens": 3488610.0, - "step": 390 - }, - { - "epoch": 0.29711246200607905, - "grad_norm": 2.6509406566619873, - "learning_rate": 4.967728948334784e-06, - "loss": 0.5401753783226013, - "mean_token_accuracy": 0.8252490162849426, - "num_tokens": 3493657.0, - "step": 391 - }, - { - "epoch": 0.2978723404255319, - "grad_norm": 1.6372219324111938, - "learning_rate": 4.967392650695141e-06, - "loss": 0.3862472176551819, - "mean_token_accuracy": 0.8555525541305542, - "num_tokens": 3505588.0, - "step": 392 - }, - { - "epoch": 0.2986322188449848, - "grad_norm": 2.1615452766418457, - "learning_rate": 4.967054621344356e-06, - "loss": 0.57850581407547, - "mean_token_accuracy": 0.8222678899765015, - "num_tokens": 3514396.0, - "step": 393 - }, - { - "epoch": 0.2993920972644377, - "grad_norm": 1.8610916137695312, - "learning_rate": 4.96671486051967e-06, - "loss": 0.5440595149993896, - "mean_token_accuracy": 0.8196715116500854, - "num_tokens": 3523604.0, - "step": 394 - }, - { - "epoch": 0.30015197568389057, - "grad_norm": 2.9585862159729004, - "learning_rate": 4.966373368459542e-06, - "loss": 0.6921588182449341, - "mean_token_accuracy": 0.7816659808158875, - "num_tokens": 3529849.0, - "step": 395 - }, - { - "epoch": 0.3009118541033435, - "grad_norm": 1.9374035596847534, - "learning_rate": 4.966030145403642e-06, - "loss": 0.5494055151939392, - "mean_token_accuracy": 0.8126792907714844, - "num_tokens": 3539529.0, - "step": 396 - }, - { - "epoch": 0.30167173252279633, - "grad_norm": 1.730530023574829, - "learning_rate": 4.965685191592859e-06, - "loss": 0.4271572232246399, - "mean_token_accuracy": 0.8383668661117554, - "num_tokens": 3550972.0, - "step": 397 - }, - { - "epoch": 0.30243161094224924, - "grad_norm": 3.9635560512542725, - "learning_rate": 4.9653385072692935e-06, - "loss": 0.5576210021972656, - "mean_token_accuracy": 0.799404501914978, - "num_tokens": 3554147.0, - "step": 398 - }, - { - "epoch": 0.30319148936170215, - "grad_norm": 2.5731968879699707, - "learning_rate": 4.964990092676263e-06, - "loss": 0.5478942394256592, - "mean_token_accuracy": 0.8220961093902588, - "num_tokens": 3559972.0, - "step": 399 - }, - { - "epoch": 0.303951367781155, - "grad_norm": 2.2096588611602783, - "learning_rate": 4.964639948058297e-06, - "loss": 0.35461270809173584, - "mean_token_accuracy": 0.8640927076339722, - "num_tokens": 3565770.0, - "step": 400 - }, - { - "epoch": 0.3047112462006079, - "grad_norm": 1.7874189615249634, - "learning_rate": 4.964288073661142e-06, - "loss": 0.38849619030952454, - "mean_token_accuracy": 0.8443037271499634, - "num_tokens": 3574514.0, - "step": 401 - }, - { - "epoch": 0.30547112462006076, - "grad_norm": 1.5583146810531616, - "learning_rate": 4.963934469731756e-06, - "loss": 0.48909449577331543, - "mean_token_accuracy": 0.8429768681526184, - "num_tokens": 3585877.0, - "step": 402 - }, - { - "epoch": 0.30623100303951367, - "grad_norm": 3.026599645614624, - "learning_rate": 4.963579136518312e-06, - "loss": 0.5138992071151733, - "mean_token_accuracy": 0.8283728361129761, - "num_tokens": 3590412.0, - "step": 403 - }, - { - "epoch": 0.3069908814589666, - "grad_norm": 2.777505874633789, - "learning_rate": 4.963222074270197e-06, - "loss": 0.6241534948348999, - "mean_token_accuracy": 0.8130464553833008, - "num_tokens": 3596246.0, - "step": 404 - }, - { - "epoch": 0.30775075987841943, - "grad_norm": 2.4772839546203613, - "learning_rate": 4.962863283238011e-06, - "loss": 0.5930814146995544, - "mean_token_accuracy": 0.8036394715309143, - "num_tokens": 3602878.0, - "step": 405 - }, - { - "epoch": 0.30851063829787234, - "grad_norm": 1.5049982070922852, - "learning_rate": 4.962502763673566e-06, - "loss": 0.4903082549571991, - "mean_token_accuracy": 0.8184912204742432, - "num_tokens": 3617018.0, - "step": 406 - }, - { - "epoch": 0.30927051671732525, - "grad_norm": 2.453155040740967, - "learning_rate": 4.96214051582989e-06, - "loss": 0.5138067603111267, - "mean_token_accuracy": 0.8336835503578186, - "num_tokens": 3624188.0, - "step": 407 - }, - { - "epoch": 0.3100303951367781, - "grad_norm": 2.4038336277008057, - "learning_rate": 4.961776539961222e-06, - "loss": 0.5752760171890259, - "mean_token_accuracy": 0.8054730892181396, - "num_tokens": 3634152.0, - "step": 408 - }, - { - "epoch": 0.310790273556231, - "grad_norm": 2.629068374633789, - "learning_rate": 4.961410836323014e-06, - "loss": 0.5580606460571289, - "mean_token_accuracy": 0.8121089935302734, - "num_tokens": 3639528.0, - "step": 409 - }, - { - "epoch": 0.31155015197568386, - "grad_norm": 1.4245928525924683, - "learning_rate": 4.961043405171931e-06, - "loss": 0.5399882793426514, - "mean_token_accuracy": 0.812280535697937, - "num_tokens": 3655744.0, - "step": 410 - }, - { - "epoch": 0.3123100303951368, - "grad_norm": 1.5236459970474243, - "learning_rate": 4.9606742467658505e-06, - "loss": 0.5234690308570862, - "mean_token_accuracy": 0.8188928365707397, - "num_tokens": 3675010.0, - "step": 411 - }, - { - "epoch": 0.3130699088145897, - "grad_norm": 2.27961802482605, - "learning_rate": 4.960303361363863e-06, - "loss": 0.5502505898475647, - "mean_token_accuracy": 0.8161963224411011, - "num_tokens": 3682328.0, - "step": 412 - }, - { - "epoch": 0.31382978723404253, - "grad_norm": 1.554518222808838, - "learning_rate": 4.959930749226269e-06, - "loss": 0.420867919921875, - "mean_token_accuracy": 0.8499157428741455, - "num_tokens": 3694980.0, - "step": 413 - }, - { - "epoch": 0.31458966565349544, - "grad_norm": 2.609218120574951, - "learning_rate": 4.9595564106145825e-06, - "loss": 0.4706704318523407, - "mean_token_accuracy": 0.8412490487098694, - "num_tokens": 3700033.0, - "step": 414 - }, - { - "epoch": 0.31534954407294835, - "grad_norm": 1.5303231477737427, - "learning_rate": 4.959180345791528e-06, - "loss": 0.4668654799461365, - "mean_token_accuracy": 0.8125015497207642, - "num_tokens": 3715012.0, - "step": 415 - }, - { - "epoch": 0.3161094224924012, - "grad_norm": 1.2774665355682373, - "learning_rate": 4.958802555021042e-06, - "loss": 0.4339369237422943, - "mean_token_accuracy": 0.8442851901054382, - "num_tokens": 3733928.0, - "step": 416 - }, - { - "epoch": 0.3168693009118541, - "grad_norm": 2.1240181922912598, - "learning_rate": 4.958423038568274e-06, - "loss": 0.4029104709625244, - "mean_token_accuracy": 0.8627674579620361, - "num_tokens": 3740202.0, - "step": 417 - }, - { - "epoch": 0.31762917933130697, - "grad_norm": 2.00538969039917, - "learning_rate": 4.958041796699583e-06, - "loss": 0.5229607820510864, - "mean_token_accuracy": 0.8282366394996643, - "num_tokens": 3749308.0, - "step": 418 - }, - { - "epoch": 0.3183890577507599, - "grad_norm": 2.6555092334747314, - "learning_rate": 4.957658829682539e-06, - "loss": 0.5344101190567017, - "mean_token_accuracy": 0.8183202743530273, - "num_tokens": 3754595.0, - "step": 419 - }, - { - "epoch": 0.3191489361702128, - "grad_norm": 1.7468839883804321, - "learning_rate": 4.9572741377859225e-06, - "loss": 0.5667245984077454, - "mean_token_accuracy": 0.8080123662948608, - "num_tokens": 3765761.0, - "step": 420 - }, - { - "epoch": 0.31990881458966564, - "grad_norm": 2.9612457752227783, - "learning_rate": 4.956887721279726e-06, - "loss": 0.5389559864997864, - "mean_token_accuracy": 0.8019476532936096, - "num_tokens": 3770844.0, - "step": 421 - }, - { - "epoch": 0.32066869300911854, - "grad_norm": 1.842403769493103, - "learning_rate": 4.95649958043515e-06, - "loss": 0.38279837369918823, - "mean_token_accuracy": 0.858866810798645, - "num_tokens": 3778094.0, - "step": 422 - }, - { - "epoch": 0.32142857142857145, - "grad_norm": 2.3108131885528564, - "learning_rate": 4.956109715524609e-06, - "loss": 0.5453893542289734, - "mean_token_accuracy": 0.8085013031959534, - "num_tokens": 3785015.0, - "step": 423 - }, - { - "epoch": 0.3221884498480243, - "grad_norm": 3.0326945781707764, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.5550523400306702, - "mean_token_accuracy": 0.8125876188278198, - "num_tokens": 3789830.0, - "step": 424 - }, - { - "epoch": 0.3229483282674772, - "grad_norm": 1.8851977586746216, - "learning_rate": 4.955324814601324e-06, - "loss": 0.4902324974536896, - "mean_token_accuracy": 0.8205406665802002, - "num_tokens": 3799862.0, - "step": 425 - }, - { - "epoch": 0.32370820668693007, - "grad_norm": 2.6018171310424805, - "learning_rate": 4.954929779139455e-06, - "loss": 0.5920133590698242, - "mean_token_accuracy": 0.8340690732002258, - "num_tokens": 3806617.0, - "step": 426 - }, - { - "epoch": 0.324468085106383, - "grad_norm": 2.4283878803253174, - "learning_rate": 4.954533020713367e-06, - "loss": 0.5305854082107544, - "mean_token_accuracy": 0.8137468099594116, - "num_tokens": 3813843.0, - "step": 427 - }, - { - "epoch": 0.3252279635258359, - "grad_norm": 2.667978525161743, - "learning_rate": 4.954134539601519e-06, - "loss": 0.5333638787269592, - "mean_token_accuracy": 0.8402629494667053, - "num_tokens": 3819450.0, - "step": 428 - }, - { - "epoch": 0.32598784194528874, - "grad_norm": 1.7302523851394653, - "learning_rate": 4.953734336083582e-06, - "loss": 0.422895610332489, - "mean_token_accuracy": 0.8709704875946045, - "num_tokens": 3831027.0, - "step": 429 - }, - { - "epoch": 0.32674772036474165, - "grad_norm": 2.427192211151123, - "learning_rate": 4.953332410440434e-06, - "loss": 0.6334598064422607, - "mean_token_accuracy": 0.7817479968070984, - "num_tokens": 3841776.0, - "step": 430 - }, - { - "epoch": 0.32750759878419455, - "grad_norm": 1.460949182510376, - "learning_rate": 4.952928762954161e-06, - "loss": 0.3654777705669403, - "mean_token_accuracy": 0.8780122995376587, - "num_tokens": 3852213.0, - "step": 431 - }, - { - "epoch": 0.3282674772036474, - "grad_norm": 1.9855005741119385, - "learning_rate": 4.952523393908059e-06, - "loss": 0.5117089748382568, - "mean_token_accuracy": 0.811911404132843, - "num_tokens": 3861176.0, - "step": 432 - }, - { - "epoch": 0.3290273556231003, - "grad_norm": 2.2653207778930664, - "learning_rate": 4.952116303586631e-06, - "loss": 0.42514950037002563, - "mean_token_accuracy": 0.8448518514633179, - "num_tokens": 3867164.0, - "step": 433 - }, - { - "epoch": 0.32978723404255317, - "grad_norm": 1.9780964851379395, - "learning_rate": 4.951707492275589e-06, - "loss": 0.5095293521881104, - "mean_token_accuracy": 0.8262748718261719, - "num_tokens": 3876406.0, - "step": 434 - }, - { - "epoch": 0.3305471124620061, - "grad_norm": 2.9480233192443848, - "learning_rate": 4.951296960261853e-06, - "loss": 0.3494448959827423, - "mean_token_accuracy": 0.8781307935714722, - "num_tokens": 3880298.0, - "step": 435 - }, - { - "epoch": 0.331306990881459, - "grad_norm": 2.335571527481079, - "learning_rate": 4.95088470783355e-06, - "loss": 0.5456914901733398, - "mean_token_accuracy": 0.816297173500061, - "num_tokens": 3886487.0, - "step": 436 - }, - { - "epoch": 0.33206686930091184, - "grad_norm": 2.3046419620513916, - "learning_rate": 4.950470735280013e-06, - "loss": 0.4835948944091797, - "mean_token_accuracy": 0.8539175391197205, - "num_tokens": 3892706.0, - "step": 437 - }, - { - "epoch": 0.33282674772036475, - "grad_norm": 2.44047474861145, - "learning_rate": 4.950055042891786e-06, - "loss": 0.5154092907905579, - "mean_token_accuracy": 0.8579919338226318, - "num_tokens": 3899532.0, - "step": 438 - }, - { - "epoch": 0.33358662613981765, - "grad_norm": 4.826764106750488, - "learning_rate": 4.949637630960618e-06, - "loss": 0.5270259976387024, - "mean_token_accuracy": 0.8172192573547363, - "num_tokens": 3902260.0, - "step": 439 - }, - { - "epoch": 0.3343465045592705, - "grad_norm": 2.001574754714966, - "learning_rate": 4.949218499779462e-06, - "loss": 0.5413002967834473, - "mean_token_accuracy": 0.8162837028503418, - "num_tokens": 3911706.0, - "step": 440 - }, - { - "epoch": 0.3351063829787234, - "grad_norm": 1.7998944520950317, - "learning_rate": 4.948797649642484e-06, - "loss": 0.5131614208221436, - "mean_token_accuracy": 0.8367440700531006, - "num_tokens": 3923490.0, - "step": 441 - }, - { - "epoch": 0.33586626139817627, - "grad_norm": 3.4566173553466797, - "learning_rate": 4.94837508084505e-06, - "loss": 0.7258909940719604, - "mean_token_accuracy": 0.771377444267273, - "num_tokens": 3928099.0, - "step": 442 - }, - { - "epoch": 0.3366261398176292, - "grad_norm": 2.0040442943573, - "learning_rate": 4.9479507936837364e-06, - "loss": 0.482135534286499, - "mean_token_accuracy": 0.8339327573776245, - "num_tokens": 3937328.0, - "step": 443 - }, - { - "epoch": 0.3373860182370821, - "grad_norm": 2.949502944946289, - "learning_rate": 4.947524788456325e-06, - "loss": 0.6474795341491699, - "mean_token_accuracy": 0.7951677441596985, - "num_tokens": 3942529.0, - "step": 444 - }, - { - "epoch": 0.33814589665653494, - "grad_norm": 1.5528364181518555, - "learning_rate": 4.947097065461801e-06, - "loss": 0.48791584372520447, - "mean_token_accuracy": 0.8425545692443848, - "num_tokens": 3955200.0, - "step": 445 - }, - { - "epoch": 0.33890577507598785, - "grad_norm": 1.8813284635543823, - "learning_rate": 4.946667625000358e-06, - "loss": 0.45922309160232544, - "mean_token_accuracy": 0.8206527233123779, - "num_tokens": 3962975.0, - "step": 446 - }, - { - "epoch": 0.33966565349544076, - "grad_norm": 1.7157847881317139, - "learning_rate": 4.946236467373392e-06, - "loss": 0.5454182028770447, - "mean_token_accuracy": 0.8049604892730713, - "num_tokens": 3973956.0, - "step": 447 - }, - { - "epoch": 0.3404255319148936, - "grad_norm": 2.008857250213623, - "learning_rate": 4.945803592883509e-06, - "loss": 0.5151860117912292, - "mean_token_accuracy": 0.8262045383453369, - "num_tokens": 3982853.0, - "step": 448 - }, - { - "epoch": 0.3411854103343465, - "grad_norm": 1.6632496118545532, - "learning_rate": 4.9453690018345144e-06, - "loss": 0.42710691690444946, - "mean_token_accuracy": 0.8521314859390259, - "num_tokens": 3993838.0, - "step": 449 - }, - { - "epoch": 0.34194528875379937, - "grad_norm": 1.365234375, - "learning_rate": 4.944932694531423e-06, - "loss": 0.5172526836395264, - "mean_token_accuracy": 0.8277045488357544, - "num_tokens": 4014179.0, - "step": 450 - }, - { - "epoch": 0.3427051671732523, - "grad_norm": 1.7610243558883667, - "learning_rate": 4.94449467128045e-06, - "loss": 0.42104798555374146, - "mean_token_accuracy": 0.8552065491676331, - "num_tokens": 4023663.0, - "step": 451 - }, - { - "epoch": 0.3434650455927052, - "grad_norm": 2.3732354640960693, - "learning_rate": 4.944054932389018e-06, - "loss": 0.5471175909042358, - "mean_token_accuracy": 0.8487317562103271, - "num_tokens": 4030100.0, - "step": 452 - }, - { - "epoch": 0.34422492401215804, - "grad_norm": 1.5973623991012573, - "learning_rate": 4.943613478165753e-06, - "loss": 0.419813871383667, - "mean_token_accuracy": 0.8484025001525879, - "num_tokens": 4041124.0, - "step": 453 - }, - { - "epoch": 0.34498480243161095, - "grad_norm": 2.966381549835205, - "learning_rate": 4.943170308920484e-06, - "loss": 0.5370652675628662, - "mean_token_accuracy": 0.8439491987228394, - "num_tokens": 4045675.0, - "step": 454 - }, - { - "epoch": 0.34574468085106386, - "grad_norm": 2.5097248554229736, - "learning_rate": 4.9427254249642445e-06, - "loss": 0.5776349306106567, - "mean_token_accuracy": 0.8060523867607117, - "num_tokens": 4053250.0, - "step": 455 - }, - { - "epoch": 0.3465045592705167, - "grad_norm": 1.6779125928878784, - "learning_rate": 4.942278826609272e-06, - "loss": 0.5245476961135864, - "mean_token_accuracy": 0.8168526887893677, - "num_tokens": 4064106.0, - "step": 456 - }, - { - "epoch": 0.3472644376899696, - "grad_norm": 1.5945546627044678, - "learning_rate": 4.9418305141690045e-06, - "loss": 0.4972047209739685, - "mean_token_accuracy": 0.8257735967636108, - "num_tokens": 4077687.0, - "step": 457 - }, - { - "epoch": 0.34802431610942247, - "grad_norm": 2.864778757095337, - "learning_rate": 4.9413804879580865e-06, - "loss": 0.5372499823570251, - "mean_token_accuracy": 0.8423776626586914, - "num_tokens": 4082632.0, - "step": 458 - }, - { - "epoch": 0.3487841945288754, - "grad_norm": 1.4797078371047974, - "learning_rate": 4.940928748292363e-06, - "loss": 0.5903409719467163, - "mean_token_accuracy": 0.8061295747756958, - "num_tokens": 4104218.0, - "step": 459 - }, - { - "epoch": 0.3495440729483283, - "grad_norm": 2.4376983642578125, - "learning_rate": 4.940475295488882e-06, - "loss": 0.4534894824028015, - "mean_token_accuracy": 0.8395825028419495, - "num_tokens": 4110530.0, - "step": 460 - }, - { - "epoch": 0.35030395136778114, - "grad_norm": 1.2955626249313354, - "learning_rate": 4.940020129865895e-06, - "loss": 0.47155818343162537, - "mean_token_accuracy": 0.8253582715988159, - "num_tokens": 4128398.0, - "step": 461 - }, - { - "epoch": 0.35106382978723405, - "grad_norm": 2.066575527191162, - "learning_rate": 4.9395632517428546e-06, - "loss": 0.5555641651153564, - "mean_token_accuracy": 0.814624547958374, - "num_tokens": 4137623.0, - "step": 462 - }, - { - "epoch": 0.3518237082066869, - "grad_norm": 1.6407525539398193, - "learning_rate": 4.939104661440415e-06, - "loss": 0.4361790418624878, - "mean_token_accuracy": 0.8544459342956543, - "num_tokens": 4152803.0, - "step": 463 - }, - { - "epoch": 0.3525835866261398, - "grad_norm": 2.1685116291046143, - "learning_rate": 4.938644359280433e-06, - "loss": 0.5347012877464294, - "mean_token_accuracy": 0.853853702545166, - "num_tokens": 4160778.0, - "step": 464 - }, - { - "epoch": 0.3533434650455927, - "grad_norm": 1.8824869394302368, - "learning_rate": 4.938182345585967e-06, - "loss": 0.5512481927871704, - "mean_token_accuracy": 0.7985891699790955, - "num_tokens": 4170380.0, - "step": 465 - }, - { - "epoch": 0.3541033434650456, - "grad_norm": 2.2229504585266113, - "learning_rate": 4.937718620681273e-06, - "loss": 0.516828179359436, - "mean_token_accuracy": 0.8265621066093445, - "num_tokens": 4178179.0, - "step": 466 - }, - { - "epoch": 0.3548632218844985, - "grad_norm": 1.955990195274353, - "learning_rate": 4.9372531848918145e-06, - "loss": 0.5586158037185669, - "mean_token_accuracy": 0.8367916345596313, - "num_tokens": 4188626.0, - "step": 467 - }, - { - "epoch": 0.3556231003039514, - "grad_norm": 1.9687023162841797, - "learning_rate": 4.936786038544251e-06, - "loss": 0.5517531633377075, - "mean_token_accuracy": 0.8134098052978516, - "num_tokens": 4198144.0, - "step": 468 - }, - { - "epoch": 0.35638297872340424, - "grad_norm": 1.405516505241394, - "learning_rate": 4.9363171819664434e-06, - "loss": 0.5305492877960205, - "mean_token_accuracy": 0.8014427423477173, - "num_tokens": 4222818.0, - "step": 469 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 2.6355695724487305, - "learning_rate": 4.9358466154874535e-06, - "loss": 0.5303391218185425, - "mean_token_accuracy": 0.8028861284255981, - "num_tokens": 4228318.0, - "step": 470 - }, - { - "epoch": 0.35790273556231, - "grad_norm": 1.5133824348449707, - "learning_rate": 4.935374339437543e-06, - "loss": 0.5329189300537109, - "mean_token_accuracy": 0.8479441404342651, - "num_tokens": 4244527.0, - "step": 471 - }, - { - "epoch": 0.3586626139817629, - "grad_norm": 3.4356725215911865, - "learning_rate": 4.934900354148173e-06, - "loss": 0.5431582927703857, - "mean_token_accuracy": 0.8328983783721924, - "num_tokens": 4248034.0, - "step": 472 - }, - { - "epoch": 0.3594224924012158, - "grad_norm": 2.5789499282836914, - "learning_rate": 4.934424659952006e-06, - "loss": 0.4141455292701721, - "mean_token_accuracy": 0.8658635020256042, - "num_tokens": 4252953.0, - "step": 473 - }, - { - "epoch": 0.3601823708206687, - "grad_norm": 1.145262598991394, - "learning_rate": 4.933947257182901e-06, - "loss": 0.40294092893600464, - "mean_token_accuracy": 0.8565847277641296, - "num_tokens": 4277813.0, - "step": 474 - }, - { - "epoch": 0.3609422492401216, - "grad_norm": 1.7242133617401123, - "learning_rate": 4.933468146175918e-06, - "loss": 0.6036738753318787, - "mean_token_accuracy": 0.8072597980499268, - "num_tokens": 4291088.0, - "step": 475 - }, - { - "epoch": 0.3617021276595745, - "grad_norm": 2.3490941524505615, - "learning_rate": 4.932987327267317e-06, - "loss": 0.49456146359443665, - "mean_token_accuracy": 0.8372673988342285, - "num_tokens": 4297376.0, - "step": 476 - }, - { - "epoch": 0.36246200607902734, - "grad_norm": 1.3605526685714722, - "learning_rate": 4.932504800794553e-06, - "loss": 0.43595948815345764, - "mean_token_accuracy": 0.8415953516960144, - "num_tokens": 4312054.0, - "step": 477 - }, - { - "epoch": 0.36322188449848025, - "grad_norm": 1.4525885581970215, - "learning_rate": 4.9320205670962815e-06, - "loss": 0.5390371680259705, - "mean_token_accuracy": 0.8101649284362793, - "num_tokens": 4328701.0, - "step": 478 - }, - { - "epoch": 0.3639817629179331, - "grad_norm": 1.9862419366836548, - "learning_rate": 4.931534626512359e-06, - "loss": 0.45436930656433105, - "mean_token_accuracy": 0.8352861404418945, - "num_tokens": 4338372.0, - "step": 479 - }, - { - "epoch": 0.364741641337386, - "grad_norm": 1.7804961204528809, - "learning_rate": 4.931046979383836e-06, - "loss": 0.4677754044532776, - "mean_token_accuracy": 0.840467095375061, - "num_tokens": 4347897.0, - "step": 480 - }, - { - "epoch": 0.3655015197568389, - "grad_norm": 2.066632032394409, - "learning_rate": 4.930557626052961e-06, - "loss": 0.42418140172958374, - "mean_token_accuracy": 0.8528275489807129, - "num_tokens": 4354061.0, - "step": 481 - }, - { - "epoch": 0.3662613981762918, - "grad_norm": 1.6155282258987427, - "learning_rate": 4.930066566863182e-06, - "loss": 0.5424284934997559, - "mean_token_accuracy": 0.825040876865387, - "num_tokens": 4370400.0, - "step": 482 - }, - { - "epoch": 0.3670212765957447, - "grad_norm": 2.1452953815460205, - "learning_rate": 4.929573802159143e-06, - "loss": 0.5105804204940796, - "mean_token_accuracy": 0.8284053802490234, - "num_tokens": 4377579.0, - "step": 483 - }, - { - "epoch": 0.3677811550151976, - "grad_norm": 1.8940945863723755, - "learning_rate": 4.929079332286685e-06, - "loss": 0.43478304147720337, - "mean_token_accuracy": 0.8505665063858032, - "num_tokens": 4385686.0, - "step": 484 - }, - { - "epoch": 0.36854103343465044, - "grad_norm": 1.6785860061645508, - "learning_rate": 4.928583157592846e-06, - "loss": 0.40227848291397095, - "mean_token_accuracy": 0.8623573780059814, - "num_tokens": 4396128.0, - "step": 485 - }, - { - "epoch": 0.36930091185410335, - "grad_norm": 1.6416733264923096, - "learning_rate": 4.928085278425862e-06, - "loss": 0.526267409324646, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 4407963.0, - "step": 486 - }, - { - "epoch": 0.3700607902735562, - "grad_norm": 1.8882389068603516, - "learning_rate": 4.927585695135162e-06, - "loss": 0.5555213093757629, - "mean_token_accuracy": 0.8115293979644775, - "num_tokens": 4418057.0, - "step": 487 - }, - { - "epoch": 0.3708206686930091, - "grad_norm": 2.300248384475708, - "learning_rate": 4.9270844080713735e-06, - "loss": 0.5812339186668396, - "mean_token_accuracy": 0.800270676612854, - "num_tokens": 4425358.0, - "step": 488 - }, - { - "epoch": 0.371580547112462, - "grad_norm": 1.6802922487258911, - "learning_rate": 4.926581417586319e-06, - "loss": 0.5134941935539246, - "mean_token_accuracy": 0.8247408866882324, - "num_tokens": 4437702.0, - "step": 489 - }, - { - "epoch": 0.3723404255319149, - "grad_norm": 1.7620291709899902, - "learning_rate": 4.926076724033016e-06, - "loss": 0.5233973264694214, - "mean_token_accuracy": 0.8102161884307861, - "num_tokens": 4448584.0, - "step": 490 - }, - { - "epoch": 0.3731003039513678, - "grad_norm": 1.6911998987197876, - "learning_rate": 4.925570327765678e-06, - "loss": 0.5337274074554443, - "mean_token_accuracy": 0.845306396484375, - "num_tokens": 4462651.0, - "step": 491 - }, - { - "epoch": 0.3738601823708207, - "grad_norm": 1.7991242408752441, - "learning_rate": 4.9250622291397144e-06, - "loss": 0.31018948554992676, - "mean_token_accuracy": 0.8857606053352356, - "num_tokens": 4469971.0, - "step": 492 - }, - { - "epoch": 0.37462006079027355, - "grad_norm": 4.9776835441589355, - "learning_rate": 4.924552428511727e-06, - "loss": 0.44114983081817627, - "mean_token_accuracy": 0.8429906368255615, - "num_tokens": 4478275.0, - "step": 493 - }, - { - "epoch": 0.37537993920972645, - "grad_norm": 1.8007272481918335, - "learning_rate": 4.924040926239515e-06, - "loss": 0.574328601360321, - "mean_token_accuracy": 0.7669196128845215, - "num_tokens": 4491551.0, - "step": 494 - }, - { - "epoch": 0.3761398176291793, - "grad_norm": 2.021300792694092, - "learning_rate": 4.92352772268207e-06, - "loss": 0.45636120438575745, - "mean_token_accuracy": 0.840438723564148, - "num_tokens": 4498658.0, - "step": 495 - }, - { - "epoch": 0.3768996960486322, - "grad_norm": 2.369748592376709, - "learning_rate": 4.923012818199576e-06, - "loss": 0.5206376910209656, - "mean_token_accuracy": 0.8521823287010193, - "num_tokens": 4504648.0, - "step": 496 - }, - { - "epoch": 0.3776595744680851, - "grad_norm": 2.733485221862793, - "learning_rate": 4.922496213153416e-06, - "loss": 0.5067723989486694, - "mean_token_accuracy": 0.8168281316757202, - "num_tokens": 4509990.0, - "step": 497 - }, - { - "epoch": 0.378419452887538, - "grad_norm": 2.3751676082611084, - "learning_rate": 4.921977907906161e-06, - "loss": 0.49757206439971924, - "mean_token_accuracy": 0.8325017690658569, - "num_tokens": 4518373.0, - "step": 498 - }, - { - "epoch": 0.3791793313069909, - "grad_norm": 2.1672775745391846, - "learning_rate": 4.921457902821578e-06, - "loss": 0.4237566590309143, - "mean_token_accuracy": 0.8404698371887207, - "num_tokens": 4524338.0, - "step": 499 - }, - { - "epoch": 0.3799392097264438, - "grad_norm": 1.8374360799789429, - "learning_rate": 4.9209361982646275e-06, - "loss": 0.4995468854904175, - "mean_token_accuracy": 0.8299649953842163, - "num_tokens": 4533396.0, - "step": 500 - }, - { - "epoch": 0.38069908814589665, - "grad_norm": 2.083967924118042, - "learning_rate": 4.920412794601461e-06, - "loss": 0.489935040473938, - "mean_token_accuracy": 0.8315291404724121, - "num_tokens": 4540941.0, - "step": 501 - }, - { - "epoch": 0.38145896656534956, - "grad_norm": 2.2075610160827637, - "learning_rate": 4.919887692199423e-06, - "loss": 0.5233147740364075, - "mean_token_accuracy": 0.804171085357666, - "num_tokens": 4548215.0, - "step": 502 - }, - { - "epoch": 0.3822188449848024, - "grad_norm": 2.076775312423706, - "learning_rate": 4.9193608914270515e-06, - "loss": 0.5785550475120544, - "mean_token_accuracy": 0.7993186116218567, - "num_tokens": 4558204.0, - "step": 503 - }, - { - "epoch": 0.3829787234042553, - "grad_norm": 2.238546133041382, - "learning_rate": 4.918832392654075e-06, - "loss": 0.5287384390830994, - "mean_token_accuracy": 0.8214945793151855, - "num_tokens": 4565407.0, - "step": 504 - }, - { - "epoch": 0.3837386018237082, - "grad_norm": 1.6783074140548706, - "learning_rate": 4.9183021962514145e-06, - "loss": 0.6063359379768372, - "mean_token_accuracy": 0.7914625406265259, - "num_tokens": 4580991.0, - "step": 505 - }, - { - "epoch": 0.3844984802431611, - "grad_norm": 1.6287449598312378, - "learning_rate": 4.917770302591183e-06, - "loss": 0.3598247766494751, - "mean_token_accuracy": 0.8706809878349304, - "num_tokens": 4590579.0, - "step": 506 - }, - { - "epoch": 0.385258358662614, - "grad_norm": 1.5432041883468628, - "learning_rate": 4.917236712046682e-06, - "loss": 0.5267890095710754, - "mean_token_accuracy": 0.8032117486000061, - "num_tokens": 4608380.0, - "step": 507 - }, - { - "epoch": 0.3860182370820669, - "grad_norm": 1.7664037942886353, - "learning_rate": 4.9167014249924075e-06, - "loss": 0.3552354574203491, - "mean_token_accuracy": 0.8569793701171875, - "num_tokens": 4616426.0, - "step": 508 - }, - { - "epoch": 0.38677811550151975, - "grad_norm": 2.1147472858428955, - "learning_rate": 4.916164441804044e-06, - "loss": 0.5212404727935791, - "mean_token_accuracy": 0.8196578025817871, - "num_tokens": 4623908.0, - "step": 509 - }, - { - "epoch": 0.38753799392097266, - "grad_norm": 2.1092333793640137, - "learning_rate": 4.915625762858467e-06, - "loss": 0.5197038650512695, - "mean_token_accuracy": 0.8245604634284973, - "num_tokens": 4630956.0, - "step": 510 - }, - { - "epoch": 0.3882978723404255, - "grad_norm": 1.23331880569458, - "learning_rate": 4.915085388533743e-06, - "loss": 0.4759839177131653, - "mean_token_accuracy": 0.8192248344421387, - "num_tokens": 4651269.0, - "step": 511 - }, - { - "epoch": 0.3890577507598784, - "grad_norm": 2.424199104309082, - "learning_rate": 4.914543319209126e-06, - "loss": 0.5576270818710327, - "mean_token_accuracy": 0.8203302621841431, - "num_tokens": 4657296.0, - "step": 512 - }, - { - "epoch": 0.3898176291793313, - "grad_norm": 2.725156307220459, - "learning_rate": 4.913999555265062e-06, - "loss": 0.4337949752807617, - "mean_token_accuracy": 0.8382406234741211, - "num_tokens": 4661850.0, - "step": 513 - }, - { - "epoch": 0.3905775075987842, - "grad_norm": 2.3120534420013428, - "learning_rate": 4.913454097083185e-06, - "loss": 0.4941597580909729, - "mean_token_accuracy": 0.8302834033966064, - "num_tokens": 4667769.0, - "step": 514 - }, - { - "epoch": 0.3913373860182371, - "grad_norm": 2.3111207485198975, - "learning_rate": 4.912906945046319e-06, - "loss": 0.5253715515136719, - "mean_token_accuracy": 0.84515380859375, - "num_tokens": 4674537.0, - "step": 515 - }, - { - "epoch": 0.39209726443769, - "grad_norm": 1.4117841720581055, - "learning_rate": 4.912358099538476e-06, - "loss": 0.4521017074584961, - "mean_token_accuracy": 0.8208256959915161, - "num_tokens": 4690605.0, - "step": 516 - }, - { - "epoch": 0.39285714285714285, - "grad_norm": 2.3742799758911133, - "learning_rate": 4.911807560944858e-06, - "loss": 0.41572901606559753, - "mean_token_accuracy": 0.8550551533699036, - "num_tokens": 4706437.0, - "step": 517 - }, - { - "epoch": 0.39361702127659576, - "grad_norm": 2.4052202701568604, - "learning_rate": 4.911255329651852e-06, - "loss": 0.6003736257553101, - "mean_token_accuracy": 0.8247885704040527, - "num_tokens": 4712746.0, - "step": 518 - }, - { - "epoch": 0.3943768996960486, - "grad_norm": 1.9335490465164185, - "learning_rate": 4.910701406047037e-06, - "loss": 0.5457713603973389, - "mean_token_accuracy": 0.787429690361023, - "num_tokens": 4731937.0, - "step": 519 - }, - { - "epoch": 0.3951367781155015, - "grad_norm": 2.257706880569458, - "learning_rate": 4.910145790519177e-06, - "loss": 0.5300652980804443, - "mean_token_accuracy": 0.8192912936210632, - "num_tokens": 4739422.0, - "step": 520 - }, - { - "epoch": 0.3958966565349544, - "grad_norm": 1.2099462747573853, - "learning_rate": 4.9095884834582256e-06, - "loss": 0.45872747898101807, - "mean_token_accuracy": 0.8362667560577393, - "num_tokens": 4757113.0, - "step": 521 - }, - { - "epoch": 0.3966565349544073, - "grad_norm": 2.7991135120391846, - "learning_rate": 4.909029485255321e-06, - "loss": 0.49039560556411743, - "mean_token_accuracy": 0.8260016441345215, - "num_tokens": 4761709.0, - "step": 522 - }, - { - "epoch": 0.3974164133738602, - "grad_norm": 2.2360129356384277, - "learning_rate": 4.90846879630279e-06, - "loss": 0.49556830525398254, - "mean_token_accuracy": 0.827864408493042, - "num_tokens": 4769048.0, - "step": 523 - }, - { - "epoch": 0.3981762917933131, - "grad_norm": 2.5953688621520996, - "learning_rate": 4.907906416994146e-06, - "loss": 0.387208491563797, - "mean_token_accuracy": 0.8467001914978027, - "num_tokens": 4774637.0, - "step": 524 - }, - { - "epoch": 0.39893617021276595, - "grad_norm": 2.1046814918518066, - "learning_rate": 4.907342347724088e-06, - "loss": 0.5477259755134583, - "mean_token_accuracy": 0.8060322999954224, - "num_tokens": 4782774.0, - "step": 525 - }, - { - "epoch": 0.39969604863221886, - "grad_norm": 2.5622646808624268, - "learning_rate": 4.906776588888502e-06, - "loss": 0.5684159398078918, - "mean_token_accuracy": 0.8095303177833557, - "num_tokens": 4788766.0, - "step": 526 - }, - { - "epoch": 0.4004559270516717, - "grad_norm": 1.9027913808822632, - "learning_rate": 4.906209140884459e-06, - "loss": 0.535524845123291, - "mean_token_accuracy": 0.815237820148468, - "num_tokens": 4798492.0, - "step": 527 - }, - { - "epoch": 0.4012158054711246, - "grad_norm": 2.1447622776031494, - "learning_rate": 4.905640004110216e-06, - "loss": 0.5628632307052612, - "mean_token_accuracy": 0.8085395097732544, - "num_tokens": 4805737.0, - "step": 528 - }, - { - "epoch": 0.40197568389057753, - "grad_norm": 1.6754741668701172, - "learning_rate": 4.905069178965215e-06, - "loss": 0.5046736598014832, - "mean_token_accuracy": 0.8247535228729248, - "num_tokens": 4816912.0, - "step": 529 - }, - { - "epoch": 0.4027355623100304, - "grad_norm": 2.271230459213257, - "learning_rate": 4.904496665850083e-06, - "loss": 0.6086187958717346, - "mean_token_accuracy": 0.7935276627540588, - "num_tokens": 4824577.0, - "step": 530 - }, - { - "epoch": 0.4034954407294833, - "grad_norm": 2.107595205307007, - "learning_rate": 4.903922465166633e-06, - "loss": 0.5431341528892517, - "mean_token_accuracy": 0.8129537105560303, - "num_tokens": 4831772.0, - "step": 531 - }, - { - "epoch": 0.40425531914893614, - "grad_norm": 1.3860732316970825, - "learning_rate": 4.903346577317859e-06, - "loss": 0.45816320180892944, - "mean_token_accuracy": 0.8328287601470947, - "num_tokens": 4850302.0, - "step": 532 - }, - { - "epoch": 0.40501519756838905, - "grad_norm": 1.9186837673187256, - "learning_rate": 4.902769002707942e-06, - "loss": 0.3294633626937866, - "mean_token_accuracy": 0.8853933811187744, - "num_tokens": 4856624.0, - "step": 533 - }, - { - "epoch": 0.40577507598784196, - "grad_norm": 1.516194462776184, - "learning_rate": 4.902189741742247e-06, - "loss": 0.45482105016708374, - "mean_token_accuracy": 0.8370342254638672, - "num_tokens": 4870395.0, - "step": 534 - }, - { - "epoch": 0.4065349544072948, - "grad_norm": 2.3235628604888916, - "learning_rate": 4.901608794827321e-06, - "loss": 0.40688639879226685, - "mean_token_accuracy": 0.8643521666526794, - "num_tokens": 4875645.0, - "step": 535 - }, - { - "epoch": 0.4072948328267477, - "grad_norm": 2.29286527633667, - "learning_rate": 4.9010261623708945e-06, - "loss": 0.45482826232910156, - "mean_token_accuracy": 0.8429383039474487, - "num_tokens": 4881772.0, - "step": 536 - }, - { - "epoch": 0.40805471124620063, - "grad_norm": 1.5907070636749268, - "learning_rate": 4.900441844781882e-06, - "loss": 0.5266948342323303, - "mean_token_accuracy": 0.8348641395568848, - "num_tokens": 4894289.0, - "step": 537 - }, - { - "epoch": 0.4088145896656535, - "grad_norm": 2.1816294193267822, - "learning_rate": 4.89985584247038e-06, - "loss": 0.4797617793083191, - "mean_token_accuracy": 0.8549500703811646, - "num_tokens": 4901106.0, - "step": 538 - }, - { - "epoch": 0.4095744680851064, - "grad_norm": 1.7347146272659302, - "learning_rate": 4.899268155847667e-06, - "loss": 0.4754739999771118, - "mean_token_accuracy": 0.8278418183326721, - "num_tokens": 4912131.0, - "step": 539 - }, - { - "epoch": 0.41033434650455924, - "grad_norm": 2.0694527626037598, - "learning_rate": 4.898678785326205e-06, - "loss": 0.5071008801460266, - "mean_token_accuracy": 0.8157946467399597, - "num_tokens": 4921141.0, - "step": 540 - }, - { - "epoch": 0.41109422492401215, - "grad_norm": 2.570047616958618, - "learning_rate": 4.898087731319637e-06, - "loss": 0.43639278411865234, - "mean_token_accuracy": 0.8682913780212402, - "num_tokens": 4926182.0, - "step": 541 - }, - { - "epoch": 0.41185410334346506, - "grad_norm": 4.064006805419922, - "learning_rate": 4.8974949942427854e-06, - "loss": 0.539260745048523, - "mean_token_accuracy": 0.8225528001785278, - "num_tokens": 4929449.0, - "step": 542 - }, - { - "epoch": 0.4126139817629179, - "grad_norm": 1.7644332647323608, - "learning_rate": 4.896900574511657e-06, - "loss": 0.472618043422699, - "mean_token_accuracy": 0.8332902193069458, - "num_tokens": 4939443.0, - "step": 543 - }, - { - "epoch": 0.4133738601823708, - "grad_norm": 2.879918336868286, - "learning_rate": 4.89630447254344e-06, - "loss": 0.6360667943954468, - "mean_token_accuracy": 0.8215296268463135, - "num_tokens": 4950838.0, - "step": 544 - }, - { - "epoch": 0.41413373860182373, - "grad_norm": 1.4575570821762085, - "learning_rate": 4.8957066887565005e-06, - "loss": 0.45617997646331787, - "mean_token_accuracy": 0.8373187184333801, - "num_tokens": 4965222.0, - "step": 545 - }, - { - "epoch": 0.4148936170212766, - "grad_norm": 2.4829535484313965, - "learning_rate": 4.895107223570386e-06, - "loss": 0.42285341024398804, - "mean_token_accuracy": 0.8686380386352539, - "num_tokens": 4970724.0, - "step": 546 - }, - { - "epoch": 0.4156534954407295, - "grad_norm": 2.639474630355835, - "learning_rate": 4.894506077405824e-06, - "loss": 0.5906289219856262, - "mean_token_accuracy": 0.8174435496330261, - "num_tokens": 4976766.0, - "step": 547 - }, - { - "epoch": 0.41641337386018235, - "grad_norm": 2.7960562705993652, - "learning_rate": 4.893903250684723e-06, - "loss": 0.4518949091434479, - "mean_token_accuracy": 0.8387585282325745, - "num_tokens": 4980991.0, - "step": 548 - }, - { - "epoch": 0.41717325227963525, - "grad_norm": 2.184176206588745, - "learning_rate": 4.893298743830168e-06, - "loss": 0.5223842859268188, - "mean_token_accuracy": 0.8170937299728394, - "num_tokens": 4987781.0, - "step": 549 - }, - { - "epoch": 0.41793313069908816, - "grad_norm": 2.2393438816070557, - "learning_rate": 4.892692557266429e-06, - "loss": 0.5238431692123413, - "mean_token_accuracy": 0.8217905759811401, - "num_tokens": 4994321.0, - "step": 550 - }, - { - "epoch": 0.418693009118541, - "grad_norm": 3.579047441482544, - "learning_rate": 4.8920846914189465e-06, - "loss": 0.5367584228515625, - "mean_token_accuracy": 0.8312011361122131, - "num_tokens": 4997951.0, - "step": 551 - }, - { - "epoch": 0.4194528875379939, - "grad_norm": 1.6330240964889526, - "learning_rate": 4.891475146714348e-06, - "loss": 0.6054705381393433, - "mean_token_accuracy": 0.7938206791877747, - "num_tokens": 5012726.0, - "step": 552 - }, - { - "epoch": 0.42021276595744683, - "grad_norm": 1.5775716304779053, - "learning_rate": 4.8908639235804324e-06, - "loss": 0.4774656891822815, - "mean_token_accuracy": 0.828762948513031, - "num_tokens": 5026751.0, - "step": 553 - }, - { - "epoch": 0.4209726443768997, - "grad_norm": 1.5719101428985596, - "learning_rate": 4.890251022446181e-06, - "loss": 0.549429178237915, - "mean_token_accuracy": 0.8110791444778442, - "num_tokens": 5041861.0, - "step": 554 - }, - { - "epoch": 0.4217325227963526, - "grad_norm": 1.8585275411605835, - "learning_rate": 4.889636443741752e-06, - "loss": 0.4448118805885315, - "mean_token_accuracy": 0.8462690711021423, - "num_tokens": 5052690.0, - "step": 555 - }, - { - "epoch": 0.42249240121580545, - "grad_norm": 2.189202070236206, - "learning_rate": 4.88902018789848e-06, - "loss": 0.4296762943267822, - "mean_token_accuracy": 0.8488791584968567, - "num_tokens": 5058964.0, - "step": 556 - }, - { - "epoch": 0.42325227963525835, - "grad_norm": 1.9328460693359375, - "learning_rate": 4.888402255348877e-06, - "loss": 0.5369474291801453, - "mean_token_accuracy": 0.8184729814529419, - "num_tokens": 5068465.0, - "step": 557 - }, - { - "epoch": 0.42401215805471126, - "grad_norm": 1.6233323812484741, - "learning_rate": 4.887782646526631e-06, - "loss": 0.5284391641616821, - "mean_token_accuracy": 0.8276044726371765, - "num_tokens": 5081052.0, - "step": 558 - }, - { - "epoch": 0.4247720364741641, - "grad_norm": 2.222813844680786, - "learning_rate": 4.887161361866608e-06, - "loss": 0.5679137706756592, - "mean_token_accuracy": 0.8012375831604004, - "num_tokens": 5090001.0, - "step": 559 - }, - { - "epoch": 0.425531914893617, - "grad_norm": 2.1062207221984863, - "learning_rate": 4.8865384018048494e-06, - "loss": 0.5554201602935791, - "mean_token_accuracy": 0.8128066062927246, - "num_tokens": 5097644.0, - "step": 560 - }, - { - "epoch": 0.42629179331306993, - "grad_norm": 1.5380984544754028, - "learning_rate": 4.8859137667785735e-06, - "loss": 0.4948265850543976, - "mean_token_accuracy": 0.8258291482925415, - "num_tokens": 5110069.0, - "step": 561 - }, - { - "epoch": 0.4270516717325228, - "grad_norm": 2.0290257930755615, - "learning_rate": 4.8852874572261715e-06, - "loss": 0.4969530403614044, - "mean_token_accuracy": 0.8297134637832642, - "num_tokens": 5117452.0, - "step": 562 - }, - { - "epoch": 0.4278115501519757, - "grad_norm": 1.5651452541351318, - "learning_rate": 4.884659473587213e-06, - "loss": 0.5353102087974548, - "mean_token_accuracy": 0.8161719441413879, - "num_tokens": 5133756.0, - "step": 563 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 2.2470998764038086, - "learning_rate": 4.884029816302441e-06, - "loss": 0.5104288458824158, - "mean_token_accuracy": 0.8081635236740112, - "num_tokens": 5140278.0, - "step": 564 - }, - { - "epoch": 0.42933130699088146, - "grad_norm": 1.726891279220581, - "learning_rate": 4.883398485813772e-06, - "loss": 0.4508771002292633, - "mean_token_accuracy": 0.8548800349235535, - "num_tokens": 5150115.0, - "step": 565 - }, - { - "epoch": 0.43009118541033436, - "grad_norm": 1.4779289960861206, - "learning_rate": 4.8827654825642984e-06, - "loss": 0.46861088275909424, - "mean_token_accuracy": 0.8209476470947266, - "num_tokens": 5163225.0, - "step": 566 - }, - { - "epoch": 0.4308510638297872, - "grad_norm": 1.2361034154891968, - "learning_rate": 4.882130806998287e-06, - "loss": 0.4591076672077179, - "mean_token_accuracy": 0.803041934967041, - "num_tokens": 5180342.0, - "step": 567 - }, - { - "epoch": 0.4316109422492401, - "grad_norm": 1.882467269897461, - "learning_rate": 4.881494459561177e-06, - "loss": 0.579258143901825, - "mean_token_accuracy": 0.8007112741470337, - "num_tokens": 5189595.0, - "step": 568 - }, - { - "epoch": 0.43237082066869303, - "grad_norm": 1.095462441444397, - "learning_rate": 4.880856440699582e-06, - "loss": 0.3806574046611786, - "mean_token_accuracy": 0.8650111556053162, - "num_tokens": 5211642.0, - "step": 569 - }, - { - "epoch": 0.4331306990881459, - "grad_norm": 1.6469846963882446, - "learning_rate": 4.880216750861288e-06, - "loss": 0.544589638710022, - "mean_token_accuracy": 0.8060122728347778, - "num_tokens": 5224137.0, - "step": 570 - }, - { - "epoch": 0.4338905775075988, - "grad_norm": 1.8561251163482666, - "learning_rate": 4.879575390495254e-06, - "loss": 0.4094924330711365, - "mean_token_accuracy": 0.8591406345367432, - "num_tokens": 5231588.0, - "step": 571 - }, - { - "epoch": 0.43465045592705165, - "grad_norm": 3.01326847076416, - "learning_rate": 4.878932360051611e-06, - "loss": 0.6139192581176758, - "mean_token_accuracy": 0.8108739852905273, - "num_tokens": 5236853.0, - "step": 572 - }, - { - "epoch": 0.43541033434650456, - "grad_norm": 2.1753034591674805, - "learning_rate": 4.878287659981663e-06, - "loss": 0.49082931876182556, - "mean_token_accuracy": 0.862828254699707, - "num_tokens": 5243264.0, - "step": 573 - }, - { - "epoch": 0.43617021276595747, - "grad_norm": 1.4437755346298218, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.5608728528022766, - "mean_token_accuracy": 0.8271626234054565, - "num_tokens": 5261757.0, - "step": 574 - }, - { - "epoch": 0.4369300911854103, - "grad_norm": 1.786683440208435, - "learning_rate": 4.876993252773923e-06, - "loss": 0.4377627968788147, - "mean_token_accuracy": 0.844936192035675, - "num_tokens": 5271038.0, - "step": 575 - }, - { - "epoch": 0.4376899696048632, - "grad_norm": 1.3425915241241455, - "learning_rate": 4.876343546544596e-06, - "loss": 0.44762521982192993, - "mean_token_accuracy": 0.8397793769836426, - "num_tokens": 5285555.0, - "step": 576 - }, - { - "epoch": 0.43844984802431614, - "grad_norm": 2.1549675464630127, - "learning_rate": 4.8756921725058935e-06, - "loss": 0.5332942008972168, - "mean_token_accuracy": 0.820149302482605, - "num_tokens": 5294595.0, - "step": 577 - }, - { - "epoch": 0.439209726443769, - "grad_norm": 1.5254042148590088, - "learning_rate": 4.875039131114975e-06, - "loss": 0.3646543622016907, - "mean_token_accuracy": 0.8442583084106445, - "num_tokens": 5304955.0, - "step": 578 - }, - { - "epoch": 0.4399696048632219, - "grad_norm": 1.5751557350158691, - "learning_rate": 4.8743844228301676e-06, - "loss": 0.4854734539985657, - "mean_token_accuracy": 0.8317523002624512, - "num_tokens": 5317351.0, - "step": 579 - }, - { - "epoch": 0.44072948328267475, - "grad_norm": 1.6950466632843018, - "learning_rate": 4.873728048110973e-06, - "loss": 0.5907570719718933, - "mean_token_accuracy": 0.7946986556053162, - "num_tokens": 5332542.0, - "step": 580 - }, - { - "epoch": 0.44148936170212766, - "grad_norm": 2.1180708408355713, - "learning_rate": 4.873070007418059e-06, - "loss": 0.5220296382904053, - "mean_token_accuracy": 0.8037363290786743, - "num_tokens": 5341722.0, - "step": 581 - }, - { - "epoch": 0.44224924012158057, - "grad_norm": 1.3643816709518433, - "learning_rate": 4.872410301213265e-06, - "loss": 0.4865502417087555, - "mean_token_accuracy": 0.8377852439880371, - "num_tokens": 5359359.0, - "step": 582 - }, - { - "epoch": 0.4430091185410334, - "grad_norm": 1.483280897140503, - "learning_rate": 4.871748929959598e-06, - "loss": 0.36856764554977417, - "mean_token_accuracy": 0.8709549903869629, - "num_tokens": 5369749.0, - "step": 583 - }, - { - "epoch": 0.44376899696048633, - "grad_norm": 1.6891541481018066, - "learning_rate": 4.871085894121234e-06, - "loss": 0.5768930912017822, - "mean_token_accuracy": 0.8030461668968201, - "num_tokens": 5383912.0, - "step": 584 - }, - { - "epoch": 0.44452887537993924, - "grad_norm": 2.1318740844726562, - "learning_rate": 4.870421194163515e-06, - "loss": 0.4337100386619568, - "mean_token_accuracy": 0.8562518358230591, - "num_tokens": 5389412.0, - "step": 585 - }, - { - "epoch": 0.4452887537993921, - "grad_norm": 2.540255546569824, - "learning_rate": 4.869754830552956e-06, - "loss": 0.4708256125450134, - "mean_token_accuracy": 0.8446552753448486, - "num_tokens": 5394762.0, - "step": 586 - }, - { - "epoch": 0.446048632218845, - "grad_norm": 2.048015594482422, - "learning_rate": 4.869086803757235e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8181137442588806, - "num_tokens": 5402379.0, - "step": 587 - }, - { - "epoch": 0.44680851063829785, - "grad_norm": 2.9821012020111084, - "learning_rate": 4.868417114245199e-06, - "loss": 0.6299797296524048, - "mean_token_accuracy": 0.8237329125404358, - "num_tokens": 5408229.0, - "step": 588 - }, - { - "epoch": 0.44756838905775076, - "grad_norm": 1.7807202339172363, - "learning_rate": 4.867745762486862e-06, - "loss": 0.5176759958267212, - "mean_token_accuracy": 0.8184244632720947, - "num_tokens": 5418383.0, - "step": 589 - }, - { - "epoch": 0.44832826747720367, - "grad_norm": 1.5466399192810059, - "learning_rate": 4.8670727489534035e-06, - "loss": 0.5137228965759277, - "mean_token_accuracy": 0.8365053534507751, - "num_tokens": 5432127.0, - "step": 590 - }, - { - "epoch": 0.4490881458966565, - "grad_norm": 2.9521141052246094, - "learning_rate": 4.866398074117173e-06, - "loss": 0.4056887924671173, - "mean_token_accuracy": 0.8561501502990723, - "num_tokens": 5436062.0, - "step": 591 - }, - { - "epoch": 0.44984802431610943, - "grad_norm": 2.058743953704834, - "learning_rate": 4.86572173845168e-06, - "loss": 0.6124799251556396, - "mean_token_accuracy": 0.8007957339286804, - "num_tokens": 5444989.0, - "step": 592 - }, - { - "epoch": 0.4506079027355623, - "grad_norm": 2.1243767738342285, - "learning_rate": 4.865043742431605e-06, - "loss": 0.5659694671630859, - "mean_token_accuracy": 0.8084750175476074, - "num_tokens": 5453865.0, - "step": 593 - }, - { - "epoch": 0.4513677811550152, - "grad_norm": 1.6732314825057983, - "learning_rate": 4.864364086532792e-06, - "loss": 0.47879064083099365, - "mean_token_accuracy": 0.8346436023712158, - "num_tokens": 5466398.0, - "step": 594 - }, - { - "epoch": 0.4521276595744681, - "grad_norm": 1.3793858289718628, - "learning_rate": 4.863682771232249e-06, - "loss": 0.45989373326301575, - "mean_token_accuracy": 0.8254791498184204, - "num_tokens": 5482121.0, - "step": 595 - }, - { - "epoch": 0.45288753799392095, - "grad_norm": 1.9812315702438354, - "learning_rate": 4.862999797008149e-06, - "loss": 0.5778874754905701, - "mean_token_accuracy": 0.8041508197784424, - "num_tokens": 5493000.0, - "step": 596 - }, - { - "epoch": 0.45364741641337386, - "grad_norm": 3.3065083026885986, - "learning_rate": 4.862315164339829e-06, - "loss": 0.4623975157737732, - "mean_token_accuracy": 0.8426318168640137, - "num_tokens": 5496723.0, - "step": 597 - }, - { - "epoch": 0.45440729483282677, - "grad_norm": 3.167119026184082, - "learning_rate": 4.861628873707792e-06, - "loss": 0.6984533667564392, - "mean_token_accuracy": 0.772136926651001, - "num_tokens": 5501161.0, - "step": 598 - }, - { - "epoch": 0.4551671732522796, - "grad_norm": 2.2130985260009766, - "learning_rate": 4.860940925593703e-06, - "loss": 0.4823192059993744, - "mean_token_accuracy": 0.8462972640991211, - "num_tokens": 5509544.0, - "step": 599 - }, - { - "epoch": 0.45592705167173253, - "grad_norm": 3.029191732406616, - "learning_rate": 4.86025132048039e-06, - "loss": 0.523664116859436, - "mean_token_accuracy": 0.8229140043258667, - "num_tokens": 5514586.0, - "step": 600 - }, - { - "epoch": 0.4566869300911854, - "grad_norm": 1.6983962059020996, - "learning_rate": 4.859560058851844e-06, - "loss": 0.4832698106765747, - "mean_token_accuracy": 0.8403248190879822, - "num_tokens": 5525773.0, - "step": 601 - }, - { - "epoch": 0.4574468085106383, - "grad_norm": 3.0504038333892822, - "learning_rate": 4.8588671411932195e-06, - "loss": 0.5158926248550415, - "mean_token_accuracy": 0.8098392486572266, - "num_tokens": 5529739.0, - "step": 602 - }, - { - "epoch": 0.4582066869300912, - "grad_norm": 2.584836483001709, - "learning_rate": 4.858172567990832e-06, - "loss": 0.5724587440490723, - "mean_token_accuracy": 0.8128519058227539, - "num_tokens": 5535763.0, - "step": 603 - }, - { - "epoch": 0.45896656534954405, - "grad_norm": 2.0514042377471924, - "learning_rate": 4.857476339732162e-06, - "loss": 0.4337679445743561, - "mean_token_accuracy": 0.8405929207801819, - "num_tokens": 5543075.0, - "step": 604 - }, - { - "epoch": 0.45972644376899696, - "grad_norm": 2.2949347496032715, - "learning_rate": 4.856778456905846e-06, - "loss": 0.46532145142555237, - "mean_token_accuracy": 0.8345137238502502, - "num_tokens": 5549035.0, - "step": 605 - }, - { - "epoch": 0.46048632218844987, - "grad_norm": 2.2067551612854004, - "learning_rate": 4.856078920001689e-06, - "loss": 0.5855136513710022, - "mean_token_accuracy": 0.8043795228004456, - "num_tokens": 5555545.0, - "step": 606 - }, - { - "epoch": 0.4612462006079027, - "grad_norm": 2.101945161819458, - "learning_rate": 4.855377729510648e-06, - "loss": 0.6071814298629761, - "mean_token_accuracy": 0.7973253130912781, - "num_tokens": 5563615.0, - "step": 607 - }, - { - "epoch": 0.46200607902735563, - "grad_norm": 2.5958821773529053, - "learning_rate": 4.8546748859248504e-06, - "loss": 0.6278061866760254, - "mean_token_accuracy": 0.7864972352981567, - "num_tokens": 5570078.0, - "step": 608 - }, - { - "epoch": 0.4627659574468085, - "grad_norm": 2.778101921081543, - "learning_rate": 4.853970389737576e-06, - "loss": 0.35521194338798523, - "mean_token_accuracy": 0.8752605319023132, - "num_tokens": 5573995.0, - "step": 609 - }, - { - "epoch": 0.4635258358662614, - "grad_norm": 2.600534677505493, - "learning_rate": 4.8532642414432675e-06, - "loss": 0.6541563868522644, - "mean_token_accuracy": 0.7843613028526306, - "num_tokens": 5580333.0, - "step": 610 - }, - { - "epoch": 0.4642857142857143, - "grad_norm": 1.778337836265564, - "learning_rate": 4.852556441537528e-06, - "loss": 0.3561405837535858, - "mean_token_accuracy": 0.8579353094100952, - "num_tokens": 5588430.0, - "step": 611 - }, - { - "epoch": 0.46504559270516715, - "grad_norm": 1.5653862953186035, - "learning_rate": 4.851846990517118e-06, - "loss": 0.6067906618118286, - "mean_token_accuracy": 0.7919317483901978, - "num_tokens": 5601700.0, - "step": 612 - }, - { - "epoch": 0.46580547112462006, - "grad_norm": 1.6097723245620728, - "learning_rate": 4.851135888879958e-06, - "loss": 0.446664422750473, - "mean_token_accuracy": 0.8441969156265259, - "num_tokens": 5612063.0, - "step": 613 - }, - { - "epoch": 0.46656534954407297, - "grad_norm": 1.961207389831543, - "learning_rate": 4.850423137125126e-06, - "loss": 0.5508605241775513, - "mean_token_accuracy": 0.8240450024604797, - "num_tokens": 5620245.0, - "step": 614 - }, - { - "epoch": 0.4673252279635258, - "grad_norm": 2.2189085483551025, - "learning_rate": 4.8497087357528585e-06, - "loss": 0.6805076599121094, - "mean_token_accuracy": 0.771978497505188, - "num_tokens": 5629590.0, - "step": 615 - }, - { - "epoch": 0.46808510638297873, - "grad_norm": 2.5176279544830322, - "learning_rate": 4.8489926852645505e-06, - "loss": 0.4512156844139099, - "mean_token_accuracy": 0.836459755897522, - "num_tokens": 5635259.0, - "step": 616 - }, - { - "epoch": 0.4688449848024316, - "grad_norm": 1.5327287912368774, - "learning_rate": 4.848274986162754e-06, - "loss": 0.4884302616119385, - "mean_token_accuracy": 0.8194037079811096, - "num_tokens": 5649993.0, - "step": 617 - }, - { - "epoch": 0.4696048632218845, - "grad_norm": 2.184554100036621, - "learning_rate": 4.847555638951177e-06, - "loss": 0.5141451358795166, - "mean_token_accuracy": 0.8245922327041626, - "num_tokens": 5657375.0, - "step": 618 - }, - { - "epoch": 0.4703647416413374, - "grad_norm": 1.6143407821655273, - "learning_rate": 4.846834644134686e-06, - "loss": 0.4276641607284546, - "mean_token_accuracy": 0.8481845855712891, - "num_tokens": 5667941.0, - "step": 619 - }, - { - "epoch": 0.47112462006079026, - "grad_norm": 2.3747270107269287, - "learning_rate": 4.846112002219301e-06, - "loss": 0.5608246922492981, - "mean_token_accuracy": 0.8073011040687561, - "num_tokens": 5675042.0, - "step": 620 - }, - { - "epoch": 0.47188449848024316, - "grad_norm": 2.390404224395752, - "learning_rate": 4.845387713712203e-06, - "loss": 0.46616724133491516, - "mean_token_accuracy": 0.8468319177627563, - "num_tokens": 5680207.0, - "step": 621 - }, - { - "epoch": 0.4726443768996961, - "grad_norm": 1.7245099544525146, - "learning_rate": 4.844661779121723e-06, - "loss": 0.5652435421943665, - "mean_token_accuracy": 0.8010749816894531, - "num_tokens": 5693759.0, - "step": 622 - }, - { - "epoch": 0.4734042553191489, - "grad_norm": 2.6923108100891113, - "learning_rate": 4.843934198957351e-06, - "loss": 0.6254661679267883, - "mean_token_accuracy": 0.8236024975776672, - "num_tokens": 5699916.0, - "step": 623 - }, - { - "epoch": 0.47416413373860183, - "grad_norm": 2.516901969909668, - "learning_rate": 4.84320497372973e-06, - "loss": 0.6334252953529358, - "mean_token_accuracy": 0.7803834676742554, - "num_tokens": 5706554.0, - "step": 624 - }, - { - "epoch": 0.4749240121580547, - "grad_norm": 2.3744447231292725, - "learning_rate": 4.842474103950658e-06, - "loss": 0.4221811890602112, - "mean_token_accuracy": 0.8639545440673828, - "num_tokens": 5711756.0, - "step": 625 - }, - { - "epoch": 0.4756838905775076, - "grad_norm": 3.2373476028442383, - "learning_rate": 4.841741590133089e-06, - "loss": 0.6637828946113586, - "mean_token_accuracy": 0.7968347072601318, - "num_tokens": 5716458.0, - "step": 626 - }, - { - "epoch": 0.4764437689969605, - "grad_norm": 2.153888463973999, - "learning_rate": 4.841007432791129e-06, - "loss": 0.4877486228942871, - "mean_token_accuracy": 0.8345249891281128, - "num_tokens": 5723155.0, - "step": 627 - }, - { - "epoch": 0.47720364741641336, - "grad_norm": 2.120497703552246, - "learning_rate": 4.8402716324400375e-06, - "loss": 0.37323033809661865, - "mean_token_accuracy": 0.8734050393104553, - "num_tokens": 5729171.0, - "step": 628 - }, - { - "epoch": 0.47796352583586627, - "grad_norm": 1.5294172763824463, - "learning_rate": 4.839534189596228e-06, - "loss": 0.4057067334651947, - "mean_token_accuracy": 0.8523319959640503, - "num_tokens": 5740112.0, - "step": 629 - }, - { - "epoch": 0.4787234042553192, - "grad_norm": 2.1913886070251465, - "learning_rate": 4.8387951047772656e-06, - "loss": 0.4835960865020752, - "mean_token_accuracy": 0.8438145518302917, - "num_tokens": 5746838.0, - "step": 630 - }, - { - "epoch": 0.479483282674772, - "grad_norm": 1.482897162437439, - "learning_rate": 4.838054378501868e-06, - "loss": 0.46967992186546326, - "mean_token_accuracy": 0.8315759897232056, - "num_tokens": 5760428.0, - "step": 631 - }, - { - "epoch": 0.48024316109422494, - "grad_norm": 1.38850998878479, - "learning_rate": 4.837312011289907e-06, - "loss": 0.41845446825027466, - "mean_token_accuracy": 0.8557186126708984, - "num_tokens": 5773437.0, - "step": 632 - }, - { - "epoch": 0.4810030395136778, - "grad_norm": 3.8337457180023193, - "learning_rate": 4.836568003662403e-06, - "loss": 0.5102912187576294, - "mean_token_accuracy": 0.830644965171814, - "num_tokens": 5776367.0, - "step": 633 - }, - { - "epoch": 0.4817629179331307, - "grad_norm": 1.2084007263183594, - "learning_rate": 4.8358223561415304e-06, - "loss": 0.3835333585739136, - "mean_token_accuracy": 0.8639016151428223, - "num_tokens": 5792246.0, - "step": 634 - }, - { - "epoch": 0.4825227963525836, - "grad_norm": 1.939408540725708, - "learning_rate": 4.835075069250613e-06, - "loss": 0.4044850468635559, - "mean_token_accuracy": 0.8488376140594482, - "num_tokens": 5799853.0, - "step": 635 - }, - { - "epoch": 0.48328267477203646, - "grad_norm": 1.345870852470398, - "learning_rate": 4.8343261435141245e-06, - "loss": 0.46660199761390686, - "mean_token_accuracy": 0.8371681571006775, - "num_tokens": 5817478.0, - "step": 636 - }, - { - "epoch": 0.48404255319148937, - "grad_norm": 1.6531339883804321, - "learning_rate": 4.833575579457691e-06, - "loss": 0.3886989951133728, - "mean_token_accuracy": 0.8763507008552551, - "num_tokens": 5825739.0, - "step": 637 - }, - { - "epoch": 0.4848024316109423, - "grad_norm": 1.6443969011306763, - "learning_rate": 4.832823377608088e-06, - "loss": 0.4070289731025696, - "mean_token_accuracy": 0.8586630821228027, - "num_tokens": 5837917.0, - "step": 638 - }, - { - "epoch": 0.48556231003039513, - "grad_norm": 2.005136013031006, - "learning_rate": 4.832069538493237e-06, - "loss": 0.40616685152053833, - "mean_token_accuracy": 0.8571510314941406, - "num_tokens": 5845250.0, - "step": 639 - }, - { - "epoch": 0.48632218844984804, - "grad_norm": 1.5244266986846924, - "learning_rate": 4.831314062642213e-06, - "loss": 0.49530288577079773, - "mean_token_accuracy": 0.8328841924667358, - "num_tokens": 5857407.0, - "step": 640 - }, - { - "epoch": 0.4870820668693009, - "grad_norm": 1.9876971244812012, - "learning_rate": 4.830556950585239e-06, - "loss": 0.4583776593208313, - "mean_token_accuracy": 0.8427221179008484, - "num_tokens": 5865391.0, - "step": 641 - }, - { - "epoch": 0.4878419452887538, - "grad_norm": 3.023336172103882, - "learning_rate": 4.829798202853683e-06, - "loss": 0.6134771108627319, - "mean_token_accuracy": 0.7981935739517212, - "num_tokens": 5870729.0, - "step": 642 - }, - { - "epoch": 0.4886018237082067, - "grad_norm": 1.8889515399932861, - "learning_rate": 4.829037819980065e-06, - "loss": 0.4420135021209717, - "mean_token_accuracy": 0.8480775356292725, - "num_tokens": 5878982.0, - "step": 643 - }, - { - "epoch": 0.48936170212765956, - "grad_norm": 2.2408435344696045, - "learning_rate": 4.828275802498051e-06, - "loss": 0.525706946849823, - "mean_token_accuracy": 0.8271557092666626, - "num_tokens": 5885097.0, - "step": 644 - }, - { - "epoch": 0.49012158054711247, - "grad_norm": 1.9734224081039429, - "learning_rate": 4.827512150942454e-06, - "loss": 0.44246578216552734, - "mean_token_accuracy": 0.8456668257713318, - "num_tokens": 5893941.0, - "step": 645 - }, - { - "epoch": 0.4908814589665654, - "grad_norm": 1.9618173837661743, - "learning_rate": 4.8267468658492335e-06, - "loss": 0.5119768381118774, - "mean_token_accuracy": 0.8355510830879211, - "num_tokens": 5902829.0, - "step": 646 - }, - { - "epoch": 0.49164133738601823, - "grad_norm": 1.7181587219238281, - "learning_rate": 4.825979947755496e-06, - "loss": 0.5666520595550537, - "mean_token_accuracy": 0.7951971888542175, - "num_tokens": 5915212.0, - "step": 647 - }, - { - "epoch": 0.49240121580547114, - "grad_norm": 3.0121164321899414, - "learning_rate": 4.8252113971994955e-06, - "loss": 0.628632128238678, - "mean_token_accuracy": 0.8041050434112549, - "num_tokens": 5921410.0, - "step": 648 - }, - { - "epoch": 0.493161094224924, - "grad_norm": 2.9980475902557373, - "learning_rate": 4.824441214720629e-06, - "loss": 0.4507424831390381, - "mean_token_accuracy": 0.8636263608932495, - "num_tokens": 5925179.0, - "step": 649 - }, - { - "epoch": 0.4939209726443769, - "grad_norm": 2.0096445083618164, - "learning_rate": 4.823669400859441e-06, - "loss": 0.602759838104248, - "mean_token_accuracy": 0.8104915618896484, - "num_tokens": 5934160.0, - "step": 650 - }, - { - "epoch": 0.4946808510638298, - "grad_norm": 1.1186442375183105, - "learning_rate": 4.8228959561576195e-06, - "loss": 0.41168469190597534, - "mean_token_accuracy": 0.8461419939994812, - "num_tokens": 5954163.0, - "step": 651 - }, - { - "epoch": 0.49544072948328266, - "grad_norm": 1.855465054512024, - "learning_rate": 4.822120881157998e-06, - "loss": 0.5049735307693481, - "mean_token_accuracy": 0.8225747346878052, - "num_tokens": 5963840.0, - "step": 652 - }, - { - "epoch": 0.49620060790273557, - "grad_norm": 3.550563335418701, - "learning_rate": 4.821344176404554e-06, - "loss": 0.49025264382362366, - "mean_token_accuracy": 0.8265978693962097, - "num_tokens": 5967358.0, - "step": 653 - }, - { - "epoch": 0.4969604863221885, - "grad_norm": 3.063910484313965, - "learning_rate": 4.820565842442408e-06, - "loss": 0.5652767419815063, - "mean_token_accuracy": 0.811700701713562, - "num_tokens": 5971858.0, - "step": 654 - }, - { - "epoch": 0.49772036474164133, - "grad_norm": 2.4613308906555176, - "learning_rate": 4.819785879817827e-06, - "loss": 0.5296125411987305, - "mean_token_accuracy": 0.8336488008499146, - "num_tokens": 5977442.0, - "step": 655 - }, - { - "epoch": 0.49848024316109424, - "grad_norm": 2.342519760131836, - "learning_rate": 4.819004289078217e-06, - "loss": 0.5753380060195923, - "mean_token_accuracy": 0.7922406792640686, - "num_tokens": 5984531.0, - "step": 656 - }, - { - "epoch": 0.4992401215805471, - "grad_norm": 2.0410680770874023, - "learning_rate": 4.818221070772129e-06, - "loss": 0.5433275699615479, - "mean_token_accuracy": 0.8043830990791321, - "num_tokens": 5992642.0, - "step": 657 - }, - { - "epoch": 0.5, - "grad_norm": 1.4999698400497437, - "learning_rate": 4.8174362254492555e-06, - "loss": 0.5248899459838867, - "mean_token_accuracy": 0.8107168674468994, - "num_tokens": 6005543.0, - "step": 658 - }, - { - "epoch": 0.5007598784194529, - "grad_norm": 1.9494401216506958, - "learning_rate": 4.816649753660431e-06, - "loss": 0.41291385889053345, - "mean_token_accuracy": 0.8650569915771484, - "num_tokens": 6012185.0, - "step": 659 - }, - { - "epoch": 0.5015197568389058, - "grad_norm": 2.7514095306396484, - "learning_rate": 4.815861655957632e-06, - "loss": 0.4244142770767212, - "mean_token_accuracy": 0.8485112190246582, - "num_tokens": 6016809.0, - "step": 660 - }, - { - "epoch": 0.5022796352583586, - "grad_norm": 1.4354928731918335, - "learning_rate": 4.815071932893976e-06, - "loss": 0.4332060217857361, - "mean_token_accuracy": 0.8386815786361694, - "num_tokens": 6034795.0, - "step": 661 - }, - { - "epoch": 0.5030395136778115, - "grad_norm": 1.3113417625427246, - "learning_rate": 4.81428058502372e-06, - "loss": 0.5415540933609009, - "mean_token_accuracy": 0.8115285038948059, - "num_tokens": 6053624.0, - "step": 662 - }, - { - "epoch": 0.5037993920972644, - "grad_norm": 1.820868730545044, - "learning_rate": 4.813487612902265e-06, - "loss": 0.5360245108604431, - "mean_token_accuracy": 0.8313555717468262, - "num_tokens": 6063399.0, - "step": 663 - }, - { - "epoch": 0.5045592705167173, - "grad_norm": 2.347001552581787, - "learning_rate": 4.812693017086145e-06, - "loss": 0.4926982820034027, - "mean_token_accuracy": 0.8137006759643555, - "num_tokens": 6070111.0, - "step": 664 - }, - { - "epoch": 0.5053191489361702, - "grad_norm": 1.8830888271331787, - "learning_rate": 4.811896798133042e-06, - "loss": 0.5419014692306519, - "mean_token_accuracy": 0.8027454614639282, - "num_tokens": 6081090.0, - "step": 665 - }, - { - "epoch": 0.506079027355623, - "grad_norm": 2.3258056640625, - "learning_rate": 4.811098956601772e-06, - "loss": 0.4629337787628174, - "mean_token_accuracy": 0.8416580557823181, - "num_tokens": 6087921.0, - "step": 666 - }, - { - "epoch": 0.506838905775076, - "grad_norm": 1.9578291177749634, - "learning_rate": 4.810299493052289e-06, - "loss": 0.40305402874946594, - "mean_token_accuracy": 0.8529061079025269, - "num_tokens": 6100034.0, - "step": 667 - }, - { - "epoch": 0.5075987841945289, - "grad_norm": 2.800635576248169, - "learning_rate": 4.809498408045691e-06, - "loss": 0.5087342262268066, - "mean_token_accuracy": 0.8214689493179321, - "num_tokens": 6104742.0, - "step": 668 - }, - { - "epoch": 0.5083586626139818, - "grad_norm": 1.5318149328231812, - "learning_rate": 4.808695702144206e-06, - "loss": 0.4733222723007202, - "mean_token_accuracy": 0.837577223777771, - "num_tokens": 6117242.0, - "step": 669 - }, - { - "epoch": 0.5091185410334347, - "grad_norm": 1.2368661165237427, - "learning_rate": 4.807891375911207e-06, - "loss": 0.3929097056388855, - "mean_token_accuracy": 0.8331400752067566, - "num_tokens": 6133509.0, - "step": 670 - }, - { - "epoch": 0.5098784194528876, - "grad_norm": 2.4711415767669678, - "learning_rate": 4.8070854299112e-06, - "loss": 0.6294851303100586, - "mean_token_accuracy": 0.7956781983375549, - "num_tokens": 6140294.0, - "step": 671 - }, - { - "epoch": 0.5106382978723404, - "grad_norm": 2.590961217880249, - "learning_rate": 4.806277864709828e-06, - "loss": 0.580160915851593, - "mean_token_accuracy": 0.809589684009552, - "num_tokens": 6145803.0, - "step": 672 - }, - { - "epoch": 0.5113981762917933, - "grad_norm": 2.4653842449188232, - "learning_rate": 4.805468680873874e-06, - "loss": 0.5262120366096497, - "mean_token_accuracy": 0.822458803653717, - "num_tokens": 6151236.0, - "step": 673 - }, - { - "epoch": 0.5121580547112462, - "grad_norm": 2.860720157623291, - "learning_rate": 4.804657878971252e-06, - "loss": 0.4007391035556793, - "mean_token_accuracy": 0.8637382984161377, - "num_tokens": 6155310.0, - "step": 674 - }, - { - "epoch": 0.5129179331306991, - "grad_norm": 2.520282030105591, - "learning_rate": 4.803845459571014e-06, - "loss": 0.45798182487487793, - "mean_token_accuracy": 0.8270114660263062, - "num_tokens": 6160326.0, - "step": 675 - }, - { - "epoch": 0.513677811550152, - "grad_norm": 2.7290921211242676, - "learning_rate": 4.803031423243349e-06, - "loss": 0.5745848417282104, - "mean_token_accuracy": 0.8401234745979309, - "num_tokens": 6165709.0, - "step": 676 - }, - { - "epoch": 0.5144376899696048, - "grad_norm": 1.6678650379180908, - "learning_rate": 4.802215770559578e-06, - "loss": 0.5257721543312073, - "mean_token_accuracy": 0.8241991996765137, - "num_tokens": 6177875.0, - "step": 677 - }, - { - "epoch": 0.5151975683890577, - "grad_norm": 2.1720468997955322, - "learning_rate": 4.801398502092156e-06, - "loss": 0.45342206954956055, - "mean_token_accuracy": 0.8463799953460693, - "num_tokens": 6185415.0, - "step": 678 - }, - { - "epoch": 0.5159574468085106, - "grad_norm": 2.282259702682495, - "learning_rate": 4.800579618414677e-06, - "loss": 0.4864169955253601, - "mean_token_accuracy": 0.8300632238388062, - "num_tokens": 6191832.0, - "step": 679 - }, - { - "epoch": 0.5167173252279635, - "grad_norm": 2.0092248916625977, - "learning_rate": 4.799759120101861e-06, - "loss": 0.5781463980674744, - "mean_token_accuracy": 0.8267031908035278, - "num_tokens": 6199440.0, - "step": 680 - }, - { - "epoch": 0.5174772036474165, - "grad_norm": 1.396580696105957, - "learning_rate": 4.798937007729568e-06, - "loss": 0.49689239263534546, - "mean_token_accuracy": 0.8257499933242798, - "num_tokens": 6213840.0, - "step": 681 - }, - { - "epoch": 0.5182370820668692, - "grad_norm": 1.9060769081115723, - "learning_rate": 4.798113281874788e-06, - "loss": 0.48969539999961853, - "mean_token_accuracy": 0.8171790838241577, - "num_tokens": 6223006.0, - "step": 682 - }, - { - "epoch": 0.5189969604863222, - "grad_norm": 1.6255282163619995, - "learning_rate": 4.797287943115642e-06, - "loss": 0.5532330870628357, - "mean_token_accuracy": 0.8173393607139587, - "num_tokens": 6234857.0, - "step": 683 - }, - { - "epoch": 0.5197568389057751, - "grad_norm": 1.6923905611038208, - "learning_rate": 4.796460992031386e-06, - "loss": 0.4880887269973755, - "mean_token_accuracy": 0.834983229637146, - "num_tokens": 6245252.0, - "step": 684 - }, - { - "epoch": 0.520516717325228, - "grad_norm": 2.13161301612854, - "learning_rate": 4.7956324292024045e-06, - "loss": 0.5687593817710876, - "mean_token_accuracy": 0.7996571063995361, - "num_tokens": 6253726.0, - "step": 685 - }, - { - "epoch": 0.5212765957446809, - "grad_norm": 2.509375810623169, - "learning_rate": 4.794802255210217e-06, - "loss": 0.5396929979324341, - "mean_token_accuracy": 0.8007107973098755, - "num_tokens": 6259238.0, - "step": 686 - }, - { - "epoch": 0.5220364741641338, - "grad_norm": 2.393710136413574, - "learning_rate": 4.793970470637469e-06, - "loss": 0.6165191531181335, - "mean_token_accuracy": 0.7891418933868408, - "num_tokens": 6266325.0, - "step": 687 - }, - { - "epoch": 0.5227963525835866, - "grad_norm": 1.511647343635559, - "learning_rate": 4.7931370760679415e-06, - "loss": 0.4773876965045929, - "mean_token_accuracy": 0.8381044864654541, - "num_tokens": 6277447.0, - "step": 688 - }, - { - "epoch": 0.5235562310030395, - "grad_norm": 2.206587314605713, - "learning_rate": 4.792302072086542e-06, - "loss": 0.5482058525085449, - "mean_token_accuracy": 0.8239108920097351, - "num_tokens": 6285163.0, - "step": 689 - }, - { - "epoch": 0.5243161094224924, - "grad_norm": 3.018146514892578, - "learning_rate": 4.7914654592793065e-06, - "loss": 0.4880615472793579, - "mean_token_accuracy": 0.8361308574676514, - "num_tokens": 6289386.0, - "step": 690 - }, - { - "epoch": 0.5250759878419453, - "grad_norm": 1.6469231843948364, - "learning_rate": 4.790627238233405e-06, - "loss": 0.4164774715900421, - "mean_token_accuracy": 0.8496290445327759, - "num_tokens": 6298915.0, - "step": 691 - }, - { - "epoch": 0.5258358662613982, - "grad_norm": 2.352505922317505, - "learning_rate": 4.789787409537131e-06, - "loss": 0.5366303324699402, - "mean_token_accuracy": 0.8350417613983154, - "num_tokens": 6306130.0, - "step": 692 - }, - { - "epoch": 0.526595744680851, - "grad_norm": 1.7463021278381348, - "learning_rate": 4.7889459737799105e-06, - "loss": 0.4389137923717499, - "mean_token_accuracy": 0.8463300466537476, - "num_tokens": 6315503.0, - "step": 693 - }, - { - "epoch": 0.5273556231003039, - "grad_norm": 2.257706642150879, - "learning_rate": 4.788102931552294e-06, - "loss": 0.5309344530105591, - "mean_token_accuracy": 0.8164352178573608, - "num_tokens": 6321852.0, - "step": 694 - }, - { - "epoch": 0.5281155015197568, - "grad_norm": 2.392732620239258, - "learning_rate": 4.787258283445962e-06, - "loss": 0.3956204056739807, - "mean_token_accuracy": 0.8671456575393677, - "num_tokens": 6327380.0, - "step": 695 - }, - { - "epoch": 0.5288753799392097, - "grad_norm": 2.210514545440674, - "learning_rate": 4.786412030053721e-06, - "loss": 0.4842875003814697, - "mean_token_accuracy": 0.8508446216583252, - "num_tokens": 6334898.0, - "step": 696 - }, - { - "epoch": 0.5296352583586627, - "grad_norm": 1.8678946495056152, - "learning_rate": 4.785564171969503e-06, - "loss": 0.47399595379829407, - "mean_token_accuracy": 0.8514996767044067, - "num_tokens": 6346374.0, - "step": 697 - }, - { - "epoch": 0.5303951367781155, - "grad_norm": 2.604079484939575, - "learning_rate": 4.784714709788368e-06, - "loss": 0.5950228571891785, - "mean_token_accuracy": 0.7983481884002686, - "num_tokens": 6351648.0, - "step": 698 - }, - { - "epoch": 0.5311550151975684, - "grad_norm": 1.662381649017334, - "learning_rate": 4.783863644106502e-06, - "loss": 0.41616758704185486, - "mean_token_accuracy": 0.8554803133010864, - "num_tokens": 6360506.0, - "step": 699 - }, - { - "epoch": 0.5319148936170213, - "grad_norm": 1.6300342082977295, - "learning_rate": 4.783010975521216e-06, - "loss": 0.43029269576072693, - "mean_token_accuracy": 0.8443028926849365, - "num_tokens": 6370675.0, - "step": 700 - }, - { - "epoch": 0.5326747720364742, - "grad_norm": 1.731873869895935, - "learning_rate": 4.782156704630944e-06, - "loss": 0.4383814334869385, - "mean_token_accuracy": 0.8443183898925781, - "num_tokens": 6381803.0, - "step": 701 - }, - { - "epoch": 0.5334346504559271, - "grad_norm": 3.1788413524627686, - "learning_rate": 4.7813008320352475e-06, - "loss": 0.32194480299949646, - "mean_token_accuracy": 0.8870962858200073, - "num_tokens": 6389263.0, - "step": 702 - }, - { - "epoch": 0.53419452887538, - "grad_norm": 2.099513530731201, - "learning_rate": 4.78044335833481e-06, - "loss": 0.36962923407554626, - "mean_token_accuracy": 0.8661133646965027, - "num_tokens": 6395589.0, - "step": 703 - }, - { - "epoch": 0.5349544072948328, - "grad_norm": 1.4859435558319092, - "learning_rate": 4.77958428413144e-06, - "loss": 0.4619954824447632, - "mean_token_accuracy": 0.8438555002212524, - "num_tokens": 6407470.0, - "step": 704 - }, - { - "epoch": 0.5357142857142857, - "grad_norm": 1.2561073303222656, - "learning_rate": 4.7787236100280685e-06, - "loss": 0.3770977258682251, - "mean_token_accuracy": 0.8515733480453491, - "num_tokens": 6422888.0, - "step": 705 - }, - { - "epoch": 0.5364741641337386, - "grad_norm": 1.4455817937850952, - "learning_rate": 4.777861336628751e-06, - "loss": 0.46481069922447205, - "mean_token_accuracy": 0.8502002954483032, - "num_tokens": 6441266.0, - "step": 706 - }, - { - "epoch": 0.5372340425531915, - "grad_norm": 1.1387295722961426, - "learning_rate": 4.7769974645386616e-06, - "loss": 0.36964765191078186, - "mean_token_accuracy": 0.8719524145126343, - "num_tokens": 6463686.0, - "step": 707 - }, - { - "epoch": 0.5379939209726444, - "grad_norm": 1.7179663181304932, - "learning_rate": 4.776131994364102e-06, - "loss": 0.4231719970703125, - "mean_token_accuracy": 0.8416585922241211, - "num_tokens": 6472956.0, - "step": 708 - }, - { - "epoch": 0.5387537993920972, - "grad_norm": 1.6328502893447876, - "learning_rate": 4.775264926712489e-06, - "loss": 0.5836569666862488, - "mean_token_accuracy": 0.8039724230766296, - "num_tokens": 6485773.0, - "step": 709 - }, - { - "epoch": 0.5395136778115501, - "grad_norm": 1.8515360355377197, - "learning_rate": 4.774396262192368e-06, - "loss": 0.5477553009986877, - "mean_token_accuracy": 0.8136521577835083, - "num_tokens": 6496379.0, - "step": 710 - }, - { - "epoch": 0.540273556231003, - "grad_norm": 1.741858959197998, - "learning_rate": 4.7735260014133986e-06, - "loss": 0.4663267731666565, - "mean_token_accuracy": 0.8473691940307617, - "num_tokens": 6507652.0, - "step": 711 - }, - { - "epoch": 0.541033434650456, - "grad_norm": 1.7516659498214722, - "learning_rate": 4.772654144986364e-06, - "loss": 0.374914288520813, - "mean_token_accuracy": 0.8600220680236816, - "num_tokens": 6519030.0, - "step": 712 - }, - { - "epoch": 0.5417933130699089, - "grad_norm": 2.662343978881836, - "learning_rate": 4.7717806935231665e-06, - "loss": 0.4206875264644623, - "mean_token_accuracy": 0.8544126749038696, - "num_tokens": 6523669.0, - "step": 713 - }, - { - "epoch": 0.5425531914893617, - "grad_norm": 1.4088834524154663, - "learning_rate": 4.770905647636828e-06, - "loss": 0.5824331045150757, - "mean_token_accuracy": 0.7857901453971863, - "num_tokens": 6540560.0, - "step": 714 - }, - { - "epoch": 0.5433130699088146, - "grad_norm": 2.173656940460205, - "learning_rate": 4.77002900794149e-06, - "loss": 0.555023729801178, - "mean_token_accuracy": 0.8067290782928467, - "num_tokens": 6548946.0, - "step": 715 - }, - { - "epoch": 0.5440729483282675, - "grad_norm": 2.121018648147583, - "learning_rate": 4.769150775052411e-06, - "loss": 0.559730052947998, - "mean_token_accuracy": 0.8166372776031494, - "num_tokens": 6556065.0, - "step": 716 - }, - { - "epoch": 0.5448328267477204, - "grad_norm": 3.335866928100586, - "learning_rate": 4.768270949585968e-06, - "loss": 0.6442267894744873, - "mean_token_accuracy": 0.7858607769012451, - "num_tokens": 6560615.0, - "step": 717 - }, - { - "epoch": 0.5455927051671733, - "grad_norm": 2.3813695907592773, - "learning_rate": 4.767389532159659e-06, - "loss": 0.4027421474456787, - "mean_token_accuracy": 0.8635619282722473, - "num_tokens": 6565841.0, - "step": 718 - }, - { - "epoch": 0.5463525835866262, - "grad_norm": 2.0657708644866943, - "learning_rate": 4.766506523392095e-06, - "loss": 0.38899827003479004, - "mean_token_accuracy": 0.8660480380058289, - "num_tokens": 6572362.0, - "step": 719 - }, - { - "epoch": 0.547112462006079, - "grad_norm": 1.093705415725708, - "learning_rate": 4.765621923903005e-06, - "loss": 0.45967352390289307, - "mean_token_accuracy": 0.8338102102279663, - "num_tokens": 6595998.0, - "step": 720 - }, - { - "epoch": 0.5478723404255319, - "grad_norm": 2.942065954208374, - "learning_rate": 4.764735734313236e-06, - "loss": 0.42910510301589966, - "mean_token_accuracy": 0.8406122922897339, - "num_tokens": 6601075.0, - "step": 721 - }, - { - "epoch": 0.5486322188449848, - "grad_norm": 2.049011707305908, - "learning_rate": 4.763847955244749e-06, - "loss": 0.5584231615066528, - "mean_token_accuracy": 0.8171684741973877, - "num_tokens": 6609310.0, - "step": 722 - }, - { - "epoch": 0.5493920972644377, - "grad_norm": 2.485543966293335, - "learning_rate": 4.762958587320623e-06, - "loss": 0.5396170020103455, - "mean_token_accuracy": 0.8158525824546814, - "num_tokens": 6616185.0, - "step": 723 - }, - { - "epoch": 0.5501519756838906, - "grad_norm": 1.87015962600708, - "learning_rate": 4.762067631165049e-06, - "loss": 0.49739527702331543, - "mean_token_accuracy": 0.8303765654563904, - "num_tokens": 6625629.0, - "step": 724 - }, - { - "epoch": 0.5509118541033434, - "grad_norm": 4.239654541015625, - "learning_rate": 4.761175087403336e-06, - "loss": 0.6029239296913147, - "mean_token_accuracy": 0.8123486042022705, - "num_tokens": 6629194.0, - "step": 725 - }, - { - "epoch": 0.5516717325227963, - "grad_norm": 2.0134730339050293, - "learning_rate": 4.760280956661904e-06, - "loss": 0.4777873754501343, - "mean_token_accuracy": 0.8283513784408569, - "num_tokens": 6636929.0, - "step": 726 - }, - { - "epoch": 0.5524316109422492, - "grad_norm": 1.991780400276184, - "learning_rate": 4.75938523956829e-06, - "loss": 0.4631248116493225, - "mean_token_accuracy": 0.8275107741355896, - "num_tokens": 6645135.0, - "step": 727 - }, - { - "epoch": 0.5531914893617021, - "grad_norm": 1.423792839050293, - "learning_rate": 4.75848793675114e-06, - "loss": 0.49630722403526306, - "mean_token_accuracy": 0.8388000130653381, - "num_tokens": 6662690.0, - "step": 728 - }, - { - "epoch": 0.5539513677811551, - "grad_norm": 2.345294952392578, - "learning_rate": 4.757589048840219e-06, - "loss": 0.37830638885498047, - "mean_token_accuracy": 0.8782080411911011, - "num_tokens": 6667285.0, - "step": 729 - }, - { - "epoch": 0.5547112462006079, - "grad_norm": 2.7452144622802734, - "learning_rate": 4.756688576466398e-06, - "loss": 0.51595538854599, - "mean_token_accuracy": 0.8441770672798157, - "num_tokens": 6672324.0, - "step": 730 - }, - { - "epoch": 0.5554711246200608, - "grad_norm": 1.5247859954833984, - "learning_rate": 4.755786520261666e-06, - "loss": 0.48365193605422974, - "mean_token_accuracy": 0.8276445269584656, - "num_tokens": 6685296.0, - "step": 731 - }, - { - "epoch": 0.5562310030395137, - "grad_norm": 1.4018276929855347, - "learning_rate": 4.75488288085912e-06, - "loss": 0.3876481354236603, - "mean_token_accuracy": 0.8612343072891235, - "num_tokens": 6697515.0, - "step": 732 - }, - { - "epoch": 0.5569908814589666, - "grad_norm": 2.9570324420928955, - "learning_rate": 4.753977658892967e-06, - "loss": 0.5468149185180664, - "mean_token_accuracy": 0.8054271340370178, - "num_tokens": 6702194.0, - "step": 733 - }, - { - "epoch": 0.5577507598784195, - "grad_norm": 1.9282715320587158, - "learning_rate": 4.753070854998529e-06, - "loss": 0.4758574962615967, - "mean_token_accuracy": 0.8379775285720825, - "num_tokens": 6709938.0, - "step": 734 - }, - { - "epoch": 0.5585106382978723, - "grad_norm": 1.981264591217041, - "learning_rate": 4.752162469812234e-06, - "loss": 0.48461222648620605, - "mean_token_accuracy": 0.833509087562561, - "num_tokens": 6718125.0, - "step": 735 - }, - { - "epoch": 0.5592705167173252, - "grad_norm": 1.1643427610397339, - "learning_rate": 4.751252503971624e-06, - "loss": 0.410121887922287, - "mean_token_accuracy": 0.8221402764320374, - "num_tokens": 6735125.0, - "step": 736 - }, - { - "epoch": 0.5600303951367781, - "grad_norm": 1.786566972732544, - "learning_rate": 4.750340958115346e-06, - "loss": 0.5964341163635254, - "mean_token_accuracy": 0.8038164377212524, - "num_tokens": 6747369.0, - "step": 737 - }, - { - "epoch": 0.560790273556231, - "grad_norm": 1.7256991863250732, - "learning_rate": 4.749427832883158e-06, - "loss": 0.48737066984176636, - "mean_token_accuracy": 0.830894947052002, - "num_tokens": 6758115.0, - "step": 738 - }, - { - "epoch": 0.5615501519756839, - "grad_norm": 1.997747540473938, - "learning_rate": 4.748513128915928e-06, - "loss": 0.5238886475563049, - "mean_token_accuracy": 0.8066858053207397, - "num_tokens": 6766111.0, - "step": 739 - }, - { - "epoch": 0.5623100303951368, - "grad_norm": 2.127016305923462, - "learning_rate": 4.747596846855629e-06, - "loss": 0.5045586228370667, - "mean_token_accuracy": 0.821424126625061, - "num_tokens": 6772893.0, - "step": 740 - }, - { - "epoch": 0.5630699088145896, - "grad_norm": 1.7664796113967896, - "learning_rate": 4.7466789873453446e-06, - "loss": 0.42954835295677185, - "mean_token_accuracy": 0.8533384799957275, - "num_tokens": 6785133.0, - "step": 741 - }, - { - "epoch": 0.5638297872340425, - "grad_norm": 1.4987404346466064, - "learning_rate": 4.7457595510292615e-06, - "loss": 0.5378558039665222, - "mean_token_accuracy": 0.8184819221496582, - "num_tokens": 6799563.0, - "step": 742 - }, - { - "epoch": 0.5645896656534954, - "grad_norm": 1.4444655179977417, - "learning_rate": 4.744838538552678e-06, - "loss": 0.42193782329559326, - "mean_token_accuracy": 0.837514340877533, - "num_tokens": 6812470.0, - "step": 743 - }, - { - "epoch": 0.5653495440729484, - "grad_norm": 3.867751121520996, - "learning_rate": 4.7439159505619946e-06, - "loss": 0.4457814693450928, - "mean_token_accuracy": 0.8630104660987854, - "num_tokens": 6815652.0, - "step": 744 - }, - { - "epoch": 0.5661094224924013, - "grad_norm": 2.1250710487365723, - "learning_rate": 4.74299178770472e-06, - "loss": 0.5638922452926636, - "mean_token_accuracy": 0.7969781160354614, - "num_tokens": 6824566.0, - "step": 745 - }, - { - "epoch": 0.5668693009118541, - "grad_norm": 2.547072410583496, - "learning_rate": 4.742066050629465e-06, - "loss": 0.5516207814216614, - "mean_token_accuracy": 0.8160669803619385, - "num_tokens": 6830589.0, - "step": 746 - }, - { - "epoch": 0.567629179331307, - "grad_norm": 1.2975233793258667, - "learning_rate": 4.741138739985951e-06, - "loss": 0.3823344111442566, - "mean_token_accuracy": 0.8668368458747864, - "num_tokens": 6842707.0, - "step": 747 - }, - { - "epoch": 0.5683890577507599, - "grad_norm": 1.3410450220108032, - "learning_rate": 4.740209856424998e-06, - "loss": 0.5148671269416809, - "mean_token_accuracy": 0.8188045024871826, - "num_tokens": 6857624.0, - "step": 748 - }, - { - "epoch": 0.5691489361702128, - "grad_norm": 1.219467282295227, - "learning_rate": 4.7392794005985324e-06, - "loss": 0.3998957872390747, - "mean_token_accuracy": 0.855175256729126, - "num_tokens": 6875064.0, - "step": 749 - }, - { - "epoch": 0.5699088145896657, - "grad_norm": 1.3530343770980835, - "learning_rate": 4.738347373159585e-06, - "loss": 0.5359633564949036, - "mean_token_accuracy": 0.8178457021713257, - "num_tokens": 6890911.0, - "step": 750 - }, - { - "epoch": 0.5706686930091185, - "grad_norm": 2.146988868713379, - "learning_rate": 4.737413774762287e-06, - "loss": 0.4460008144378662, - "mean_token_accuracy": 0.8172903060913086, - "num_tokens": 6896959.0, - "step": 751 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 1.456023097038269, - "learning_rate": 4.736478606061876e-06, - "loss": 0.43616920709609985, - "mean_token_accuracy": 0.8465108871459961, - "num_tokens": 6908904.0, - "step": 752 - }, - { - "epoch": 0.5721884498480243, - "grad_norm": 2.9696967601776123, - "learning_rate": 4.735541867714687e-06, - "loss": 0.43464532494544983, - "mean_token_accuracy": 0.8608652353286743, - "num_tokens": 6913026.0, - "step": 753 - }, - { - "epoch": 0.5729483282674772, - "grad_norm": 2.2990667819976807, - "learning_rate": 4.73460356037816e-06, - "loss": 0.6619116067886353, - "mean_token_accuracy": 0.7821142673492432, - "num_tokens": 6920588.0, - "step": 754 - }, - { - "epoch": 0.5737082066869301, - "grad_norm": 2.054746389389038, - "learning_rate": 4.733663684710835e-06, - "loss": 0.5304250717163086, - "mean_token_accuracy": 0.8265531063079834, - "num_tokens": 6928910.0, - "step": 755 - }, - { - "epoch": 0.574468085106383, - "grad_norm": 2.0050594806671143, - "learning_rate": 4.732722241372354e-06, - "loss": 0.6393026113510132, - "mean_token_accuracy": 0.796819806098938, - "num_tokens": 6940217.0, - "step": 756 - }, - { - "epoch": 0.5752279635258358, - "grad_norm": 1.4285320043563843, - "learning_rate": 4.731779231023456e-06, - "loss": 0.5432837009429932, - "mean_token_accuracy": 0.8104778528213501, - "num_tokens": 6959101.0, - "step": 757 - }, - { - "epoch": 0.5759878419452887, - "grad_norm": 2.3941943645477295, - "learning_rate": 4.730834654325984e-06, - "loss": 0.46550673246383667, - "mean_token_accuracy": 0.8444503545761108, - "num_tokens": 6965036.0, - "step": 758 - }, - { - "epoch": 0.5767477203647416, - "grad_norm": 2.3850574493408203, - "learning_rate": 4.729888511942877e-06, - "loss": 0.4916389584541321, - "mean_token_accuracy": 0.8228527307510376, - "num_tokens": 6971184.0, - "step": 759 - }, - { - "epoch": 0.5775075987841946, - "grad_norm": 1.627480149269104, - "learning_rate": 4.728940804538176e-06, - "loss": 0.5863215923309326, - "mean_token_accuracy": 0.7995302677154541, - "num_tokens": 6982569.0, - "step": 760 - }, - { - "epoch": 0.5782674772036475, - "grad_norm": 1.1723195314407349, - "learning_rate": 4.727991532777016e-06, - "loss": 0.36908864974975586, - "mean_token_accuracy": 0.8355655670166016, - "num_tokens": 6998659.0, - "step": 761 - }, - { - "epoch": 0.5790273556231003, - "grad_norm": 1.5324925184249878, - "learning_rate": 4.727040697325634e-06, - "loss": 0.557658851146698, - "mean_token_accuracy": 0.8141458034515381, - "num_tokens": 7012969.0, - "step": 762 - }, - { - "epoch": 0.5797872340425532, - "grad_norm": 2.4106390476226807, - "learning_rate": 4.726088298851362e-06, - "loss": 0.5004243850708008, - "mean_token_accuracy": 0.8376860618591309, - "num_tokens": 7018301.0, - "step": 763 - }, - { - "epoch": 0.5805471124620061, - "grad_norm": 2.2594921588897705, - "learning_rate": 4.725134338022631e-06, - "loss": 0.6067016124725342, - "mean_token_accuracy": 0.8100241422653198, - "num_tokens": 7025201.0, - "step": 764 - }, - { - "epoch": 0.581306990881459, - "grad_norm": 1.4649826288223267, - "learning_rate": 4.724178815508967e-06, - "loss": 0.36200693249702454, - "mean_token_accuracy": 0.8621826171875, - "num_tokens": 7035112.0, - "step": 765 - }, - { - "epoch": 0.5820668693009119, - "grad_norm": 2.3634560108184814, - "learning_rate": 4.723221731980993e-06, - "loss": 0.41862213611602783, - "mean_token_accuracy": 0.8541463613510132, - "num_tokens": 7040339.0, - "step": 766 - }, - { - "epoch": 0.5828267477203647, - "grad_norm": 2.7798104286193848, - "learning_rate": 4.722263088110426e-06, - "loss": 0.4647108018398285, - "mean_token_accuracy": 0.8505672216415405, - "num_tokens": 7044880.0, - "step": 767 - }, - { - "epoch": 0.5835866261398176, - "grad_norm": 2.070528507232666, - "learning_rate": 4.721302884570079e-06, - "loss": 0.5147565007209778, - "mean_token_accuracy": 0.8113877773284912, - "num_tokens": 7052433.0, - "step": 768 - }, - { - "epoch": 0.5843465045592705, - "grad_norm": 2.1953284740448, - "learning_rate": 4.720341122033862e-06, - "loss": 0.5075466632843018, - "mean_token_accuracy": 0.8474211096763611, - "num_tokens": 7058686.0, - "step": 769 - }, - { - "epoch": 0.5851063829787234, - "grad_norm": 1.9287755489349365, - "learning_rate": 4.719377801176774e-06, - "loss": 0.5382202863693237, - "mean_token_accuracy": 0.8148090243339539, - "num_tokens": 7067538.0, - "step": 770 - }, - { - "epoch": 0.5858662613981763, - "grad_norm": 1.5574456453323364, - "learning_rate": 4.718412922674913e-06, - "loss": 0.43406790494918823, - "mean_token_accuracy": 0.8477081060409546, - "num_tokens": 7077853.0, - "step": 771 - }, - { - "epoch": 0.5866261398176292, - "grad_norm": 1.5490336418151855, - "learning_rate": 4.717446487205466e-06, - "loss": 0.43164271116256714, - "mean_token_accuracy": 0.8504570126533508, - "num_tokens": 7091728.0, - "step": 772 - }, - { - "epoch": 0.587386018237082, - "grad_norm": 1.6945984363555908, - "learning_rate": 4.716478495446717e-06, - "loss": 0.5153743624687195, - "mean_token_accuracy": 0.8213579058647156, - "num_tokens": 7108680.0, - "step": 773 - }, - { - "epoch": 0.5881458966565349, - "grad_norm": 2.2633883953094482, - "learning_rate": 4.715508948078037e-06, - "loss": 0.45254790782928467, - "mean_token_accuracy": 0.8392219543457031, - "num_tokens": 7115546.0, - "step": 774 - }, - { - "epoch": 0.5889057750759878, - "grad_norm": 1.5731090307235718, - "learning_rate": 4.714537845779894e-06, - "loss": 0.38678881525993347, - "mean_token_accuracy": 0.8800252676010132, - "num_tokens": 7126360.0, - "step": 775 - }, - { - "epoch": 0.5896656534954408, - "grad_norm": 2.4873392581939697, - "learning_rate": 4.7135651892338445e-06, - "loss": 0.5190927386283875, - "mean_token_accuracy": 0.8145407438278198, - "num_tokens": 7135705.0, - "step": 776 - }, - { - "epoch": 0.5904255319148937, - "grad_norm": 1.2931004762649536, - "learning_rate": 4.712590979122534e-06, - "loss": 0.3686544895172119, - "mean_token_accuracy": 0.8720537424087524, - "num_tokens": 7150688.0, - "step": 777 - }, - { - "epoch": 0.5911854103343465, - "grad_norm": 1.6353671550750732, - "learning_rate": 4.7116152161297045e-06, - "loss": 0.49065062403678894, - "mean_token_accuracy": 0.8203760385513306, - "num_tokens": 7161040.0, - "step": 778 - }, - { - "epoch": 0.5919452887537994, - "grad_norm": 1.2345483303070068, - "learning_rate": 4.710637900940181e-06, - "loss": 0.4004976451396942, - "mean_token_accuracy": 0.8302007913589478, - "num_tokens": 7178074.0, - "step": 779 - }, - { - "epoch": 0.5927051671732523, - "grad_norm": 2.2506837844848633, - "learning_rate": 4.7096590342398825e-06, - "loss": 0.45142874121665955, - "mean_token_accuracy": 0.8481036424636841, - "num_tokens": 7184153.0, - "step": 780 - }, - { - "epoch": 0.5934650455927052, - "grad_norm": 1.420479416847229, - "learning_rate": 4.708678616715815e-06, - "loss": 0.4802100360393524, - "mean_token_accuracy": 0.8586992025375366, - "num_tokens": 7202810.0, - "step": 781 - }, - { - "epoch": 0.5942249240121581, - "grad_norm": 3.457632303237915, - "learning_rate": 4.707696649056073e-06, - "loss": 0.5265094041824341, - "mean_token_accuracy": 0.8260114192962646, - "num_tokens": 7206396.0, - "step": 782 - }, - { - "epoch": 0.5949848024316109, - "grad_norm": 1.1592093706130981, - "learning_rate": 4.706713131949839e-06, - "loss": 0.3708173632621765, - "mean_token_accuracy": 0.8476542234420776, - "num_tokens": 7225034.0, - "step": 783 - }, - { - "epoch": 0.5957446808510638, - "grad_norm": 1.6761400699615479, - "learning_rate": 4.705728066087384e-06, - "loss": 0.4137252867221832, - "mean_token_accuracy": 0.8462049961090088, - "num_tokens": 7237101.0, - "step": 784 - }, - { - "epoch": 0.5965045592705167, - "grad_norm": 2.320185422897339, - "learning_rate": 4.704741452160064e-06, - "loss": 0.5157154202461243, - "mean_token_accuracy": 0.8391785621643066, - "num_tokens": 7243826.0, - "step": 785 - }, - { - "epoch": 0.5972644376899696, - "grad_norm": 2.079423427581787, - "learning_rate": 4.703753290860323e-06, - "loss": 0.4734993278980255, - "mean_token_accuracy": 0.8353281021118164, - "num_tokens": 7250175.0, - "step": 786 - }, - { - "epoch": 0.5980243161094225, - "grad_norm": 1.8215159177780151, - "learning_rate": 4.702763582881692e-06, - "loss": 0.520193338394165, - "mean_token_accuracy": 0.844062864780426, - "num_tokens": 7258868.0, - "step": 787 - }, - { - "epoch": 0.5987841945288754, - "grad_norm": 1.3823071718215942, - "learning_rate": 4.701772328918784e-06, - "loss": 0.4177844822406769, - "mean_token_accuracy": 0.8363165259361267, - "num_tokens": 7271744.0, - "step": 788 - }, - { - "epoch": 0.5995440729483282, - "grad_norm": 2.4749298095703125, - "learning_rate": 4.700779529667301e-06, - "loss": 0.5115069150924683, - "mean_token_accuracy": 0.8473520278930664, - "num_tokens": 7277040.0, - "step": 789 - }, - { - "epoch": 0.6003039513677811, - "grad_norm": 1.7072296142578125, - "learning_rate": 4.699785185824026e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8161447048187256, - "num_tokens": 7288288.0, - "step": 790 - }, - { - "epoch": 0.601063829787234, - "grad_norm": 1.6479384899139404, - "learning_rate": 4.69878929808683e-06, - "loss": 0.4445168972015381, - "mean_token_accuracy": 0.8381255865097046, - "num_tokens": 7298640.0, - "step": 791 - }, - { - "epoch": 0.601823708206687, - "grad_norm": 1.9095896482467651, - "learning_rate": 4.6977918671546635e-06, - "loss": 0.5841238498687744, - "mean_token_accuracy": 0.7971454858779907, - "num_tokens": 7307220.0, - "step": 792 - }, - { - "epoch": 0.6025835866261399, - "grad_norm": 1.9614146947860718, - "learning_rate": 4.696792893727562e-06, - "loss": 0.34684082865715027, - "mean_token_accuracy": 0.8739526271820068, - "num_tokens": 7313875.0, - "step": 793 - }, - { - "epoch": 0.6033434650455927, - "grad_norm": 2.015570640563965, - "learning_rate": 4.695792378506645e-06, - "loss": 0.42779117822647095, - "mean_token_accuracy": 0.8625012636184692, - "num_tokens": 7321439.0, - "step": 794 - }, - { - "epoch": 0.6041033434650456, - "grad_norm": 2.8581228256225586, - "learning_rate": 4.694790322194111e-06, - "loss": 0.6519991159439087, - "mean_token_accuracy": 0.7629562616348267, - "num_tokens": 7326916.0, - "step": 795 - }, - { - "epoch": 0.6048632218844985, - "grad_norm": 2.482715368270874, - "learning_rate": 4.693786725493242e-06, - "loss": 0.532963216304779, - "mean_token_accuracy": 0.832184910774231, - "num_tokens": 7333311.0, - "step": 796 - }, - { - "epoch": 0.6056231003039514, - "grad_norm": 1.6076741218566895, - "learning_rate": 4.692781589108402e-06, - "loss": 0.43381205201148987, - "mean_token_accuracy": 0.8402494192123413, - "num_tokens": 7343731.0, - "step": 797 - }, - { - "epoch": 0.6063829787234043, - "grad_norm": 2.2133216857910156, - "learning_rate": 4.691774913745033e-06, - "loss": 0.4380851089954376, - "mean_token_accuracy": 0.8600908517837524, - "num_tokens": 7350224.0, - "step": 798 - }, - { - "epoch": 0.6071428571428571, - "grad_norm": 2.046280860900879, - "learning_rate": 4.690766700109659e-06, - "loss": 0.3821919560432434, - "mean_token_accuracy": 0.8691814541816711, - "num_tokens": 7356717.0, - "step": 799 - }, - { - "epoch": 0.60790273556231, - "grad_norm": 1.8482693433761597, - "learning_rate": 4.689756948909884e-06, - "loss": 0.5217651128768921, - "mean_token_accuracy": 0.803473711013794, - "num_tokens": 7365806.0, - "step": 800 - }, - { - "epoch": 0.6086626139817629, - "grad_norm": 2.192134141921997, - "learning_rate": 4.688745660854388e-06, - "loss": 0.573980987071991, - "mean_token_accuracy": 0.8198676109313965, - "num_tokens": 7380281.0, - "step": 801 - }, - { - "epoch": 0.6094224924012158, - "grad_norm": 2.363626718521118, - "learning_rate": 4.687732836652935e-06, - "loss": 0.5204599499702454, - "mean_token_accuracy": 0.8373252153396606, - "num_tokens": 7386938.0, - "step": 802 - }, - { - "epoch": 0.6101823708206687, - "grad_norm": 1.9320523738861084, - "learning_rate": 4.686718477016361e-06, - "loss": 0.47316622734069824, - "mean_token_accuracy": 0.830596923828125, - "num_tokens": 7395069.0, - "step": 803 - }, - { - "epoch": 0.6109422492401215, - "grad_norm": 2.6573057174682617, - "learning_rate": 4.6857025826565845e-06, - "loss": 0.5495861768722534, - "mean_token_accuracy": 0.8187421560287476, - "num_tokens": 7400563.0, - "step": 804 - }, - { - "epoch": 0.6117021276595744, - "grad_norm": 2.0893123149871826, - "learning_rate": 4.684685154286599e-06, - "loss": 0.5362675786018372, - "mean_token_accuracy": 0.8394701480865479, - "num_tokens": 7406973.0, - "step": 805 - }, - { - "epoch": 0.6124620060790273, - "grad_norm": 2.455130100250244, - "learning_rate": 4.683666192620474e-06, - "loss": 0.5405995845794678, - "mean_token_accuracy": 0.8079100847244263, - "num_tokens": 7412931.0, - "step": 806 - }, - { - "epoch": 0.6132218844984803, - "grad_norm": 2.311915636062622, - "learning_rate": 4.682645698373357e-06, - "loss": 0.5395106077194214, - "mean_token_accuracy": 0.8156260251998901, - "num_tokens": 7419699.0, - "step": 807 - }, - { - "epoch": 0.6139817629179332, - "grad_norm": 1.686838984489441, - "learning_rate": 4.6816236722614694e-06, - "loss": 0.6034521460533142, - "mean_token_accuracy": 0.7855954170227051, - "num_tokens": 7431899.0, - "step": 808 - }, - { - "epoch": 0.6147416413373861, - "grad_norm": 1.682759165763855, - "learning_rate": 4.680600115002109e-06, - "loss": 0.48593831062316895, - "mean_token_accuracy": 0.8229435682296753, - "num_tokens": 7443187.0, - "step": 809 - }, - { - "epoch": 0.6155015197568389, - "grad_norm": 2.064589738845825, - "learning_rate": 4.679575027313649e-06, - "loss": 0.5098468661308289, - "mean_token_accuracy": 0.8234638571739197, - "num_tokens": 7450868.0, - "step": 810 - }, - { - "epoch": 0.6162613981762918, - "grad_norm": 2.2063486576080322, - "learning_rate": 4.6785484099155324e-06, - "loss": 0.5138497352600098, - "mean_token_accuracy": 0.8152111172676086, - "num_tokens": 7457176.0, - "step": 811 - }, - { - "epoch": 0.6170212765957447, - "grad_norm": 1.6258726119995117, - "learning_rate": 4.67752026352828e-06, - "loss": 0.4064181447029114, - "mean_token_accuracy": 0.8720619678497314, - "num_tokens": 7466557.0, - "step": 812 - }, - { - "epoch": 0.6177811550151976, - "grad_norm": 2.3309383392333984, - "learning_rate": 4.676490588873486e-06, - "loss": 0.5180112719535828, - "mean_token_accuracy": 0.8233879804611206, - "num_tokens": 7472650.0, - "step": 813 - }, - { - "epoch": 0.6185410334346505, - "grad_norm": 1.4545246362686157, - "learning_rate": 4.675459386673815e-06, - "loss": 0.37917959690093994, - "mean_token_accuracy": 0.8598103523254395, - "num_tokens": 7485171.0, - "step": 814 - }, - { - "epoch": 0.6193009118541033, - "grad_norm": 2.654231071472168, - "learning_rate": 4.674426657653003e-06, - "loss": 0.554074227809906, - "mean_token_accuracy": 0.8026446104049683, - "num_tokens": 7490787.0, - "step": 815 - }, - { - "epoch": 0.6200607902735562, - "grad_norm": 1.5543994903564453, - "learning_rate": 4.67339240253586e-06, - "loss": 0.6335440278053284, - "mean_token_accuracy": 0.783241868019104, - "num_tokens": 7505975.0, - "step": 816 - }, - { - "epoch": 0.6208206686930091, - "grad_norm": 2.079998016357422, - "learning_rate": 4.672356622048266e-06, - "loss": 0.5169394016265869, - "mean_token_accuracy": 0.8088761568069458, - "num_tokens": 7513470.0, - "step": 817 - }, - { - "epoch": 0.621580547112462, - "grad_norm": 1.5971896648406982, - "learning_rate": 4.671319316917172e-06, - "loss": 0.44588586688041687, - "mean_token_accuracy": 0.8518649339675903, - "num_tokens": 7524352.0, - "step": 818 - }, - { - "epoch": 0.6223404255319149, - "grad_norm": 2.477579116821289, - "learning_rate": 4.670280487870599e-06, - "loss": 0.5713893175125122, - "mean_token_accuracy": 0.8116940259933472, - "num_tokens": 7530359.0, - "step": 819 - }, - { - "epoch": 0.6231003039513677, - "grad_norm": 2.066211700439453, - "learning_rate": 4.669240135637635e-06, - "loss": 0.5295331478118896, - "mean_token_accuracy": 0.819536566734314, - "num_tokens": 7536963.0, - "step": 820 - }, - { - "epoch": 0.6238601823708206, - "grad_norm": 2.1217997074127197, - "learning_rate": 4.668198260948442e-06, - "loss": 0.6146406531333923, - "mean_token_accuracy": 0.7932635545730591, - "num_tokens": 7545800.0, - "step": 821 - }, - { - "epoch": 0.6246200607902735, - "grad_norm": 2.0173542499542236, - "learning_rate": 4.667154864534245e-06, - "loss": 0.6240535974502563, - "mean_token_accuracy": 0.7883644104003906, - "num_tokens": 7556165.0, - "step": 822 - }, - { - "epoch": 0.6253799392097265, - "grad_norm": 2.014526128768921, - "learning_rate": 4.666109947127343e-06, - "loss": 0.40367332100868225, - "mean_token_accuracy": 0.8653522729873657, - "num_tokens": 7562665.0, - "step": 823 - }, - { - "epoch": 0.6261398176291794, - "grad_norm": 2.5078861713409424, - "learning_rate": 4.665063509461098e-06, - "loss": 0.5903617739677429, - "mean_token_accuracy": 0.7902897596359253, - "num_tokens": 7568922.0, - "step": 824 - }, - { - "epoch": 0.6268996960486323, - "grad_norm": 2.454622745513916, - "learning_rate": 4.664015552269938e-06, - "loss": 0.5238361358642578, - "mean_token_accuracy": 0.838546872138977, - "num_tokens": 7575965.0, - "step": 825 - }, - { - "epoch": 0.6276595744680851, - "grad_norm": 2.920919418334961, - "learning_rate": 4.662966076289363e-06, - "loss": 0.5028782486915588, - "mean_token_accuracy": 0.8311152458190918, - "num_tokens": 7580193.0, - "step": 826 - }, - { - "epoch": 0.628419452887538, - "grad_norm": 1.545382022857666, - "learning_rate": 4.661915082255932e-06, - "loss": 0.4817378520965576, - "mean_token_accuracy": 0.8373227119445801, - "num_tokens": 7593024.0, - "step": 827 - }, - { - "epoch": 0.6291793313069909, - "grad_norm": 1.5152469873428345, - "learning_rate": 4.6608625709072766e-06, - "loss": 0.4693033695220947, - "mean_token_accuracy": 0.8150848150253296, - "num_tokens": 7606459.0, - "step": 828 - }, - { - "epoch": 0.6299392097264438, - "grad_norm": 2.1310224533081055, - "learning_rate": 4.659808542982089e-06, - "loss": 0.4653395414352417, - "mean_token_accuracy": 0.8286294341087341, - "num_tokens": 7613036.0, - "step": 829 - }, - { - "epoch": 0.6306990881458967, - "grad_norm": 2.1949679851531982, - "learning_rate": 4.658752999220125e-06, - "loss": 0.3698633909225464, - "mean_token_accuracy": 0.871590793132782, - "num_tokens": 7618527.0, - "step": 830 - }, - { - "epoch": 0.6314589665653495, - "grad_norm": 2.2770416736602783, - "learning_rate": 4.657695940362207e-06, - "loss": 0.5202419757843018, - "mean_token_accuracy": 0.817577600479126, - "num_tokens": 7624459.0, - "step": 831 - }, - { - "epoch": 0.6322188449848024, - "grad_norm": 1.402042269706726, - "learning_rate": 4.65663736715022e-06, - "loss": 0.51531583070755, - "mean_token_accuracy": 0.8228116631507874, - "num_tokens": 7639371.0, - "step": 832 - }, - { - "epoch": 0.6329787234042553, - "grad_norm": 3.3554883003234863, - "learning_rate": 4.65557728032711e-06, - "loss": 0.6771188378334045, - "mean_token_accuracy": 0.7880028486251831, - "num_tokens": 7643924.0, - "step": 833 - }, - { - "epoch": 0.6337386018237082, - "grad_norm": 2.081040143966675, - "learning_rate": 4.654515680636888e-06, - "loss": 0.5712796449661255, - "mean_token_accuracy": 0.8177868127822876, - "num_tokens": 7651881.0, - "step": 834 - }, - { - "epoch": 0.6344984802431611, - "grad_norm": 0.9128716588020325, - "learning_rate": 4.653452568824625e-06, - "loss": 0.3423936069011688, - "mean_token_accuracy": 0.8782886266708374, - "num_tokens": 7677829.0, - "step": 835 - }, - { - "epoch": 0.6352583586626139, - "grad_norm": 3.49015736579895, - "learning_rate": 4.652387945636454e-06, - "loss": 0.34657734632492065, - "mean_token_accuracy": 0.8770567178726196, - "num_tokens": 7680796.0, - "step": 836 - }, - { - "epoch": 0.6360182370820668, - "grad_norm": 2.026247501373291, - "learning_rate": 4.651321811819568e-06, - "loss": 0.5098431706428528, - "mean_token_accuracy": 0.8216961622238159, - "num_tokens": 7688746.0, - "step": 837 - }, - { - "epoch": 0.6367781155015197, - "grad_norm": 2.444343090057373, - "learning_rate": 4.650254168122222e-06, - "loss": 0.5490090250968933, - "mean_token_accuracy": 0.8092857599258423, - "num_tokens": 7695220.0, - "step": 838 - }, - { - "epoch": 0.6375379939209727, - "grad_norm": 2.0171122550964355, - "learning_rate": 4.649185015293728e-06, - "loss": 0.47221142053604126, - "mean_token_accuracy": 0.8514408469200134, - "num_tokens": 7702759.0, - "step": 839 - }, - { - "epoch": 0.6382978723404256, - "grad_norm": 1.9800984859466553, - "learning_rate": 4.64811435408446e-06, - "loss": 0.5238803625106812, - "mean_token_accuracy": 0.8479194641113281, - "num_tokens": 7714017.0, - "step": 840 - }, - { - "epoch": 0.6390577507598785, - "grad_norm": 3.0674357414245605, - "learning_rate": 4.647042185245848e-06, - "loss": 0.4668245315551758, - "mean_token_accuracy": 0.8381714820861816, - "num_tokens": 7717801.0, - "step": 841 - }, - { - "epoch": 0.6398176291793313, - "grad_norm": 1.5672820806503296, - "learning_rate": 4.645968509530381e-06, - "loss": 0.4428741931915283, - "mean_token_accuracy": 0.8416479825973511, - "num_tokens": 7728342.0, - "step": 842 - }, - { - "epoch": 0.6405775075987842, - "grad_norm": 2.3042354583740234, - "learning_rate": 4.644893327691608e-06, - "loss": 0.49937760829925537, - "mean_token_accuracy": 0.827070951461792, - "num_tokens": 7734576.0, - "step": 843 - }, - { - "epoch": 0.6413373860182371, - "grad_norm": 2.057772159576416, - "learning_rate": 4.6438166404841316e-06, - "loss": 0.5912986993789673, - "mean_token_accuracy": 0.805509090423584, - "num_tokens": 7742481.0, - "step": 844 - }, - { - "epoch": 0.64209726443769, - "grad_norm": 1.9688186645507812, - "learning_rate": 4.6427384486636115e-06, - "loss": 0.482401967048645, - "mean_token_accuracy": 0.8358086347579956, - "num_tokens": 7750002.0, - "step": 845 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 2.6852948665618896, - "learning_rate": 4.6416587529867665e-06, - "loss": 0.5479315519332886, - "mean_token_accuracy": 0.8091106414794922, - "num_tokens": 7755578.0, - "step": 846 - }, - { - "epoch": 0.6436170212765957, - "grad_norm": 2.0547337532043457, - "learning_rate": 4.640577554211366e-06, - "loss": 0.5327274203300476, - "mean_token_accuracy": 0.8280376195907593, - "num_tokens": 7763513.0, - "step": 847 - }, - { - "epoch": 0.6443768996960486, - "grad_norm": 2.0328633785247803, - "learning_rate": 4.63949485309624e-06, - "loss": 0.4814409613609314, - "mean_token_accuracy": 0.8527672290802002, - "num_tokens": 7771131.0, - "step": 848 - }, - { - "epoch": 0.6451367781155015, - "grad_norm": 1.5892863273620605, - "learning_rate": 4.638410650401267e-06, - "loss": 0.4492785334587097, - "mean_token_accuracy": 0.846997857093811, - "num_tokens": 7781572.0, - "step": 849 - }, - { - "epoch": 0.6458966565349544, - "grad_norm": 1.8295910358428955, - "learning_rate": 4.637324946887384e-06, - "loss": 0.37088239192962646, - "mean_token_accuracy": 0.8616628646850586, - "num_tokens": 7788604.0, - "step": 850 - }, - { - "epoch": 0.6466565349544073, - "grad_norm": 3.380040168762207, - "learning_rate": 4.636237743316578e-06, - "loss": 0.4737280607223511, - "mean_token_accuracy": 0.855940580368042, - "num_tokens": 7792504.0, - "step": 851 - }, - { - "epoch": 0.6474164133738601, - "grad_norm": 2.8790009021759033, - "learning_rate": 4.635149040451891e-06, - "loss": 0.39790448546409607, - "mean_token_accuracy": 0.8710698485374451, - "num_tokens": 7796333.0, - "step": 852 - }, - { - "epoch": 0.648176291793313, - "grad_norm": 1.914914608001709, - "learning_rate": 4.634058839057417e-06, - "loss": 0.2954312562942505, - "mean_token_accuracy": 0.8880234956741333, - "num_tokens": 7802456.0, - "step": 853 - }, - { - "epoch": 0.648936170212766, - "grad_norm": 1.3709120750427246, - "learning_rate": 4.632967139898301e-06, - "loss": 0.43224576115608215, - "mean_token_accuracy": 0.8446190357208252, - "num_tokens": 7816770.0, - "step": 854 - }, - { - "epoch": 0.6496960486322189, - "grad_norm": 1.6579312086105347, - "learning_rate": 4.63187394374074e-06, - "loss": 0.3535553514957428, - "mean_token_accuracy": 0.8738704919815063, - "num_tokens": 7824963.0, - "step": 855 - }, - { - "epoch": 0.6504559270516718, - "grad_norm": 2.4055678844451904, - "learning_rate": 4.63077925135198e-06, - "loss": 0.5078744292259216, - "mean_token_accuracy": 0.8430874347686768, - "num_tokens": 7830962.0, - "step": 856 - }, - { - "epoch": 0.6512158054711246, - "grad_norm": 2.5171499252319336, - "learning_rate": 4.629683063500319e-06, - "loss": 0.5172419548034668, - "mean_token_accuracy": 0.8087141513824463, - "num_tokens": 7836638.0, - "step": 857 - }, - { - "epoch": 0.6519756838905775, - "grad_norm": 1.7588486671447754, - "learning_rate": 4.628585380955104e-06, - "loss": 0.5759496092796326, - "mean_token_accuracy": 0.8043236136436462, - "num_tokens": 7844654.0, - "step": 858 - }, - { - "epoch": 0.6527355623100304, - "grad_norm": 1.5887070894241333, - "learning_rate": 4.62748620448673e-06, - "loss": 0.41849038004875183, - "mean_token_accuracy": 0.8556643724441528, - "num_tokens": 7855642.0, - "step": 859 - }, - { - "epoch": 0.6534954407294833, - "grad_norm": 3.227942705154419, - "learning_rate": 4.626385534866642e-06, - "loss": 0.5279449224472046, - "mean_token_accuracy": 0.8250958323478699, - "num_tokens": 7859890.0, - "step": 860 - }, - { - "epoch": 0.6542553191489362, - "grad_norm": 2.440467119216919, - "learning_rate": 4.625283372867333e-06, - "loss": 0.5294933319091797, - "mean_token_accuracy": 0.8235013484954834, - "num_tokens": 7866766.0, - "step": 861 - }, - { - "epoch": 0.6550151975683891, - "grad_norm": 2.4106903076171875, - "learning_rate": 4.624179719262342e-06, - "loss": 0.5662813186645508, - "mean_token_accuracy": 0.8061668872833252, - "num_tokens": 7872809.0, - "step": 862 - }, - { - "epoch": 0.6557750759878419, - "grad_norm": 3.5151145458221436, - "learning_rate": 4.623074574826254e-06, - "loss": 0.5471097230911255, - "mean_token_accuracy": 0.8220691084861755, - "num_tokens": 7876136.0, - "step": 863 - }, - { - "epoch": 0.6565349544072948, - "grad_norm": 1.5319840908050537, - "learning_rate": 4.621967940334705e-06, - "loss": 0.4178982377052307, - "mean_token_accuracy": 0.8517135977745056, - "num_tokens": 7886113.0, - "step": 864 - }, - { - "epoch": 0.6572948328267477, - "grad_norm": 1.63701331615448, - "learning_rate": 4.620859816564371e-06, - "loss": 0.4666512608528137, - "mean_token_accuracy": 0.8223508596420288, - "num_tokens": 7897982.0, - "step": 865 - }, - { - "epoch": 0.6580547112462006, - "grad_norm": 2.1515414714813232, - "learning_rate": 4.619750204292978e-06, - "loss": 0.5359305143356323, - "mean_token_accuracy": 0.8192868232727051, - "num_tokens": 7904947.0, - "step": 866 - }, - { - "epoch": 0.6588145896656535, - "grad_norm": 2.2140955924987793, - "learning_rate": 4.618639104299294e-06, - "loss": 0.5275633931159973, - "mean_token_accuracy": 0.8120715618133545, - "num_tokens": 7913913.0, - "step": 867 - }, - { - "epoch": 0.6595744680851063, - "grad_norm": 1.3956893682479858, - "learning_rate": 4.6175265173631304e-06, - "loss": 0.4378768503665924, - "mean_token_accuracy": 0.8479125499725342, - "num_tokens": 7927979.0, - "step": 868 - }, - { - "epoch": 0.6603343465045592, - "grad_norm": 2.98103928565979, - "learning_rate": 4.616412444265344e-06, - "loss": 0.42614591121673584, - "mean_token_accuracy": 0.8595094680786133, - "num_tokens": 7934293.0, - "step": 869 - }, - { - "epoch": 0.6610942249240122, - "grad_norm": 2.554845094680786, - "learning_rate": 4.6152968857878365e-06, - "loss": 0.3698030412197113, - "mean_token_accuracy": 0.8717041015625, - "num_tokens": 7938547.0, - "step": 870 - }, - { - "epoch": 0.6618541033434651, - "grad_norm": 3.0901825428009033, - "learning_rate": 4.6141798427135475e-06, - "loss": 0.5037497282028198, - "mean_token_accuracy": 0.8354041576385498, - "num_tokens": 7942829.0, - "step": 871 - }, - { - "epoch": 0.662613981762918, - "grad_norm": 2.8692073822021484, - "learning_rate": 4.6130613158264605e-06, - "loss": 0.5418164134025574, - "mean_token_accuracy": 0.8298909664154053, - "num_tokens": 7949303.0, - "step": 872 - }, - { - "epoch": 0.6633738601823708, - "grad_norm": 3.960404396057129, - "learning_rate": 4.611941305911602e-06, - "loss": 0.6284480094909668, - "mean_token_accuracy": 0.837495744228363, - "num_tokens": 7952486.0, - "step": 873 - }, - { - "epoch": 0.6641337386018237, - "grad_norm": 2.6690115928649902, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5214360952377319, - "mean_token_accuracy": 0.8213508129119873, - "num_tokens": 7957559.0, - "step": 874 - }, - { - "epoch": 0.6648936170212766, - "grad_norm": 2.3376171588897705, - "learning_rate": 4.609696840143875e-06, - "loss": 0.46887528896331787, - "mean_token_accuracy": 0.8438819646835327, - "num_tokens": 7962826.0, - "step": 875 - }, - { - "epoch": 0.6656534954407295, - "grad_norm": 2.2222683429718018, - "learning_rate": 4.6085723858662575e-06, - "loss": 0.5607719421386719, - "mean_token_accuracy": 0.8128405809402466, - "num_tokens": 7970131.0, - "step": 876 - }, - { - "epoch": 0.6664133738601824, - "grad_norm": 2.069091558456421, - "learning_rate": 4.607446451711372e-06, - "loss": 0.506301760673523, - "mean_token_accuracy": 0.8256827592849731, - "num_tokens": 7977524.0, - "step": 877 - }, - { - "epoch": 0.6671732522796353, - "grad_norm": 1.3724967241287231, - "learning_rate": 4.606319038469443e-06, - "loss": 0.43285101652145386, - "mean_token_accuracy": 0.8525032997131348, - "num_tokens": 7989174.0, - "step": 878 - }, - { - "epoch": 0.6679331306990881, - "grad_norm": 2.278205156326294, - "learning_rate": 4.605190146931731e-06, - "loss": 0.4845905303955078, - "mean_token_accuracy": 0.8284652829170227, - "num_tokens": 7998524.0, - "step": 879 - }, - { - "epoch": 0.668693009118541, - "grad_norm": 1.3871766328811646, - "learning_rate": 4.604059777890537e-06, - "loss": 0.5736679434776306, - "mean_token_accuracy": 0.8223285675048828, - "num_tokens": 8015776.0, - "step": 880 - }, - { - "epoch": 0.6694528875379939, - "grad_norm": 1.926164984703064, - "learning_rate": 4.602927932139197e-06, - "loss": 0.4133230447769165, - "mean_token_accuracy": 0.8653768301010132, - "num_tokens": 8022979.0, - "step": 881 - }, - { - "epoch": 0.6702127659574468, - "grad_norm": 2.109272003173828, - "learning_rate": 4.601794610472083e-06, - "loss": 0.7005600929260254, - "mean_token_accuracy": 0.7777010202407837, - "num_tokens": 8032618.0, - "step": 882 - }, - { - "epoch": 0.6709726443768997, - "grad_norm": 2.077977418899536, - "learning_rate": 4.6006598136846056e-06, - "loss": 0.5278208255767822, - "mean_token_accuracy": 0.8230358958244324, - "num_tokens": 8040534.0, - "step": 883 - }, - { - "epoch": 0.6717325227963525, - "grad_norm": 1.678581714630127, - "learning_rate": 4.599523542573207e-06, - "loss": 0.4955351650714874, - "mean_token_accuracy": 0.8270003795623779, - "num_tokens": 8052249.0, - "step": 884 - }, - { - "epoch": 0.6724924012158054, - "grad_norm": 2.0751662254333496, - "learning_rate": 4.598385797935368e-06, - "loss": 0.5266247987747192, - "mean_token_accuracy": 0.8263581991195679, - "num_tokens": 8060600.0, - "step": 885 - }, - { - "epoch": 0.6732522796352584, - "grad_norm": 2.418405771255493, - "learning_rate": 4.5972465805696e-06, - "loss": 0.4481425881385803, - "mean_token_accuracy": 0.846164345741272, - "num_tokens": 8066025.0, - "step": 886 - }, - { - "epoch": 0.6740121580547113, - "grad_norm": 2.3936474323272705, - "learning_rate": 4.596105891275449e-06, - "loss": 0.4553404450416565, - "mean_token_accuracy": 0.8412896394729614, - "num_tokens": 8071544.0, - "step": 887 - }, - { - "epoch": 0.6747720364741642, - "grad_norm": 2.2024407386779785, - "learning_rate": 4.594963730853497e-06, - "loss": 0.6218541860580444, - "mean_token_accuracy": 0.7890232801437378, - "num_tokens": 8079061.0, - "step": 888 - }, - { - "epoch": 0.675531914893617, - "grad_norm": 2.51015567779541, - "learning_rate": 4.593820100105355e-06, - "loss": 0.5149124264717102, - "mean_token_accuracy": 0.8241918087005615, - "num_tokens": 8084293.0, - "step": 889 - }, - { - "epoch": 0.6762917933130699, - "grad_norm": 1.8748939037322998, - "learning_rate": 4.5926749998336665e-06, - "loss": 0.50836181640625, - "mean_token_accuracy": 0.8067223429679871, - "num_tokens": 8092511.0, - "step": 890 - }, - { - "epoch": 0.6770516717325228, - "grad_norm": 1.801193118095398, - "learning_rate": 4.5915284308421075e-06, - "loss": 0.4372861683368683, - "mean_token_accuracy": 0.8510604500770569, - "num_tokens": 8101174.0, - "step": 891 - }, - { - "epoch": 0.6778115501519757, - "grad_norm": 2.6476457118988037, - "learning_rate": 4.590380393935383e-06, - "loss": 0.38700711727142334, - "mean_token_accuracy": 0.8659796714782715, - "num_tokens": 8105398.0, - "step": 892 - }, - { - "epoch": 0.6785714285714286, - "grad_norm": 1.1147183179855347, - "learning_rate": 4.589230889919232e-06, - "loss": 0.38546115159988403, - "mean_token_accuracy": 0.8570581674575806, - "num_tokens": 8127394.0, - "step": 893 - }, - { - "epoch": 0.6793313069908815, - "grad_norm": 2.908905506134033, - "learning_rate": 4.588079919600419e-06, - "loss": 0.5108504295349121, - "mean_token_accuracy": 0.8121406435966492, - "num_tokens": 8131801.0, - "step": 894 - }, - { - "epoch": 0.6800911854103343, - "grad_norm": 3.1522326469421387, - "learning_rate": 4.586927483786739e-06, - "loss": 0.44059112668037415, - "mean_token_accuracy": 0.8448011875152588, - "num_tokens": 8154416.0, - "step": 895 - }, - { - "epoch": 0.6808510638297872, - "grad_norm": 1.5142440795898438, - "learning_rate": 4.585773583287017e-06, - "loss": 0.513217568397522, - "mean_token_accuracy": 0.8386049270629883, - "num_tokens": 8171156.0, - "step": 896 - }, - { - "epoch": 0.6816109422492401, - "grad_norm": 2.597881317138672, - "learning_rate": 4.584618218911104e-06, - "loss": 0.4937712550163269, - "mean_token_accuracy": 0.8223681449890137, - "num_tokens": 8176124.0, - "step": 897 - }, - { - "epoch": 0.682370820668693, - "grad_norm": 1.8185619115829468, - "learning_rate": 4.583461391469879e-06, - "loss": 0.519811749458313, - "mean_token_accuracy": 0.8169777393341064, - "num_tokens": 8185136.0, - "step": 898 - }, - { - "epoch": 0.6831306990881459, - "grad_norm": 3.2061994075775146, - "learning_rate": 4.582303101775249e-06, - "loss": 0.4655115008354187, - "mean_token_accuracy": 0.8425977230072021, - "num_tokens": 8188864.0, - "step": 899 - }, - { - "epoch": 0.6838905775075987, - "grad_norm": 1.3485229015350342, - "learning_rate": 4.581143350640146e-06, - "loss": 0.5014470815658569, - "mean_token_accuracy": 0.8273109197616577, - "num_tokens": 8203460.0, - "step": 900 - }, - { - "epoch": 0.6846504559270516, - "grad_norm": 1.3264713287353516, - "learning_rate": 4.579982138878527e-06, - "loss": 0.5073703527450562, - "mean_token_accuracy": 0.8259357213973999, - "num_tokens": 8219348.0, - "step": 901 - }, - { - "epoch": 0.6854103343465046, - "grad_norm": 2.4436347484588623, - "learning_rate": 4.578819467305375e-06, - "loss": 0.47020310163497925, - "mean_token_accuracy": 0.8567265272140503, - "num_tokens": 8224427.0, - "step": 902 - }, - { - "epoch": 0.6861702127659575, - "grad_norm": 1.921749234199524, - "learning_rate": 4.5776553367367e-06, - "loss": 0.622514009475708, - "mean_token_accuracy": 0.7863982319831848, - "num_tokens": 8233151.0, - "step": 903 - }, - { - "epoch": 0.6869300911854104, - "grad_norm": 1.8815616369247437, - "learning_rate": 4.576489747989532e-06, - "loss": 0.4910545349121094, - "mean_token_accuracy": 0.8147122859954834, - "num_tokens": 8240762.0, - "step": 904 - }, - { - "epoch": 0.6876899696048632, - "grad_norm": 1.2366989850997925, - "learning_rate": 4.575322701881926e-06, - "loss": 0.3947566747665405, - "mean_token_accuracy": 0.873993992805481, - "num_tokens": 8259381.0, - "step": 905 - }, - { - "epoch": 0.6884498480243161, - "grad_norm": 1.5767735242843628, - "learning_rate": 4.57415419923296e-06, - "loss": 0.57136070728302, - "mean_token_accuracy": 0.8028088808059692, - "num_tokens": 8273296.0, - "step": 906 - }, - { - "epoch": 0.689209726443769, - "grad_norm": 2.378675699234009, - "learning_rate": 4.572984240862733e-06, - "loss": 0.5894849896430969, - "mean_token_accuracy": 0.7977708578109741, - "num_tokens": 8280083.0, - "step": 907 - }, - { - "epoch": 0.6899696048632219, - "grad_norm": 2.0401132106781006, - "learning_rate": 4.57181282759237e-06, - "loss": 0.5524613261222839, - "mean_token_accuracy": 0.8138598203659058, - "num_tokens": 8288236.0, - "step": 908 - }, - { - "epoch": 0.6907294832826748, - "grad_norm": 2.293701648712158, - "learning_rate": 4.570639960244011e-06, - "loss": 0.5154546499252319, - "mean_token_accuracy": 0.8234660625457764, - "num_tokens": 8294493.0, - "step": 909 - }, - { - "epoch": 0.6914893617021277, - "grad_norm": 1.9286527633666992, - "learning_rate": 4.56946563964082e-06, - "loss": 0.5364264845848083, - "mean_token_accuracy": 0.8147368431091309, - "num_tokens": 8303441.0, - "step": 910 - }, - { - "epoch": 0.6922492401215805, - "grad_norm": 1.2571251392364502, - "learning_rate": 4.5682898666069815e-06, - "loss": 0.43535223603248596, - "mean_token_accuracy": 0.859239935874939, - "num_tokens": 8321548.0, - "step": 911 - }, - { - "epoch": 0.6930091185410334, - "grad_norm": 1.2224860191345215, - "learning_rate": 4.567112641967697e-06, - "loss": 0.40205076336860657, - "mean_token_accuracy": 0.8724711537361145, - "num_tokens": 8335205.0, - "step": 912 - }, - { - "epoch": 0.6937689969604863, - "grad_norm": 1.2064491510391235, - "learning_rate": 4.5659339665491894e-06, - "loss": 0.37790587544441223, - "mean_token_accuracy": 0.8464339971542358, - "num_tokens": 8350926.0, - "step": 913 - }, - { - "epoch": 0.6945288753799392, - "grad_norm": 2.1755270957946777, - "learning_rate": 4.5647538411786965e-06, - "loss": 0.42034298181533813, - "mean_token_accuracy": 0.84148108959198, - "num_tokens": 8356739.0, - "step": 914 - }, - { - "epoch": 0.6952887537993921, - "grad_norm": 1.234864592552185, - "learning_rate": 4.563572266684478e-06, - "loss": 0.5062938332557678, - "mean_token_accuracy": 0.8132052421569824, - "num_tokens": 8373660.0, - "step": 915 - }, - { - "epoch": 0.6960486322188449, - "grad_norm": 2.4250621795654297, - "learning_rate": 4.562389243895807e-06, - "loss": 0.4907791018486023, - "mean_token_accuracy": 0.8337979912757874, - "num_tokens": 8378661.0, - "step": 916 - }, - { - "epoch": 0.6968085106382979, - "grad_norm": 1.5018314123153687, - "learning_rate": 4.561204773642974e-06, - "loss": 0.41041281819343567, - "mean_token_accuracy": 0.8569784164428711, - "num_tokens": 8390322.0, - "step": 917 - }, - { - "epoch": 0.6975683890577508, - "grad_norm": 2.797269344329834, - "learning_rate": 4.5600188567572874e-06, - "loss": 0.3146931529045105, - "mean_token_accuracy": 0.8913302421569824, - "num_tokens": 8393567.0, - "step": 918 - }, - { - "epoch": 0.6983282674772037, - "grad_norm": 1.4002827405929565, - "learning_rate": 4.558831494071069e-06, - "loss": 0.4275597333908081, - "mean_token_accuracy": 0.8504893779754639, - "num_tokens": 8407119.0, - "step": 919 - }, - { - "epoch": 0.6990881458966566, - "grad_norm": 1.7045831680297852, - "learning_rate": 4.557642686417654e-06, - "loss": 0.49593430757522583, - "mean_token_accuracy": 0.8185091018676758, - "num_tokens": 8417408.0, - "step": 920 - }, - { - "epoch": 0.6998480243161094, - "grad_norm": 2.8818066120147705, - "learning_rate": 4.556452434631396e-06, - "loss": 0.637908935546875, - "mean_token_accuracy": 0.7883946895599365, - "num_tokens": 8422319.0, - "step": 921 - }, - { - "epoch": 0.7006079027355623, - "grad_norm": 2.3587265014648438, - "learning_rate": 4.555260739547657e-06, - "loss": 0.38749319314956665, - "mean_token_accuracy": 0.8774704933166504, - "num_tokens": 8427315.0, - "step": 922 - }, - { - "epoch": 0.7013677811550152, - "grad_norm": 1.6648749113082886, - "learning_rate": 4.554067602002815e-06, - "loss": 0.4044865369796753, - "mean_token_accuracy": 0.8524141311645508, - "num_tokens": 8438662.0, - "step": 923 - }, - { - "epoch": 0.7021276595744681, - "grad_norm": 3.467787742614746, - "learning_rate": 4.55287302283426e-06, - "loss": 0.591016411781311, - "mean_token_accuracy": 0.81184983253479, - "num_tokens": 8442237.0, - "step": 924 - }, - { - "epoch": 0.702887537993921, - "grad_norm": 2.1458635330200195, - "learning_rate": 4.551677002880395e-06, - "loss": 0.5017476677894592, - "mean_token_accuracy": 0.822914183139801, - "num_tokens": 8449494.0, - "step": 925 - }, - { - "epoch": 0.7036474164133738, - "grad_norm": 2.521714448928833, - "learning_rate": 4.550479542980632e-06, - "loss": 0.531912088394165, - "mean_token_accuracy": 0.8225687742233276, - "num_tokens": 8454983.0, - "step": 926 - }, - { - "epoch": 0.7044072948328267, - "grad_norm": 3.5248100757598877, - "learning_rate": 4.549280643975394e-06, - "loss": 0.4631815254688263, - "mean_token_accuracy": 0.8443771600723267, - "num_tokens": 8458504.0, - "step": 927 - }, - { - "epoch": 0.7051671732522796, - "grad_norm": 2.5105819702148438, - "learning_rate": 4.548080306706114e-06, - "loss": 0.30487123131752014, - "mean_token_accuracy": 0.9018767476081848, - "num_tokens": 8462589.0, - "step": 928 - }, - { - "epoch": 0.7059270516717325, - "grad_norm": 1.3367713689804077, - "learning_rate": 4.5468785320152365e-06, - "loss": 0.4355026185512543, - "mean_token_accuracy": 0.8323584794998169, - "num_tokens": 8478450.0, - "step": 929 - }, - { - "epoch": 0.7066869300911854, - "grad_norm": 2.2506282329559326, - "learning_rate": 4.545675320746212e-06, - "loss": 0.5082957744598389, - "mean_token_accuracy": 0.823430597782135, - "num_tokens": 8485991.0, - "step": 930 - }, - { - "epoch": 0.7074468085106383, - "grad_norm": 1.7164632081985474, - "learning_rate": 4.544470673743502e-06, - "loss": 0.3960164785385132, - "mean_token_accuracy": 0.8592486381530762, - "num_tokens": 8495217.0, - "step": 931 - }, - { - "epoch": 0.7082066869300911, - "grad_norm": 1.5864969491958618, - "learning_rate": 4.543264591852572e-06, - "loss": 0.49114471673965454, - "mean_token_accuracy": 0.8330780267715454, - "num_tokens": 8508904.0, - "step": 932 - }, - { - "epoch": 0.708966565349544, - "grad_norm": 2.1707003116607666, - "learning_rate": 4.542057075919898e-06, - "loss": 0.49895772337913513, - "mean_token_accuracy": 0.8327431082725525, - "num_tokens": 8515792.0, - "step": 933 - }, - { - "epoch": 0.709726443768997, - "grad_norm": 1.9002083539962769, - "learning_rate": 4.54084812679296e-06, - "loss": 0.4548531472682953, - "mean_token_accuracy": 0.834532618522644, - "num_tokens": 8524006.0, - "step": 934 - }, - { - "epoch": 0.7104863221884499, - "grad_norm": 1.8505141735076904, - "learning_rate": 4.539637745320247e-06, - "loss": 0.35716521739959717, - "mean_token_accuracy": 0.872222900390625, - "num_tokens": 8533647.0, - "step": 935 - }, - { - "epoch": 0.7112462006079028, - "grad_norm": 2.092620849609375, - "learning_rate": 4.53842593235125e-06, - "loss": 0.4673694372177124, - "mean_token_accuracy": 0.8460999131202698, - "num_tokens": 8540734.0, - "step": 936 - }, - { - "epoch": 0.7120060790273556, - "grad_norm": 2.689514636993408, - "learning_rate": 4.537212688736466e-06, - "loss": 0.45461273193359375, - "mean_token_accuracy": 0.8450704216957092, - "num_tokens": 8544948.0, - "step": 937 - }, - { - "epoch": 0.7127659574468085, - "grad_norm": 2.4507734775543213, - "learning_rate": 4.535998015327396e-06, - "loss": 0.4571906626224518, - "mean_token_accuracy": 0.8429360389709473, - "num_tokens": 8550445.0, - "step": 938 - }, - { - "epoch": 0.7135258358662614, - "grad_norm": 1.8960013389587402, - "learning_rate": 4.534781912976546e-06, - "loss": 0.4461391568183899, - "mean_token_accuracy": 0.8487973213195801, - "num_tokens": 8557630.0, - "step": 939 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 1.602611780166626, - "learning_rate": 4.533564382537421e-06, - "loss": 0.5277102589607239, - "mean_token_accuracy": 0.8330916166305542, - "num_tokens": 8570397.0, - "step": 940 - }, - { - "epoch": 0.7150455927051672, - "grad_norm": 1.8936395645141602, - "learning_rate": 4.532345424864533e-06, - "loss": 0.38619571924209595, - "mean_token_accuracy": 0.8514572381973267, - "num_tokens": 8582673.0, - "step": 941 - }, - { - "epoch": 0.71580547112462, - "grad_norm": 1.3898619413375854, - "learning_rate": 4.531125040813392e-06, - "loss": 0.4825032949447632, - "mean_token_accuracy": 0.833012580871582, - "num_tokens": 8597239.0, - "step": 942 - }, - { - "epoch": 0.7165653495440729, - "grad_norm": 2.128230571746826, - "learning_rate": 4.529903231240511e-06, - "loss": 0.4862118065357208, - "mean_token_accuracy": 0.8210917711257935, - "num_tokens": 8605877.0, - "step": 943 - }, - { - "epoch": 0.7173252279635258, - "grad_norm": 1.6552259922027588, - "learning_rate": 4.528679997003403e-06, - "loss": 0.5092059373855591, - "mean_token_accuracy": 0.8247389793395996, - "num_tokens": 8617060.0, - "step": 944 - }, - { - "epoch": 0.7180851063829787, - "grad_norm": 2.1174771785736084, - "learning_rate": 4.52745533896058e-06, - "loss": 0.39110174775123596, - "mean_token_accuracy": 0.8672944903373718, - "num_tokens": 8623306.0, - "step": 945 - }, - { - "epoch": 0.7188449848024316, - "grad_norm": 2.8648383617401123, - "learning_rate": 4.526229257971556e-06, - "loss": 0.49864327907562256, - "mean_token_accuracy": 0.8305130004882812, - "num_tokens": 8627466.0, - "step": 946 - }, - { - "epoch": 0.7196048632218845, - "grad_norm": 2.155514717102051, - "learning_rate": 4.52500175489684e-06, - "loss": 0.5070191025733948, - "mean_token_accuracy": 0.8311188817024231, - "num_tokens": 8634759.0, - "step": 947 - }, - { - "epoch": 0.7203647416413373, - "grad_norm": 1.8432683944702148, - "learning_rate": 4.523772830597942e-06, - "loss": 0.5569252371788025, - "mean_token_accuracy": 0.8070821762084961, - "num_tokens": 8644160.0, - "step": 948 - }, - { - "epoch": 0.7211246200607903, - "grad_norm": 2.8912241458892822, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4799427390098572, - "mean_token_accuracy": 0.8443552851676941, - "num_tokens": 8648377.0, - "step": 949 - }, - { - "epoch": 0.7218844984802432, - "grad_norm": 3.3449625968933105, - "learning_rate": 4.521310721778622e-06, - "loss": 0.44043463468551636, - "mean_token_accuracy": 0.8521315455436707, - "num_tokens": 8651846.0, - "step": 950 - }, - { - "epoch": 0.7226443768996961, - "grad_norm": 1.4127917289733887, - "learning_rate": 4.520077538986203e-06, - "loss": 0.4700999855995178, - "mean_token_accuracy": 0.8377952575683594, - "num_tokens": 8665199.0, - "step": 951 - }, - { - "epoch": 0.723404255319149, - "grad_norm": 2.1607301235198975, - "learning_rate": 4.518842938425606e-06, - "loss": 0.4374256730079651, - "mean_token_accuracy": 0.8448896408081055, - "num_tokens": 8672158.0, - "step": 952 - }, - { - "epoch": 0.7241641337386018, - "grad_norm": 1.3442779779434204, - "learning_rate": 4.51760692096332e-06, - "loss": 0.38948923349380493, - "mean_token_accuracy": 0.8598923683166504, - "num_tokens": 8684532.0, - "step": 953 - }, - { - "epoch": 0.7249240121580547, - "grad_norm": 2.0003178119659424, - "learning_rate": 4.516369487466832e-06, - "loss": 0.3797217011451721, - "mean_token_accuracy": 0.8652102947235107, - "num_tokens": 8691460.0, - "step": 954 - }, - { - "epoch": 0.7256838905775076, - "grad_norm": 1.8196535110473633, - "learning_rate": 4.5151306388046175e-06, - "loss": 0.5676811933517456, - "mean_token_accuracy": 0.818500816822052, - "num_tokens": 8701624.0, - "step": 955 - }, - { - "epoch": 0.7264437689969605, - "grad_norm": 2.1962296962738037, - "learning_rate": 4.513890375846152e-06, - "loss": 0.45399484038352966, - "mean_token_accuracy": 0.8463879227638245, - "num_tokens": 8707410.0, - "step": 956 - }, - { - "epoch": 0.7272036474164134, - "grad_norm": 1.8798872232437134, - "learning_rate": 4.512648699461897e-06, - "loss": 0.5679811239242554, - "mean_token_accuracy": 0.8089900016784668, - "num_tokens": 8715630.0, - "step": 957 - }, - { - "epoch": 0.7279635258358662, - "grad_norm": 2.3540258407592773, - "learning_rate": 4.511405610523309e-06, - "loss": 0.5282865762710571, - "mean_token_accuracy": 0.8196114301681519, - "num_tokens": 8721934.0, - "step": 958 - }, - { - "epoch": 0.7287234042553191, - "grad_norm": 2.5630908012390137, - "learning_rate": 4.510161109902837e-06, - "loss": 0.39442378282546997, - "mean_token_accuracy": 0.8400980830192566, - "num_tokens": 8726511.0, - "step": 959 - }, - { - "epoch": 0.729483282674772, - "grad_norm": 1.9829226732254028, - "learning_rate": 4.508915198473919e-06, - "loss": 0.4611976742744446, - "mean_token_accuracy": 0.8439624309539795, - "num_tokens": 8733460.0, - "step": 960 - }, - { - "epoch": 0.7302431610942249, - "grad_norm": 3.0291950702667236, - "learning_rate": 4.507667877110982e-06, - "loss": 0.5158340930938721, - "mean_token_accuracy": 0.8300060033798218, - "num_tokens": 8737629.0, - "step": 961 - }, - { - "epoch": 0.7310030395136778, - "grad_norm": 1.9208252429962158, - "learning_rate": 4.506419146689445e-06, - "loss": 0.3807099163532257, - "mean_token_accuracy": 0.871469259262085, - "num_tokens": 8744615.0, - "step": 962 - }, - { - "epoch": 0.7317629179331308, - "grad_norm": 3.051565408706665, - "learning_rate": 4.505169008085717e-06, - "loss": 0.38461726903915405, - "mean_token_accuracy": 0.874465823173523, - "num_tokens": 8748154.0, - "step": 963 - }, - { - "epoch": 0.7325227963525835, - "grad_norm": 1.375466227531433, - "learning_rate": 4.503917462177192e-06, - "loss": 0.42490679025650024, - "mean_token_accuracy": 0.8457326889038086, - "num_tokens": 8760965.0, - "step": 964 - }, - { - "epoch": 0.7332826747720365, - "grad_norm": 2.216681957244873, - "learning_rate": 4.5026645098422515e-06, - "loss": 0.43149900436401367, - "mean_token_accuracy": 0.8527278900146484, - "num_tokens": 8766996.0, - "step": 965 - }, - { - "epoch": 0.7340425531914894, - "grad_norm": 1.9422595500946045, - "learning_rate": 4.5014101519602684e-06, - "loss": 0.4964504539966583, - "mean_token_accuracy": 0.8137556314468384, - "num_tokens": 8774411.0, - "step": 966 - }, - { - "epoch": 0.7348024316109423, - "grad_norm": 2.058887004852295, - "learning_rate": 4.500154389411598e-06, - "loss": 0.4977570176124573, - "mean_token_accuracy": 0.8254626989364624, - "num_tokens": 8782220.0, - "step": 967 - }, - { - "epoch": 0.7355623100303952, - "grad_norm": 2.9977786540985107, - "learning_rate": 4.498897223077582e-06, - "loss": 0.4061415195465088, - "mean_token_accuracy": 0.8752427101135254, - "num_tokens": 8786120.0, - "step": 968 - }, - { - "epoch": 0.736322188449848, - "grad_norm": 2.2636303901672363, - "learning_rate": 4.49763865384055e-06, - "loss": 0.5062161087989807, - "mean_token_accuracy": 0.8171653747558594, - "num_tokens": 8792459.0, - "step": 969 - }, - { - "epoch": 0.7370820668693009, - "grad_norm": 1.8850842714309692, - "learning_rate": 4.496378682583813e-06, - "loss": 0.5014280676841736, - "mean_token_accuracy": 0.8547511100769043, - "num_tokens": 8800675.0, - "step": 970 - }, - { - "epoch": 0.7378419452887538, - "grad_norm": 1.191985011100769, - "learning_rate": 4.495117310191667e-06, - "loss": 0.4713883101940155, - "mean_token_accuracy": 0.8213596343994141, - "num_tokens": 8820740.0, - "step": 971 - }, - { - "epoch": 0.7386018237082067, - "grad_norm": 1.823000192642212, - "learning_rate": 4.493854537549393e-06, - "loss": 0.46332645416259766, - "mean_token_accuracy": 0.8359860777854919, - "num_tokens": 8828884.0, - "step": 972 - }, - { - "epoch": 0.7393617021276596, - "grad_norm": 2.590446949005127, - "learning_rate": 4.492590365543253e-06, - "loss": 0.49074703454971313, - "mean_token_accuracy": 0.8433758020401001, - "num_tokens": 8833859.0, - "step": 973 - }, - { - "epoch": 0.7401215805471124, - "grad_norm": 2.2762670516967773, - "learning_rate": 4.491324795060491e-06, - "loss": 0.39465656876564026, - "mean_token_accuracy": 0.8734766244888306, - "num_tokens": 8839350.0, - "step": 974 - }, - { - "epoch": 0.7408814589665653, - "grad_norm": 2.698725461959839, - "learning_rate": 4.490057826989333e-06, - "loss": 0.5552085041999817, - "mean_token_accuracy": 0.8132266998291016, - "num_tokens": 8844373.0, - "step": 975 - }, - { - "epoch": 0.7416413373860182, - "grad_norm": 2.704606294631958, - "learning_rate": 4.488789462218988e-06, - "loss": 0.3447791635990143, - "mean_token_accuracy": 0.8736170530319214, - "num_tokens": 8848236.0, - "step": 976 - }, - { - "epoch": 0.7424012158054711, - "grad_norm": 3.1260716915130615, - "learning_rate": 4.487519701639641e-06, - "loss": 0.5945233702659607, - "mean_token_accuracy": 0.7997599840164185, - "num_tokens": 8852935.0, - "step": 977 - }, - { - "epoch": 0.743161094224924, - "grad_norm": 1.6895452737808228, - "learning_rate": 4.486248546142459e-06, - "loss": 0.4823892116546631, - "mean_token_accuracy": 0.8279662132263184, - "num_tokens": 8861743.0, - "step": 978 - }, - { - "epoch": 0.743920972644377, - "grad_norm": 1.9161452054977417, - "learning_rate": 4.4849759966195885e-06, - "loss": 0.5266581773757935, - "mean_token_accuracy": 0.8218623399734497, - "num_tokens": 8870601.0, - "step": 979 - }, - { - "epoch": 0.7446808510638298, - "grad_norm": 1.6894301176071167, - "learning_rate": 4.483702053964154e-06, - "loss": 0.4186219573020935, - "mean_token_accuracy": 0.8471781015396118, - "num_tokens": 8885617.0, - "step": 980 - }, - { - "epoch": 0.7454407294832827, - "grad_norm": 1.6319992542266846, - "learning_rate": 4.482426719070258e-06, - "loss": 0.541317880153656, - "mean_token_accuracy": 0.8216162323951721, - "num_tokens": 8897595.0, - "step": 981 - }, - { - "epoch": 0.7462006079027356, - "grad_norm": 5.102413177490234, - "learning_rate": 4.4811499928329775e-06, - "loss": 0.3928517699241638, - "mean_token_accuracy": 0.858033299446106, - "num_tokens": 8901682.0, - "step": 982 - }, - { - "epoch": 0.7469604863221885, - "grad_norm": 2.213860273361206, - "learning_rate": 4.479871876148368e-06, - "loss": 0.4276347756385803, - "mean_token_accuracy": 0.8529798984527588, - "num_tokens": 8908088.0, - "step": 983 - }, - { - "epoch": 0.7477203647416414, - "grad_norm": 1.2180038690567017, - "learning_rate": 4.478592369913464e-06, - "loss": 0.3941590189933777, - "mean_token_accuracy": 0.8608149290084839, - "num_tokens": 8925876.0, - "step": 984 - }, - { - "epoch": 0.7484802431610942, - "grad_norm": 2.849802255630493, - "learning_rate": 4.477311475026271e-06, - "loss": 0.42190325260162354, - "mean_token_accuracy": 0.860505223274231, - "num_tokens": 8930190.0, - "step": 985 - }, - { - "epoch": 0.7492401215805471, - "grad_norm": 1.704128384590149, - "learning_rate": 4.476029192385769e-06, - "loss": 0.4786282777786255, - "mean_token_accuracy": 0.8302322626113892, - "num_tokens": 8938340.0, - "step": 986 - }, - { - "epoch": 0.75, - "grad_norm": 2.06322979927063, - "learning_rate": 4.474745522891915e-06, - "loss": 0.4648786187171936, - "mean_token_accuracy": 0.8366481065750122, - "num_tokens": 8944633.0, - "step": 987 - }, - { - "epoch": 0.7507598784194529, - "grad_norm": 2.0745396614074707, - "learning_rate": 4.473460467445637e-06, - "loss": 0.5744885206222534, - "mean_token_accuracy": 0.8357284069061279, - "num_tokens": 8954457.0, - "step": 988 - }, - { - "epoch": 0.7515197568389058, - "grad_norm": 1.9281407594680786, - "learning_rate": 4.472174026948836e-06, - "loss": 0.528974175453186, - "mean_token_accuracy": 0.8083580732345581, - "num_tokens": 8962701.0, - "step": 989 - }, - { - "epoch": 0.7522796352583586, - "grad_norm": 3.012381076812744, - "learning_rate": 4.470886202304385e-06, - "loss": 0.48754751682281494, - "mean_token_accuracy": 0.8368391990661621, - "num_tokens": 8967272.0, - "step": 990 - }, - { - "epoch": 0.7530395136778115, - "grad_norm": 1.691826581954956, - "learning_rate": 4.469596994416131e-06, - "loss": 0.484740674495697, - "mean_token_accuracy": 0.8500643968582153, - "num_tokens": 8976615.0, - "step": 991 - }, - { - "epoch": 0.7537993920972644, - "grad_norm": 2.4961965084075928, - "learning_rate": 4.468306404188887e-06, - "loss": 0.50777268409729, - "mean_token_accuracy": 0.8168395757675171, - "num_tokens": 8983235.0, - "step": 992 - }, - { - "epoch": 0.7545592705167173, - "grad_norm": 1.512007713317871, - "learning_rate": 4.467014432528441e-06, - "loss": 0.4583340287208557, - "mean_token_accuracy": 0.8465162515640259, - "num_tokens": 8993815.0, - "step": 993 - }, - { - "epoch": 0.7553191489361702, - "grad_norm": 1.9362257719039917, - "learning_rate": 4.465721080341547e-06, - "loss": 0.6027892827987671, - "mean_token_accuracy": 0.8052380084991455, - "num_tokens": 9002697.0, - "step": 994 - }, - { - "epoch": 0.756079027355623, - "grad_norm": 2.473632335662842, - "learning_rate": 4.4644263485359316e-06, - "loss": 0.5394320487976074, - "mean_token_accuracy": 0.834665834903717, - "num_tokens": 9007428.0, - "step": 995 - }, - { - "epoch": 0.756838905775076, - "grad_norm": 2.2527434825897217, - "learning_rate": 4.463130238020284e-06, - "loss": 0.5485198497772217, - "mean_token_accuracy": 0.8090173006057739, - "num_tokens": 9013570.0, - "step": 996 - }, - { - "epoch": 0.7575987841945289, - "grad_norm": 1.4130940437316895, - "learning_rate": 4.4618327497042676e-06, - "loss": 0.37994423508644104, - "mean_token_accuracy": 0.8625167012214661, - "num_tokens": 9025485.0, - "step": 997 - }, - { - "epoch": 0.7583586626139818, - "grad_norm": 2.685115098953247, - "learning_rate": 4.460533884498509e-06, - "loss": 0.447973370552063, - "mean_token_accuracy": 0.8564165234565735, - "num_tokens": 9030355.0, - "step": 998 - }, - { - "epoch": 0.7591185410334347, - "grad_norm": 3.2743139266967773, - "learning_rate": 4.4592336433146e-06, - "loss": 0.45275989174842834, - "mean_token_accuracy": 0.8462578058242798, - "num_tokens": 9034406.0, - "step": 999 - }, - { - "epoch": 0.7598784194528876, - "grad_norm": 1.9383049011230469, - "learning_rate": 4.457932027065102e-06, - "loss": 0.5387729406356812, - "mean_token_accuracy": 0.8357330560684204, - "num_tokens": 9041502.0, - "step": 1000 - }, - { - "epoch": 0.7606382978723404, - "grad_norm": 2.7348275184631348, - "learning_rate": 4.456629036663537e-06, - "loss": 0.4448447823524475, - "mean_token_accuracy": 0.8453642129898071, - "num_tokens": 9046088.0, - "step": 1001 - }, - { - "epoch": 0.7613981762917933, - "grad_norm": 1.8477401733398438, - "learning_rate": 4.455324673024396e-06, - "loss": 0.5766505002975464, - "mean_token_accuracy": 0.8074213862419128, - "num_tokens": 9055678.0, - "step": 1002 - }, - { - "epoch": 0.7621580547112462, - "grad_norm": 3.134481430053711, - "learning_rate": 4.4540189370631315e-06, - "loss": 0.5690872669219971, - "mean_token_accuracy": 0.8414670825004578, - "num_tokens": 9062006.0, - "step": 1003 - }, - { - "epoch": 0.7629179331306991, - "grad_norm": 1.7933398485183716, - "learning_rate": 4.452711829696158e-06, - "loss": 0.4898291826248169, - "mean_token_accuracy": 0.8259007930755615, - "num_tokens": 9070754.0, - "step": 1004 - }, - { - "epoch": 0.763677811550152, - "grad_norm": 1.2552275657653809, - "learning_rate": 4.451403351840855e-06, - "loss": 0.4280198812484741, - "mean_token_accuracy": 0.8409112691879272, - "num_tokens": 9085306.0, - "step": 1005 - }, - { - "epoch": 0.7644376899696048, - "grad_norm": 1.6749331951141357, - "learning_rate": 4.450093504415562e-06, - "loss": 0.3723178505897522, - "mean_token_accuracy": 0.8545734882354736, - "num_tokens": 9102453.0, - "step": 1006 - }, - { - "epoch": 0.7651975683890577, - "grad_norm": 2.7514500617980957, - "learning_rate": 4.44878228833958e-06, - "loss": 0.5463190674781799, - "mean_token_accuracy": 0.8121639490127563, - "num_tokens": 9108342.0, - "step": 1007 - }, - { - "epoch": 0.7659574468085106, - "grad_norm": 1.3322733640670776, - "learning_rate": 4.447469704533172e-06, - "loss": 0.573723316192627, - "mean_token_accuracy": 0.8065711259841919, - "num_tokens": 9123712.0, - "step": 1008 - }, - { - "epoch": 0.7667173252279635, - "grad_norm": 2.6893765926361084, - "learning_rate": 4.446155753917559e-06, - "loss": 0.6856257915496826, - "mean_token_accuracy": 0.7718256711959839, - "num_tokens": 9130728.0, - "step": 1009 - }, - { - "epoch": 0.7674772036474165, - "grad_norm": 1.792765498161316, - "learning_rate": 4.444840437414923e-06, - "loss": 0.48203110694885254, - "mean_token_accuracy": 0.8419194221496582, - "num_tokens": 9137983.0, - "step": 1010 - }, - { - "epoch": 0.7682370820668692, - "grad_norm": 1.4957399368286133, - "learning_rate": 4.443523755948401e-06, - "loss": 0.4372181296348572, - "mean_token_accuracy": 0.8491764664649963, - "num_tokens": 9148081.0, - "step": 1011 - }, - { - "epoch": 0.7689969604863222, - "grad_norm": 1.7294867038726807, - "learning_rate": 4.442205710442095e-06, - "loss": 0.54277503490448, - "mean_token_accuracy": 0.8196806907653809, - "num_tokens": 9158407.0, - "step": 1012 - }, - { - "epoch": 0.7697568389057751, - "grad_norm": 2.2091221809387207, - "learning_rate": 4.4408863018210564e-06, - "loss": 0.4888187646865845, - "mean_token_accuracy": 0.8384175300598145, - "num_tokens": 9164754.0, - "step": 1013 - }, - { - "epoch": 0.770516717325228, - "grad_norm": 1.7615830898284912, - "learning_rate": 4.439565531011299e-06, - "loss": 0.4640008211135864, - "mean_token_accuracy": 0.8424701690673828, - "num_tokens": 9172715.0, - "step": 1014 - }, - { - "epoch": 0.7712765957446809, - "grad_norm": 1.6796128749847412, - "learning_rate": 4.43824339893979e-06, - "loss": 0.5227609276771545, - "mean_token_accuracy": 0.8135923743247986, - "num_tokens": 9183214.0, - "step": 1015 - }, - { - "epoch": 0.7720364741641338, - "grad_norm": 2.1485698223114014, - "learning_rate": 4.436919906534452e-06, - "loss": 0.4857056140899658, - "mean_token_accuracy": 0.8323013782501221, - "num_tokens": 9190360.0, - "step": 1016 - }, - { - "epoch": 0.7727963525835866, - "grad_norm": 2.7842206954956055, - "learning_rate": 4.4355950547241645e-06, - "loss": 0.46406883001327515, - "mean_token_accuracy": 0.859869122505188, - "num_tokens": 9194523.0, - "step": 1017 - }, - { - "epoch": 0.7735562310030395, - "grad_norm": 2.3774640560150146, - "learning_rate": 4.434268844438758e-06, - "loss": 0.5625549554824829, - "mean_token_accuracy": 0.8188897371292114, - "num_tokens": 9201155.0, - "step": 1018 - }, - { - "epoch": 0.7743161094224924, - "grad_norm": 2.004427909851074, - "learning_rate": 4.432941276609018e-06, - "loss": 0.5164387226104736, - "mean_token_accuracy": 0.829569935798645, - "num_tokens": 9209269.0, - "step": 1019 - }, - { - "epoch": 0.7750759878419453, - "grad_norm": 1.7218989133834839, - "learning_rate": 4.431612352166684e-06, - "loss": 0.481005996465683, - "mean_token_accuracy": 0.8359906673431396, - "num_tokens": 9220860.0, - "step": 1020 - }, - { - "epoch": 0.7758358662613982, - "grad_norm": 2.197108507156372, - "learning_rate": 4.4302820720444454e-06, - "loss": 0.440413236618042, - "mean_token_accuracy": 0.8412867784500122, - "num_tokens": 9226414.0, - "step": 1021 - }, - { - "epoch": 0.776595744680851, - "grad_norm": 2.6995162963867188, - "learning_rate": 4.428950437175944e-06, - "loss": 0.3884299397468567, - "mean_token_accuracy": 0.8696021437644958, - "num_tokens": 9230898.0, - "step": 1022 - }, - { - "epoch": 0.7773556231003039, - "grad_norm": 2.1671667098999023, - "learning_rate": 4.427617448495772e-06, - "loss": 0.5747478008270264, - "mean_token_accuracy": 0.7842930555343628, - "num_tokens": 9238479.0, - "step": 1023 - }, - { - "epoch": 0.7781155015197568, - "grad_norm": 1.6299028396606445, - "learning_rate": 4.426283106939474e-06, - "loss": 0.39478403329849243, - "mean_token_accuracy": 0.8685503602027893, - "num_tokens": 9248263.0, - "step": 1024 - }, - { - "epoch": 0.7788753799392097, - "grad_norm": 2.2621798515319824, - "learning_rate": 4.424947413443539e-06, - "loss": 0.4582178592681885, - "mean_token_accuracy": 0.8312377333641052, - "num_tokens": 9254168.0, - "step": 1025 - }, - { - "epoch": 0.7796352583586627, - "grad_norm": 2.121091365814209, - "learning_rate": 4.423610368945411e-06, - "loss": 0.5315121412277222, - "mean_token_accuracy": 0.8121483325958252, - "num_tokens": 9261808.0, - "step": 1026 - }, - { - "epoch": 0.7803951367781155, - "grad_norm": 1.8558297157287598, - "learning_rate": 4.422271974383479e-06, - "loss": 0.4299176037311554, - "mean_token_accuracy": 0.8452648520469666, - "num_tokens": 9269264.0, - "step": 1027 - }, - { - "epoch": 0.7811550151975684, - "grad_norm": 1.9089949131011963, - "learning_rate": 4.420932230697079e-06, - "loss": 0.43876272439956665, - "mean_token_accuracy": 0.8434094190597534, - "num_tokens": 9277381.0, - "step": 1028 - }, - { - "epoch": 0.7819148936170213, - "grad_norm": 1.8619649410247803, - "learning_rate": 4.419591138826495e-06, - "loss": 0.48798668384552, - "mean_token_accuracy": 0.8281317353248596, - "num_tokens": 9285413.0, - "step": 1029 - }, - { - "epoch": 0.7826747720364742, - "grad_norm": 1.3273087739944458, - "learning_rate": 4.418248699712955e-06, - "loss": 0.4611460864543915, - "mean_token_accuracy": 0.8233213424682617, - "num_tokens": 9300805.0, - "step": 1030 - }, - { - "epoch": 0.7834346504559271, - "grad_norm": 1.0473746061325073, - "learning_rate": 4.416904914298637e-06, - "loss": 0.36537665128707886, - "mean_token_accuracy": 0.8671857118606567, - "num_tokens": 9320035.0, - "step": 1031 - }, - { - "epoch": 0.78419452887538, - "grad_norm": 1.9130918979644775, - "learning_rate": 4.415559783526661e-06, - "loss": 0.4916655123233795, - "mean_token_accuracy": 0.8266351222991943, - "num_tokens": 9326795.0, - "step": 1032 - }, - { - "epoch": 0.7849544072948328, - "grad_norm": 2.0001816749572754, - "learning_rate": 4.414213308341092e-06, - "loss": 0.5711008310317993, - "mean_token_accuracy": 0.8093076348304749, - "num_tokens": 9335625.0, - "step": 1033 - }, - { - "epoch": 0.7857142857142857, - "grad_norm": 3.933542251586914, - "learning_rate": 4.412865489686936e-06, - "loss": 0.621616542339325, - "mean_token_accuracy": 0.7938898801803589, - "num_tokens": 9339080.0, - "step": 1034 - }, - { - "epoch": 0.7864741641337386, - "grad_norm": 2.061558961868286, - "learning_rate": 4.411516328510145e-06, - "loss": 0.583686113357544, - "mean_token_accuracy": 0.8216883540153503, - "num_tokens": 9348581.0, - "step": 1035 - }, - { - "epoch": 0.7872340425531915, - "grad_norm": 1.9401264190673828, - "learning_rate": 4.410165825757613e-06, - "loss": 0.4905240535736084, - "mean_token_accuracy": 0.8229951858520508, - "num_tokens": 9356032.0, - "step": 1036 - }, - { - "epoch": 0.7879939209726444, - "grad_norm": 3.620547294616699, - "learning_rate": 4.408813982377175e-06, - "loss": 0.4269888997077942, - "mean_token_accuracy": 0.8713940978050232, - "num_tokens": 9359061.0, - "step": 1037 - }, - { - "epoch": 0.7887537993920972, - "grad_norm": 1.2027851343154907, - "learning_rate": 4.407460799317605e-06, - "loss": 0.39972418546676636, - "mean_token_accuracy": 0.8610097765922546, - "num_tokens": 9377068.0, - "step": 1038 - }, - { - "epoch": 0.7895136778115501, - "grad_norm": 2.566753387451172, - "learning_rate": 4.40610627752862e-06, - "loss": 0.45267152786254883, - "mean_token_accuracy": 0.83243328332901, - "num_tokens": 9383604.0, - "step": 1039 - }, - { - "epoch": 0.790273556231003, - "grad_norm": 2.940094470977783, - "learning_rate": 4.404750417960876e-06, - "loss": 0.42862242460250854, - "mean_token_accuracy": 0.8582849502563477, - "num_tokens": 9387541.0, - "step": 1040 - }, - { - "epoch": 0.791033434650456, - "grad_norm": 2.0223944187164307, - "learning_rate": 4.403393221565966e-06, - "loss": 0.4349963665008545, - "mean_token_accuracy": 0.8453047871589661, - "num_tokens": 9394382.0, - "step": 1041 - }, - { - "epoch": 0.7917933130699089, - "grad_norm": 2.9399030208587646, - "learning_rate": 4.402034689296425e-06, - "loss": 0.32197174429893494, - "mean_token_accuracy": 0.8953392505645752, - "num_tokens": 9397741.0, - "step": 1042 - }, - { - "epoch": 0.7925531914893617, - "grad_norm": 2.819016456604004, - "learning_rate": 4.400674822105721e-06, - "loss": 0.6790289878845215, - "mean_token_accuracy": 0.8135063648223877, - "num_tokens": 9403509.0, - "step": 1043 - }, - { - "epoch": 0.7933130699088146, - "grad_norm": 1.3225977420806885, - "learning_rate": 4.399313620948262e-06, - "loss": 0.42203834652900696, - "mean_token_accuracy": 0.8399381637573242, - "num_tokens": 9418870.0, - "step": 1044 - }, - { - "epoch": 0.7940729483282675, - "grad_norm": 1.7822176218032837, - "learning_rate": 4.397951086779392e-06, - "loss": 0.4666554927825928, - "mean_token_accuracy": 0.8364764451980591, - "num_tokens": 9427640.0, - "step": 1045 - }, - { - "epoch": 0.7948328267477204, - "grad_norm": 3.186439037322998, - "learning_rate": 4.396587220555389e-06, - "loss": 0.6048363447189331, - "mean_token_accuracy": 0.7806557416915894, - "num_tokens": 9431927.0, - "step": 1046 - }, - { - "epoch": 0.7955927051671733, - "grad_norm": 3.0804805755615234, - "learning_rate": 4.395222023233467e-06, - "loss": 0.445969820022583, - "mean_token_accuracy": 0.850671112537384, - "num_tokens": 9436136.0, - "step": 1047 - }, - { - "epoch": 0.7963525835866262, - "grad_norm": 1.675968885421753, - "learning_rate": 4.393855495771774e-06, - "loss": 0.4311422109603882, - "mean_token_accuracy": 0.8449079990386963, - "num_tokens": 9445189.0, - "step": 1048 - }, - { - "epoch": 0.797112462006079, - "grad_norm": 2.342410087585449, - "learning_rate": 4.3924876391293915e-06, - "loss": 0.5733606219291687, - "mean_token_accuracy": 0.8156592845916748, - "num_tokens": 9451939.0, - "step": 1049 - }, - { - "epoch": 0.7978723404255319, - "grad_norm": 1.5967470407485962, - "learning_rate": 4.391118454266335e-06, - "loss": 0.46664729714393616, - "mean_token_accuracy": 0.8091695308685303, - "num_tokens": 9463968.0, - "step": 1050 - }, - { - "epoch": 0.7986322188449848, - "grad_norm": 1.5777863264083862, - "learning_rate": 4.389747942143549e-06, - "loss": 0.46028903126716614, - "mean_token_accuracy": 0.8347330093383789, - "num_tokens": 9475561.0, - "step": 1051 - }, - { - "epoch": 0.7993920972644377, - "grad_norm": 2.7630488872528076, - "learning_rate": 4.388376103722914e-06, - "loss": 0.5618188977241516, - "mean_token_accuracy": 0.8273467421531677, - "num_tokens": 9480661.0, - "step": 1052 - }, - { - "epoch": 0.8001519756838906, - "grad_norm": 2.093397378921509, - "learning_rate": 4.387002939967237e-06, - "loss": 0.2998353838920593, - "mean_token_accuracy": 0.8905231952667236, - "num_tokens": 9485924.0, - "step": 1053 - }, - { - "epoch": 0.8009118541033434, - "grad_norm": 1.4385871887207031, - "learning_rate": 4.38562845184026e-06, - "loss": 0.4944111704826355, - "mean_token_accuracy": 0.8403056263923645, - "num_tokens": 9500128.0, - "step": 1054 - }, - { - "epoch": 0.8016717325227963, - "grad_norm": 1.6393156051635742, - "learning_rate": 4.384252640306649e-06, - "loss": 0.5727907419204712, - "mean_token_accuracy": 0.7849414348602295, - "num_tokens": 9511569.0, - "step": 1055 - }, - { - "epoch": 0.8024316109422492, - "grad_norm": 2.3909664154052734, - "learning_rate": 4.382875506332002e-06, - "loss": 0.4760419726371765, - "mean_token_accuracy": 0.8408266305923462, - "num_tokens": 9517244.0, - "step": 1056 - }, - { - "epoch": 0.8031914893617021, - "grad_norm": 1.7288594245910645, - "learning_rate": 4.381497050882845e-06, - "loss": 0.5375926494598389, - "mean_token_accuracy": 0.8138614892959595, - "num_tokens": 9528736.0, - "step": 1057 - }, - { - "epoch": 0.8039513677811551, - "grad_norm": 2.093407392501831, - "learning_rate": 4.380117274926632e-06, - "loss": 0.46659404039382935, - "mean_token_accuracy": 0.8450702428817749, - "num_tokens": 9536200.0, - "step": 1058 - }, - { - "epoch": 0.8047112462006079, - "grad_norm": 1.6835898160934448, - "learning_rate": 4.3787361794317405e-06, - "loss": 0.43157699704170227, - "mean_token_accuracy": 0.8279973268508911, - "num_tokens": 9546314.0, - "step": 1059 - }, - { - "epoch": 0.8054711246200608, - "grad_norm": 1.983067512512207, - "learning_rate": 4.377353765367479e-06, - "loss": 0.5021739602088928, - "mean_token_accuracy": 0.8274815082550049, - "num_tokens": 9554375.0, - "step": 1060 - }, - { - "epoch": 0.8062310030395137, - "grad_norm": 2.0472030639648438, - "learning_rate": 4.375970033704078e-06, - "loss": 0.34298190474510193, - "mean_token_accuracy": 0.8900876045227051, - "num_tokens": 9560230.0, - "step": 1061 - }, - { - "epoch": 0.8069908814589666, - "grad_norm": 1.9613717794418335, - "learning_rate": 4.374584985412692e-06, - "loss": 0.3826758861541748, - "mean_token_accuracy": 0.839923620223999, - "num_tokens": 9566809.0, - "step": 1062 - }, - { - "epoch": 0.8077507598784195, - "grad_norm": 1.991289496421814, - "learning_rate": 4.373198621465405e-06, - "loss": 0.5492525100708008, - "mean_token_accuracy": 0.8153272867202759, - "num_tokens": 9576810.0, - "step": 1063 - }, - { - "epoch": 0.8085106382978723, - "grad_norm": 2.421370506286621, - "learning_rate": 4.3718109428352155e-06, - "loss": 0.5240297317504883, - "mean_token_accuracy": 0.8087242245674133, - "num_tokens": 9582906.0, - "step": 1064 - }, - { - "epoch": 0.8092705167173252, - "grad_norm": 3.697765588760376, - "learning_rate": 4.370421950496055e-06, - "loss": 0.6096476912498474, - "mean_token_accuracy": 0.787585973739624, - "num_tokens": 9586920.0, - "step": 1065 - }, - { - "epoch": 0.8100303951367781, - "grad_norm": 2.0767786502838135, - "learning_rate": 4.369031645422768e-06, - "loss": 0.41120079159736633, - "mean_token_accuracy": 0.8513731956481934, - "num_tokens": 9593902.0, - "step": 1066 - }, - { - "epoch": 0.810790273556231, - "grad_norm": 2.5968732833862305, - "learning_rate": 4.367640028591126e-06, - "loss": 0.3364982008934021, - "mean_token_accuracy": 0.8786963224411011, - "num_tokens": 9597745.0, - "step": 1067 - }, - { - "epoch": 0.8115501519756839, - "grad_norm": 2.165742874145508, - "learning_rate": 4.366247100977818e-06, - "loss": 0.406129390001297, - "mean_token_accuracy": 0.868243932723999, - "num_tokens": 9603496.0, - "step": 1068 - }, - { - "epoch": 0.8123100303951368, - "grad_norm": 2.0493404865264893, - "learning_rate": 4.364852863560456e-06, - "loss": 0.5356296300888062, - "mean_token_accuracy": 0.8191947340965271, - "num_tokens": 9610898.0, - "step": 1069 - }, - { - "epoch": 0.8130699088145896, - "grad_norm": 2.3224308490753174, - "learning_rate": 4.363457317317568e-06, - "loss": 0.41461923718452454, - "mean_token_accuracy": 0.8537945747375488, - "num_tokens": 9616626.0, - "step": 1070 - }, - { - "epoch": 0.8138297872340425, - "grad_norm": 1.7387986183166504, - "learning_rate": 4.362060463228603e-06, - "loss": 0.5134786367416382, - "mean_token_accuracy": 0.8511737585067749, - "num_tokens": 9626223.0, - "step": 1071 - }, - { - "epoch": 0.8145896656534954, - "grad_norm": 3.0270655155181885, - "learning_rate": 4.360662302273926e-06, - "loss": 0.3410695791244507, - "mean_token_accuracy": 0.8746449947357178, - "num_tokens": 9629455.0, - "step": 1072 - }, - { - "epoch": 0.8153495440729484, - "grad_norm": 1.7727062702178955, - "learning_rate": 4.35926283543482e-06, - "loss": 0.4610968828201294, - "mean_token_accuracy": 0.8444793224334717, - "num_tokens": 9638070.0, - "step": 1073 - }, - { - "epoch": 0.8161094224924013, - "grad_norm": 3.6333565711975098, - "learning_rate": 4.357862063693486e-06, - "loss": 0.3881273865699768, - "mean_token_accuracy": 0.8757344484329224, - "num_tokens": 9641028.0, - "step": 1074 - }, - { - "epoch": 0.8168693009118541, - "grad_norm": 3.024042844772339, - "learning_rate": 4.356459988033039e-06, - "loss": 0.3853808641433716, - "mean_token_accuracy": 0.8602254390716553, - "num_tokens": 9645730.0, - "step": 1075 - }, - { - "epoch": 0.817629179331307, - "grad_norm": 2.3359482288360596, - "learning_rate": 4.355056609437509e-06, - "loss": 0.4852045476436615, - "mean_token_accuracy": 0.8502728343009949, - "num_tokens": 9650975.0, - "step": 1076 - }, - { - "epoch": 0.8183890577507599, - "grad_norm": 2.2390685081481934, - "learning_rate": 4.353651928891842e-06, - "loss": 0.5287341475486755, - "mean_token_accuracy": 0.8247801065444946, - "num_tokens": 9657471.0, - "step": 1077 - }, - { - "epoch": 0.8191489361702128, - "grad_norm": 2.3809144496917725, - "learning_rate": 4.352245947381897e-06, - "loss": 0.5218510627746582, - "mean_token_accuracy": 0.8149170875549316, - "num_tokens": 9664108.0, - "step": 1078 - }, - { - "epoch": 0.8199088145896657, - "grad_norm": 1.7072309255599976, - "learning_rate": 4.3508386658944455e-06, - "loss": 0.46481168270111084, - "mean_token_accuracy": 0.834963321685791, - "num_tokens": 9673175.0, - "step": 1079 - }, - { - "epoch": 0.8206686930091185, - "grad_norm": 1.7383702993392944, - "learning_rate": 4.349430085417171e-06, - "loss": 0.4505952000617981, - "mean_token_accuracy": 0.8507769107818604, - "num_tokens": 9682800.0, - "step": 1080 - }, - { - "epoch": 0.8214285714285714, - "grad_norm": 2.4308547973632812, - "learning_rate": 4.348020206938672e-06, - "loss": 0.4832455515861511, - "mean_token_accuracy": 0.8538393974304199, - "num_tokens": 9688123.0, - "step": 1081 - }, - { - "epoch": 0.8221884498480243, - "grad_norm": 2.2686192989349365, - "learning_rate": 4.3466090314484526e-06, - "loss": 0.5112563371658325, - "mean_token_accuracy": 0.8308460712432861, - "num_tokens": 9694299.0, - "step": 1082 - }, - { - "epoch": 0.8229483282674772, - "grad_norm": 2.806093454360962, - "learning_rate": 4.345196559936931e-06, - "loss": 0.4818246364593506, - "mean_token_accuracy": 0.86617112159729, - "num_tokens": 9698471.0, - "step": 1083 - }, - { - "epoch": 0.8237082066869301, - "grad_norm": 1.7340706586837769, - "learning_rate": 4.343782793395435e-06, - "loss": 0.38246971368789673, - "mean_token_accuracy": 0.8675198554992676, - "num_tokens": 9706444.0, - "step": 1084 - }, - { - "epoch": 0.824468085106383, - "grad_norm": 1.664942741394043, - "learning_rate": 4.3423677328162e-06, - "loss": 0.498797208070755, - "mean_token_accuracy": 0.8447319865226746, - "num_tokens": 9716765.0, - "step": 1085 - }, - { - "epoch": 0.8252279635258358, - "grad_norm": 1.3608235120773315, - "learning_rate": 4.340951379192369e-06, - "loss": 0.41961491107940674, - "mean_token_accuracy": 0.8339346647262573, - "num_tokens": 9729564.0, - "step": 1086 - }, - { - "epoch": 0.8259878419452887, - "grad_norm": 1.642503261566162, - "learning_rate": 4.3395337335179945e-06, - "loss": 0.5477945804595947, - "mean_token_accuracy": 0.8117889761924744, - "num_tokens": 9741217.0, - "step": 1087 - }, - { - "epoch": 0.8267477203647416, - "grad_norm": 3.0345044136047363, - "learning_rate": 4.338114796788035e-06, - "loss": 0.5024623870849609, - "mean_token_accuracy": 0.8333141207695007, - "num_tokens": 9744941.0, - "step": 1088 - }, - { - "epoch": 0.8275075987841946, - "grad_norm": 1.3096630573272705, - "learning_rate": 4.336694569998354e-06, - "loss": 0.44169723987579346, - "mean_token_accuracy": 0.859926700592041, - "num_tokens": 9757854.0, - "step": 1089 - }, - { - "epoch": 0.8282674772036475, - "grad_norm": 2.203279495239258, - "learning_rate": 4.3352730541457215e-06, - "loss": 0.5283265113830566, - "mean_token_accuracy": 0.8053759932518005, - "num_tokens": 9764096.0, - "step": 1090 - }, - { - "epoch": 0.8290273556231003, - "grad_norm": 1.3774312734603882, - "learning_rate": 4.333850250227814e-06, - "loss": 0.4584103226661682, - "mean_token_accuracy": 0.8342611193656921, - "num_tokens": 9777768.0, - "step": 1091 - }, - { - "epoch": 0.8297872340425532, - "grad_norm": 1.822637915611267, - "learning_rate": 4.332426159243206e-06, - "loss": 0.5432791709899902, - "mean_token_accuracy": 0.8136210441589355, - "num_tokens": 9791276.0, - "step": 1092 - }, - { - "epoch": 0.8305471124620061, - "grad_norm": 3.0190067291259766, - "learning_rate": 4.331000782191384e-06, - "loss": 0.5018150806427002, - "mean_token_accuracy": 0.8234807252883911, - "num_tokens": 9794902.0, - "step": 1093 - }, - { - "epoch": 0.831306990881459, - "grad_norm": 2.09987735748291, - "learning_rate": 4.329574120072728e-06, - "loss": 0.4270891547203064, - "mean_token_accuracy": 0.8544977903366089, - "num_tokens": 9800903.0, - "step": 1094 - }, - { - "epoch": 0.8320668693009119, - "grad_norm": 1.969549536705017, - "learning_rate": 4.328146173888528e-06, - "loss": 0.45801427960395813, - "mean_token_accuracy": 0.8334714770317078, - "num_tokens": 9808719.0, - "step": 1095 - }, - { - "epoch": 0.8328267477203647, - "grad_norm": 1.4565571546554565, - "learning_rate": 4.32671694464097e-06, - "loss": 0.34864288568496704, - "mean_token_accuracy": 0.8689061999320984, - "num_tokens": 9818262.0, - "step": 1096 - }, - { - "epoch": 0.8335866261398176, - "grad_norm": 1.2163832187652588, - "learning_rate": 4.3252864333331424e-06, - "loss": 0.37953704595565796, - "mean_token_accuracy": 0.866554856300354, - "num_tokens": 9833942.0, - "step": 1097 - }, - { - "epoch": 0.8343465045592705, - "grad_norm": 1.6112010478973389, - "learning_rate": 4.323854640969033e-06, - "loss": 0.5442801713943481, - "mean_token_accuracy": 0.8190416097640991, - "num_tokens": 9844765.0, - "step": 1098 - }, - { - "epoch": 0.8351063829787234, - "grad_norm": 1.8190315961837769, - "learning_rate": 4.322421568553529e-06, - "loss": 0.48271381855010986, - "mean_token_accuracy": 0.8203652501106262, - "num_tokens": 9852625.0, - "step": 1099 - }, - { - "epoch": 0.8358662613981763, - "grad_norm": 2.7897756099700928, - "learning_rate": 4.320987217092416e-06, - "loss": 0.4086323380470276, - "mean_token_accuracy": 0.8504934310913086, - "num_tokens": 9856888.0, - "step": 1100 - }, - { - "epoch": 0.8366261398176292, - "grad_norm": 1.7035977840423584, - "learning_rate": 4.319551587592377e-06, - "loss": 0.6325064301490784, - "mean_token_accuracy": 0.788190484046936, - "num_tokens": 9869419.0, - "step": 1101 - }, - { - "epoch": 0.837386018237082, - "grad_norm": 2.609731912612915, - "learning_rate": 4.318114681060989e-06, - "loss": 0.519314706325531, - "mean_token_accuracy": 0.8469992280006409, - "num_tokens": 9874553.0, - "step": 1102 - }, - { - "epoch": 0.8381458966565349, - "grad_norm": 1.2519766092300415, - "learning_rate": 4.316676498506735e-06, - "loss": 0.3566005825996399, - "mean_token_accuracy": 0.8588439226150513, - "num_tokens": 9886498.0, - "step": 1103 - }, - { - "epoch": 0.8389057750759878, - "grad_norm": 1.430892825126648, - "learning_rate": 4.3152370409389795e-06, - "loss": 0.5250182747840881, - "mean_token_accuracy": 0.8164948225021362, - "num_tokens": 9900256.0, - "step": 1104 - }, - { - "epoch": 0.8396656534954408, - "grad_norm": 3.1245436668395996, - "learning_rate": 4.3137963093679945e-06, - "loss": 0.3173971176147461, - "mean_token_accuracy": 0.8835347890853882, - "num_tokens": 9903899.0, - "step": 1105 - }, - { - "epoch": 0.8404255319148937, - "grad_norm": 3.131812572479248, - "learning_rate": 4.3123543048049395e-06, - "loss": 0.6567763090133667, - "mean_token_accuracy": 0.8233605027198792, - "num_tokens": 9908798.0, - "step": 1106 - }, - { - "epoch": 0.8411854103343465, - "grad_norm": 1.3551725149154663, - "learning_rate": 4.310911028261867e-06, - "loss": 0.3993729054927826, - "mean_token_accuracy": 0.8529655933380127, - "num_tokens": 9922577.0, - "step": 1107 - }, - { - "epoch": 0.8419452887537994, - "grad_norm": 2.572533130645752, - "learning_rate": 4.309466480751726e-06, - "loss": 0.40906503796577454, - "mean_token_accuracy": 0.8630726933479309, - "num_tokens": 9926890.0, - "step": 1108 - }, - { - "epoch": 0.8427051671732523, - "grad_norm": 1.9146469831466675, - "learning_rate": 4.308020663288356e-06, - "loss": 0.48423194885253906, - "mean_token_accuracy": 0.8370280861854553, - "num_tokens": 9934293.0, - "step": 1109 - }, - { - "epoch": 0.8434650455927052, - "grad_norm": 1.6178001165390015, - "learning_rate": 4.306573576886485e-06, - "loss": 0.4262213408946991, - "mean_token_accuracy": 0.839401125907898, - "num_tokens": 9944513.0, - "step": 1110 - }, - { - "epoch": 0.8442249240121581, - "grad_norm": 2.4444572925567627, - "learning_rate": 4.305125222561736e-06, - "loss": 0.5199950933456421, - "mean_token_accuracy": 0.8507720232009888, - "num_tokens": 9949512.0, - "step": 1111 - }, - { - "epoch": 0.8449848024316109, - "grad_norm": 1.7983134984970093, - "learning_rate": 4.303675601330618e-06, - "loss": 0.36155956983566284, - "mean_token_accuracy": 0.8568712472915649, - "num_tokens": 9956402.0, - "step": 1112 - }, - { - "epoch": 0.8457446808510638, - "grad_norm": 2.391096353530884, - "learning_rate": 4.302224714210532e-06, - "loss": 0.5391949415206909, - "mean_token_accuracy": 0.8183057308197021, - "num_tokens": 9961606.0, - "step": 1113 - }, - { - "epoch": 0.8465045592705167, - "grad_norm": 1.8520214557647705, - "learning_rate": 4.3007725622197675e-06, - "loss": 0.5758882761001587, - "mean_token_accuracy": 0.7924330234527588, - "num_tokens": 9971473.0, - "step": 1114 - }, - { - "epoch": 0.8472644376899696, - "grad_norm": 2.436640739440918, - "learning_rate": 4.2993191463775e-06, - "loss": 0.3837985396385193, - "mean_token_accuracy": 0.8620110750198364, - "num_tokens": 9976333.0, - "step": 1115 - }, - { - "epoch": 0.8480243161094225, - "grad_norm": 1.7287120819091797, - "learning_rate": 4.29786446770379e-06, - "loss": 0.40066856145858765, - "mean_token_accuracy": 0.8618333339691162, - "num_tokens": 9985617.0, - "step": 1116 - }, - { - "epoch": 0.8487841945288754, - "grad_norm": 2.0310518741607666, - "learning_rate": 4.296408527219592e-06, - "loss": 0.5465943217277527, - "mean_token_accuracy": 0.812044620513916, - "num_tokens": 9995363.0, - "step": 1117 - }, - { - "epoch": 0.8495440729483282, - "grad_norm": 1.4858589172363281, - "learning_rate": 4.294951325946737e-06, - "loss": 0.45840176939964294, - "mean_token_accuracy": 0.8432979583740234, - "num_tokens": 10006400.0, - "step": 1118 - }, - { - "epoch": 0.8503039513677811, - "grad_norm": 1.6153514385223389, - "learning_rate": 4.293492864907947e-06, - "loss": 0.5225611925125122, - "mean_token_accuracy": 0.8180211186408997, - "num_tokens": 10018352.0, - "step": 1119 - }, - { - "epoch": 0.851063829787234, - "grad_norm": 2.1178412437438965, - "learning_rate": 4.2920331451268246e-06, - "loss": 0.5580621361732483, - "mean_token_accuracy": 0.8211709260940552, - "num_tokens": 10025614.0, - "step": 1120 - }, - { - "epoch": 0.851823708206687, - "grad_norm": 2.036839246749878, - "learning_rate": 4.2905721676278585e-06, - "loss": 0.4658433198928833, - "mean_token_accuracy": 0.8380423784255981, - "num_tokens": 10032489.0, - "step": 1121 - }, - { - "epoch": 0.8525835866261399, - "grad_norm": 2.0056262016296387, - "learning_rate": 4.28910993343642e-06, - "loss": 0.47023308277130127, - "mean_token_accuracy": 0.8340359926223755, - "num_tokens": 10040050.0, - "step": 1122 - }, - { - "epoch": 0.8533434650455927, - "grad_norm": 2.540024518966675, - "learning_rate": 4.2876464435787576e-06, - "loss": 0.502303957939148, - "mean_token_accuracy": 0.8288739919662476, - "num_tokens": 10045042.0, - "step": 1123 - }, - { - "epoch": 0.8541033434650456, - "grad_norm": 1.7894693613052368, - "learning_rate": 4.286181699082008e-06, - "loss": 0.4732973575592041, - "mean_token_accuracy": 0.8340568542480469, - "num_tokens": 10054424.0, - "step": 1124 - }, - { - "epoch": 0.8548632218844985, - "grad_norm": 1.5601223707199097, - "learning_rate": 4.284715700974186e-06, - "loss": 0.472471684217453, - "mean_token_accuracy": 0.8274722695350647, - "num_tokens": 10065523.0, - "step": 1125 - }, - { - "epoch": 0.8556231003039514, - "grad_norm": 1.7326055765151978, - "learning_rate": 4.283248450284182e-06, - "loss": 0.5924872159957886, - "mean_token_accuracy": 0.7943467497825623, - "num_tokens": 10076839.0, - "step": 1126 - }, - { - "epoch": 0.8563829787234043, - "grad_norm": 1.5165479183197021, - "learning_rate": 4.281779948041772e-06, - "loss": 0.44768425822257996, - "mean_token_accuracy": 0.8394696712493896, - "num_tokens": 10088168.0, - "step": 1127 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 1.5448920726776123, - "learning_rate": 4.280310195277606e-06, - "loss": 0.4458175003528595, - "mean_token_accuracy": 0.835773229598999, - "num_tokens": 10100306.0, - "step": 1128 - }, - { - "epoch": 0.85790273556231, - "grad_norm": 1.6311609745025635, - "learning_rate": 4.278839193023214e-06, - "loss": 0.4158072769641876, - "mean_token_accuracy": 0.8482539653778076, - "num_tokens": 10110581.0, - "step": 1129 - }, - { - "epoch": 0.8586626139817629, - "grad_norm": 1.6714754104614258, - "learning_rate": 4.277366942311001e-06, - "loss": 0.3686875104904175, - "mean_token_accuracy": 0.8681533336639404, - "num_tokens": 10118799.0, - "step": 1130 - }, - { - "epoch": 0.8594224924012158, - "grad_norm": 2.1604413986206055, - "learning_rate": 4.2758934441742494e-06, - "loss": 0.37267982959747314, - "mean_token_accuracy": 0.8520427346229553, - "num_tokens": 10124734.0, - "step": 1131 - }, - { - "epoch": 0.8601823708206687, - "grad_norm": 2.123013973236084, - "learning_rate": 4.274418699647117e-06, - "loss": 0.49963313341140747, - "mean_token_accuracy": 0.8248758912086487, - "num_tokens": 10131965.0, - "step": 1132 - }, - { - "epoch": 0.8609422492401215, - "grad_norm": 1.4308786392211914, - "learning_rate": 4.272942709764638e-06, - "loss": 0.48666873574256897, - "mean_token_accuracy": 0.8304717540740967, - "num_tokens": 10145164.0, - "step": 1133 - }, - { - "epoch": 0.8617021276595744, - "grad_norm": 1.7952618598937988, - "learning_rate": 4.271465475562716e-06, - "loss": 0.5536223649978638, - "mean_token_accuracy": 0.8093959093093872, - "num_tokens": 10154083.0, - "step": 1134 - }, - { - "epoch": 0.8624620060790273, - "grad_norm": 2.0622456073760986, - "learning_rate": 4.269986998078132e-06, - "loss": 0.5173629522323608, - "mean_token_accuracy": 0.8285619020462036, - "num_tokens": 10161889.0, - "step": 1135 - }, - { - "epoch": 0.8632218844984803, - "grad_norm": 2.0707509517669678, - "learning_rate": 4.268507278348539e-06, - "loss": 0.5871608257293701, - "mean_token_accuracy": 0.7827386856079102, - "num_tokens": 10170726.0, - "step": 1136 - }, - { - "epoch": 0.8639817629179332, - "grad_norm": 2.054368257522583, - "learning_rate": 4.2670263174124615e-06, - "loss": 0.5788969993591309, - "mean_token_accuracy": 0.7967237234115601, - "num_tokens": 10178474.0, - "step": 1137 - }, - { - "epoch": 0.8647416413373861, - "grad_norm": 1.901846170425415, - "learning_rate": 4.265544116309294e-06, - "loss": 0.5405587553977966, - "mean_token_accuracy": 0.8151819705963135, - "num_tokens": 10187013.0, - "step": 1138 - }, - { - "epoch": 0.8655015197568389, - "grad_norm": 2.901285409927368, - "learning_rate": 4.264060676079302e-06, - "loss": 0.44101861119270325, - "mean_token_accuracy": 0.8433429002761841, - "num_tokens": 10191517.0, - "step": 1139 - }, - { - "epoch": 0.8662613981762918, - "grad_norm": 2.4168388843536377, - "learning_rate": 4.262575997763622e-06, - "loss": 0.4686204195022583, - "mean_token_accuracy": 0.8505309820175171, - "num_tokens": 10196948.0, - "step": 1140 - }, - { - "epoch": 0.8670212765957447, - "grad_norm": 1.9588396549224854, - "learning_rate": 4.2610900824042575e-06, - "loss": 0.47056013345718384, - "mean_token_accuracy": 0.8280024528503418, - "num_tokens": 10204292.0, - "step": 1141 - }, - { - "epoch": 0.8677811550151976, - "grad_norm": 2.569150924682617, - "learning_rate": 4.2596029310440826e-06, - "loss": 0.573108434677124, - "mean_token_accuracy": 0.8108246326446533, - "num_tokens": 10209571.0, - "step": 1142 - }, - { - "epoch": 0.8685410334346505, - "grad_norm": 2.038032293319702, - "learning_rate": 4.258114544726835e-06, - "loss": 0.40545332431793213, - "mean_token_accuracy": 0.8611703515052795, - "num_tokens": 10215716.0, - "step": 1143 - }, - { - "epoch": 0.8693009118541033, - "grad_norm": 1.9884231090545654, - "learning_rate": 4.256624924497124e-06, - "loss": 0.40085992217063904, - "mean_token_accuracy": 0.8615031242370605, - "num_tokens": 10222775.0, - "step": 1144 - }, - { - "epoch": 0.8700607902735562, - "grad_norm": 1.912842035293579, - "learning_rate": 4.25513407140042e-06, - "loss": 0.41022324562072754, - "mean_token_accuracy": 0.8459607362747192, - "num_tokens": 10229589.0, - "step": 1145 - }, - { - "epoch": 0.8708206686930091, - "grad_norm": 1.9190576076507568, - "learning_rate": 4.253641986483063e-06, - "loss": 0.5541447401046753, - "mean_token_accuracy": 0.8256468772888184, - "num_tokens": 10240633.0, - "step": 1146 - }, - { - "epoch": 0.871580547112462, - "grad_norm": 1.3742294311523438, - "learning_rate": 4.2521486707922545e-06, - "loss": 0.3680543899536133, - "mean_token_accuracy": 0.8654477596282959, - "num_tokens": 10251252.0, - "step": 1147 - }, - { - "epoch": 0.8723404255319149, - "grad_norm": 1.4438525438308716, - "learning_rate": 4.250654125376062e-06, - "loss": 0.45830875635147095, - "mean_token_accuracy": 0.8433834314346313, - "num_tokens": 10263980.0, - "step": 1148 - }, - { - "epoch": 0.8731003039513677, - "grad_norm": 2.1273653507232666, - "learning_rate": 4.249158351283414e-06, - "loss": 0.4129376709461212, - "mean_token_accuracy": 0.861556351184845, - "num_tokens": 10270426.0, - "step": 1149 - }, - { - "epoch": 0.8738601823708206, - "grad_norm": 2.598440647125244, - "learning_rate": 4.247661349564103e-06, - "loss": 0.418030709028244, - "mean_token_accuracy": 0.86553955078125, - "num_tokens": 10275493.0, - "step": 1150 - }, - { - "epoch": 0.8746200607902735, - "grad_norm": 1.6852490901947021, - "learning_rate": 4.246163121268782e-06, - "loss": 0.6403408050537109, - "mean_token_accuracy": 0.7966094017028809, - "num_tokens": 10287989.0, - "step": 1151 - }, - { - "epoch": 0.8753799392097265, - "grad_norm": 2.5013794898986816, - "learning_rate": 4.244663667448965e-06, - "loss": 0.49922505021095276, - "mean_token_accuracy": 0.8318735361099243, - "num_tokens": 10293360.0, - "step": 1152 - }, - { - "epoch": 0.8761398176291794, - "grad_norm": 1.2022709846496582, - "learning_rate": 4.243162989157027e-06, - "loss": 0.4414965510368347, - "mean_token_accuracy": 0.8338693380355835, - "num_tokens": 10310558.0, - "step": 1153 - }, - { - "epoch": 0.8768996960486323, - "grad_norm": 1.9903281927108765, - "learning_rate": 4.241661087446202e-06, - "loss": 0.4277610778808594, - "mean_token_accuracy": 0.8560749292373657, - "num_tokens": 10316983.0, - "step": 1154 - }, - { - "epoch": 0.8776595744680851, - "grad_norm": 2.104923725128174, - "learning_rate": 4.240157963370583e-06, - "loss": 0.44431713223457336, - "mean_token_accuracy": 0.8785282969474792, - "num_tokens": 10323294.0, - "step": 1155 - }, - { - "epoch": 0.878419452887538, - "grad_norm": 2.8364813327789307, - "learning_rate": 4.2386536179851175e-06, - "loss": 0.49948397278785706, - "mean_token_accuracy": 0.8305255174636841, - "num_tokens": 10327662.0, - "step": 1156 - }, - { - "epoch": 0.8791793313069909, - "grad_norm": 1.9493682384490967, - "learning_rate": 4.2371480523456156e-06, - "loss": 0.45867404341697693, - "mean_token_accuracy": 0.8373264074325562, - "num_tokens": 10335699.0, - "step": 1157 - }, - { - "epoch": 0.8799392097264438, - "grad_norm": 2.268616199493408, - "learning_rate": 4.235641267508741e-06, - "loss": 0.4547857940196991, - "mean_token_accuracy": 0.8252766132354736, - "num_tokens": 10342464.0, - "step": 1158 - }, - { - "epoch": 0.8806990881458967, - "grad_norm": 2.1334283351898193, - "learning_rate": 4.234133264532012e-06, - "loss": 0.39503124356269836, - "mean_token_accuracy": 0.8648351430892944, - "num_tokens": 10347514.0, - "step": 1159 - }, - { - "epoch": 0.8814589665653495, - "grad_norm": 1.2775357961654663, - "learning_rate": 4.232624044473805e-06, - "loss": 0.39945733547210693, - "mean_token_accuracy": 0.8369829654693604, - "num_tokens": 10363316.0, - "step": 1160 - }, - { - "epoch": 0.8822188449848024, - "grad_norm": 2.458413600921631, - "learning_rate": 4.231113608393348e-06, - "loss": 0.5020045638084412, - "mean_token_accuracy": 0.8295938968658447, - "num_tokens": 10368401.0, - "step": 1161 - }, - { - "epoch": 0.8829787234042553, - "grad_norm": 1.7464948892593384, - "learning_rate": 4.229601957350722e-06, - "loss": 0.5335392951965332, - "mean_token_accuracy": 0.8134858012199402, - "num_tokens": 10378337.0, - "step": 1162 - }, - { - "epoch": 0.8837386018237082, - "grad_norm": 3.1152119636535645, - "learning_rate": 4.228089092406863e-06, - "loss": 0.4811682105064392, - "mean_token_accuracy": 0.8460187315940857, - "num_tokens": 10382362.0, - "step": 1163 - }, - { - "epoch": 0.8844984802431611, - "grad_norm": 2.190847158432007, - "learning_rate": 4.226575014623557e-06, - "loss": 0.4428049921989441, - "mean_token_accuracy": 0.8382467031478882, - "num_tokens": 10388211.0, - "step": 1164 - }, - { - "epoch": 0.8852583586626139, - "grad_norm": 1.860153079032898, - "learning_rate": 4.225059725063444e-06, - "loss": 0.5265918970108032, - "mean_token_accuracy": 0.8181334733963013, - "num_tokens": 10398873.0, - "step": 1165 - }, - { - "epoch": 0.8860182370820668, - "grad_norm": 1.3372713327407837, - "learning_rate": 4.22354322479001e-06, - "loss": 0.43202850222587585, - "mean_token_accuracy": 0.8432420492172241, - "num_tokens": 10413158.0, - "step": 1166 - }, - { - "epoch": 0.8867781155015197, - "grad_norm": 1.3653379678726196, - "learning_rate": 4.222025514867596e-06, - "loss": 0.43780991435050964, - "mean_token_accuracy": 0.8441485166549683, - "num_tokens": 10428137.0, - "step": 1167 - }, - { - "epoch": 0.8875379939209727, - "grad_norm": 3.0230672359466553, - "learning_rate": 4.220506596361387e-06, - "loss": 0.6039337515830994, - "mean_token_accuracy": 0.8274872303009033, - "num_tokens": 10432586.0, - "step": 1168 - }, - { - "epoch": 0.8882978723404256, - "grad_norm": 2.2180392742156982, - "learning_rate": 4.218986470337419e-06, - "loss": 0.5453792810440063, - "mean_token_accuracy": 0.8127184510231018, - "num_tokens": 10439471.0, - "step": 1169 - }, - { - "epoch": 0.8890577507598785, - "grad_norm": 1.8519103527069092, - "learning_rate": 4.217465137862575e-06, - "loss": 0.5145469903945923, - "mean_token_accuracy": 0.8178654909133911, - "num_tokens": 10450471.0, - "step": 1170 - }, - { - "epoch": 0.8898176291793313, - "grad_norm": 2.034008026123047, - "learning_rate": 4.215942600004586e-06, - "loss": 0.44061461091041565, - "mean_token_accuracy": 0.8572084307670593, - "num_tokens": 10457382.0, - "step": 1171 - }, - { - "epoch": 0.8905775075987842, - "grad_norm": 3.4304304122924805, - "learning_rate": 4.214418857832025e-06, - "loss": 0.44397830963134766, - "mean_token_accuracy": 0.842149019241333, - "num_tokens": 10460650.0, - "step": 1172 - }, - { - "epoch": 0.8913373860182371, - "grad_norm": 1.9021750688552856, - "learning_rate": 4.212893912414316e-06, - "loss": 0.3769867420196533, - "mean_token_accuracy": 0.8806171417236328, - "num_tokens": 10468214.0, - "step": 1173 - }, - { - "epoch": 0.89209726443769, - "grad_norm": 1.9704062938690186, - "learning_rate": 4.211367764821722e-06, - "loss": 0.5501819849014282, - "mean_token_accuracy": 0.8176811337471008, - "num_tokens": 10476739.0, - "step": 1174 - }, - { - "epoch": 0.8928571428571429, - "grad_norm": 1.4350415468215942, - "learning_rate": 4.209840416125353e-06, - "loss": 0.41897401213645935, - "mean_token_accuracy": 0.8498011827468872, - "num_tokens": 10491769.0, - "step": 1175 - }, - { - "epoch": 0.8936170212765957, - "grad_norm": 3.8237783908843994, - "learning_rate": 4.208311867397162e-06, - "loss": 0.5296977162361145, - "mean_token_accuracy": 0.8168715834617615, - "num_tokens": 10494958.0, - "step": 1176 - }, - { - "epoch": 0.8943768996960486, - "grad_norm": 2.04784893989563, - "learning_rate": 4.206782119709942e-06, - "loss": 0.476105272769928, - "mean_token_accuracy": 0.834011435508728, - "num_tokens": 10502077.0, - "step": 1177 - }, - { - "epoch": 0.8951367781155015, - "grad_norm": 1.8839610815048218, - "learning_rate": 4.205251174137329e-06, - "loss": 0.49628815054893494, - "mean_token_accuracy": 0.8212119936943054, - "num_tokens": 10510077.0, - "step": 1178 - }, - { - "epoch": 0.8958966565349544, - "grad_norm": 1.2100634574890137, - "learning_rate": 4.2037190317538e-06, - "loss": 0.4931519329547882, - "mean_token_accuracy": 0.8170043230056763, - "num_tokens": 10528373.0, - "step": 1179 - }, - { - "epoch": 0.8966565349544073, - "grad_norm": 1.884637713432312, - "learning_rate": 4.202185693634671e-06, - "loss": 0.4913347363471985, - "mean_token_accuracy": 0.8234949707984924, - "num_tokens": 10537108.0, - "step": 1180 - }, - { - "epoch": 0.8974164133738601, - "grad_norm": 1.5062434673309326, - "learning_rate": 4.200651160856099e-06, - "loss": 0.4160492420196533, - "mean_token_accuracy": 0.845937192440033, - "num_tokens": 10547577.0, - "step": 1181 - }, - { - "epoch": 0.898176291793313, - "grad_norm": 2.331169605255127, - "learning_rate": 4.1991154344950755e-06, - "loss": 0.6532632112503052, - "mean_token_accuracy": 0.7743191123008728, - "num_tokens": 10556328.0, - "step": 1182 - }, - { - "epoch": 0.898936170212766, - "grad_norm": 1.3538362979888916, - "learning_rate": 4.197578515629435e-06, - "loss": 0.4437566101551056, - "mean_token_accuracy": 0.8427901268005371, - "num_tokens": 10570026.0, - "step": 1183 - }, - { - "epoch": 0.8996960486322189, - "grad_norm": 2.3828957080841064, - "learning_rate": 4.196040405337846e-06, - "loss": 0.6185290217399597, - "mean_token_accuracy": 0.7969824075698853, - "num_tokens": 10576465.0, - "step": 1184 - }, - { - "epoch": 0.9004559270516718, - "grad_norm": 2.4759042263031006, - "learning_rate": 4.194501104699813e-06, - "loss": 0.46489226818084717, - "mean_token_accuracy": 0.8472316265106201, - "num_tokens": 10582034.0, - "step": 1185 - }, - { - "epoch": 0.9012158054711246, - "grad_norm": 1.9215164184570312, - "learning_rate": 4.192960614795676e-06, - "loss": 0.48001551628112793, - "mean_token_accuracy": 0.8371596336364746, - "num_tokens": 10590556.0, - "step": 1186 - }, - { - "epoch": 0.9019756838905775, - "grad_norm": 2.2717080116271973, - "learning_rate": 4.19141893670661e-06, - "loss": 0.40083563327789307, - "mean_token_accuracy": 0.8464195728302002, - "num_tokens": 10595661.0, - "step": 1187 - }, - { - "epoch": 0.9027355623100304, - "grad_norm": 2.187122344970703, - "learning_rate": 4.189876071514624e-06, - "loss": 0.4942901134490967, - "mean_token_accuracy": 0.8186990022659302, - "num_tokens": 10603366.0, - "step": 1188 - }, - { - "epoch": 0.9034954407294833, - "grad_norm": 1.542414665222168, - "learning_rate": 4.188332020302561e-06, - "loss": 0.4731982946395874, - "mean_token_accuracy": 0.8487229347229004, - "num_tokens": 10616203.0, - "step": 1189 - }, - { - "epoch": 0.9042553191489362, - "grad_norm": 0.9957579970359802, - "learning_rate": 4.186786784154096e-06, - "loss": 0.33211836218833923, - "mean_token_accuracy": 0.870644748210907, - "num_tokens": 10633294.0, - "step": 1190 - }, - { - "epoch": 0.9050151975683891, - "grad_norm": 2.593867540359497, - "learning_rate": 4.1852403641537344e-06, - "loss": 0.6825464963912964, - "mean_token_accuracy": 0.7716869115829468, - "num_tokens": 10640615.0, - "step": 1191 - }, - { - "epoch": 0.9057750759878419, - "grad_norm": 2.0424516201019287, - "learning_rate": 4.183692761386813e-06, - "loss": 0.5672709941864014, - "mean_token_accuracy": 0.7973801493644714, - "num_tokens": 10649845.0, - "step": 1192 - }, - { - "epoch": 0.9065349544072948, - "grad_norm": 1.429018259048462, - "learning_rate": 4.1821439769395e-06, - "loss": 0.5427846908569336, - "mean_token_accuracy": 0.8200292587280273, - "num_tokens": 10665898.0, - "step": 1193 - }, - { - "epoch": 0.9072948328267477, - "grad_norm": 1.9764264822006226, - "learning_rate": 4.180594011898791e-06, - "loss": 0.4784567356109619, - "mean_token_accuracy": 0.82924485206604, - "num_tokens": 10673595.0, - "step": 1194 - }, - { - "epoch": 0.9080547112462006, - "grad_norm": 1.4004309177398682, - "learning_rate": 4.1790428673525104e-06, - "loss": 0.4791432023048401, - "mean_token_accuracy": 0.8334879875183105, - "num_tokens": 10687892.0, - "step": 1195 - }, - { - "epoch": 0.9088145896656535, - "grad_norm": 2.2207727432250977, - "learning_rate": 4.177490544389313e-06, - "loss": 0.5089365243911743, - "mean_token_accuracy": 0.8270776271820068, - "num_tokens": 10694911.0, - "step": 1196 - }, - { - "epoch": 0.9095744680851063, - "grad_norm": 2.2890450954437256, - "learning_rate": 4.175937044098678e-06, - "loss": 0.5152267813682556, - "mean_token_accuracy": 0.8527299165725708, - "num_tokens": 10700512.0, - "step": 1197 - }, - { - "epoch": 0.9103343465045592, - "grad_norm": 1.7938050031661987, - "learning_rate": 4.1743823675709115e-06, - "loss": 0.3507300615310669, - "mean_token_accuracy": 0.8694599866867065, - "num_tokens": 10707953.0, - "step": 1198 - }, - { - "epoch": 0.9110942249240122, - "grad_norm": 1.4368808269500732, - "learning_rate": 4.172826515897146e-06, - "loss": 0.407418429851532, - "mean_token_accuracy": 0.8432893753051758, - "num_tokens": 10717485.0, - "step": 1199 - }, - { - "epoch": 0.9118541033434651, - "grad_norm": 1.735339879989624, - "learning_rate": 4.171269490169337e-06, - "loss": 0.46996885538101196, - "mean_token_accuracy": 0.8331948518753052, - "num_tokens": 10726160.0, - "step": 1200 - }, - { - "epoch": 0.912613981762918, - "grad_norm": 1.7859221696853638, - "learning_rate": 4.1697112914802665e-06, - "loss": 0.5325199365615845, - "mean_token_accuracy": 0.8179605007171631, - "num_tokens": 10736284.0, - "step": 1201 - }, - { - "epoch": 0.9133738601823708, - "grad_norm": 2.6394896507263184, - "learning_rate": 4.168151920923536e-06, - "loss": 0.4039744734764099, - "mean_token_accuracy": 0.8545527458190918, - "num_tokens": 10740673.0, - "step": 1202 - }, - { - "epoch": 0.9141337386018237, - "grad_norm": 1.910988211631775, - "learning_rate": 4.1665913795935755e-06, - "loss": 0.5190291404724121, - "mean_token_accuracy": 0.8203921318054199, - "num_tokens": 10751946.0, - "step": 1203 - }, - { - "epoch": 0.9148936170212766, - "grad_norm": 3.0006964206695557, - "learning_rate": 4.16502966858563e-06, - "loss": 0.5856777429580688, - "mean_token_accuracy": 0.8061224222183228, - "num_tokens": 10756795.0, - "step": 1204 - }, - { - "epoch": 0.9156534954407295, - "grad_norm": 1.7396167516708374, - "learning_rate": 4.163466788995768e-06, - "loss": 0.54935222864151, - "mean_token_accuracy": 0.8052443265914917, - "num_tokens": 10767202.0, - "step": 1205 - }, - { - "epoch": 0.9164133738601824, - "grad_norm": 2.143735885620117, - "learning_rate": 4.161902741920881e-06, - "loss": 0.5020298361778259, - "mean_token_accuracy": 0.8249630928039551, - "num_tokens": 10774329.0, - "step": 1206 - }, - { - "epoch": 0.9171732522796353, - "grad_norm": 2.8871893882751465, - "learning_rate": 4.160337528458676e-06, - "loss": 0.5154489278793335, - "mean_token_accuracy": 0.8276848793029785, - "num_tokens": 10778929.0, - "step": 1207 - }, - { - "epoch": 0.9179331306990881, - "grad_norm": 1.4642788171768188, - "learning_rate": 4.15877114970768e-06, - "loss": 0.5033774375915527, - "mean_token_accuracy": 0.8296241164207458, - "num_tokens": 10790928.0, - "step": 1208 - }, - { - "epoch": 0.918693009118541, - "grad_norm": 1.8313497304916382, - "learning_rate": 4.1572036067672386e-06, - "loss": 0.5674909353256226, - "mean_token_accuracy": 0.7975562214851379, - "num_tokens": 10801372.0, - "step": 1209 - }, - { - "epoch": 0.9194528875379939, - "grad_norm": 2.005958080291748, - "learning_rate": 4.155634900737513e-06, - "loss": 0.5557019114494324, - "mean_token_accuracy": 0.8141391277313232, - "num_tokens": 10809150.0, - "step": 1210 - }, - { - "epoch": 0.9202127659574468, - "grad_norm": 2.333519697189331, - "learning_rate": 4.154065032719482e-06, - "loss": 0.6990420818328857, - "mean_token_accuracy": 0.7565394043922424, - "num_tokens": 10816612.0, - "step": 1211 - }, - { - "epoch": 0.9209726443768997, - "grad_norm": 1.4472655057907104, - "learning_rate": 4.152494003814939e-06, - "loss": 0.541398286819458, - "mean_token_accuracy": 0.8027358055114746, - "num_tokens": 10833840.0, - "step": 1212 - }, - { - "epoch": 0.9217325227963525, - "grad_norm": 1.6183619499206543, - "learning_rate": 4.150921815126493e-06, - "loss": 0.6096762418746948, - "mean_token_accuracy": 0.7994354963302612, - "num_tokens": 10846367.0, - "step": 1213 - }, - { - "epoch": 0.9224924012158054, - "grad_norm": 2.614919900894165, - "learning_rate": 4.149348467757566e-06, - "loss": 0.41846764087677, - "mean_token_accuracy": 0.8555068969726562, - "num_tokens": 10850836.0, - "step": 1214 - }, - { - "epoch": 0.9232522796352584, - "grad_norm": 1.4419831037521362, - "learning_rate": 4.147773962812393e-06, - "loss": 0.4139535427093506, - "mean_token_accuracy": 0.845671534538269, - "num_tokens": 10864228.0, - "step": 1215 - }, - { - "epoch": 0.9240121580547113, - "grad_norm": 2.3868865966796875, - "learning_rate": 4.146198301396025e-06, - "loss": 0.3357275128364563, - "mean_token_accuracy": 0.8829520344734192, - "num_tokens": 10868920.0, - "step": 1216 - }, - { - "epoch": 0.9247720364741642, - "grad_norm": 1.7685474157333374, - "learning_rate": 4.14462148461432e-06, - "loss": 0.45333072543144226, - "mean_token_accuracy": 0.8505891561508179, - "num_tokens": 10877286.0, - "step": 1217 - }, - { - "epoch": 0.925531914893617, - "grad_norm": 1.7627625465393066, - "learning_rate": 4.143043513573949e-06, - "loss": 0.5028705596923828, - "mean_token_accuracy": 0.825471043586731, - "num_tokens": 10887047.0, - "step": 1218 - }, - { - "epoch": 0.9262917933130699, - "grad_norm": 1.3168725967407227, - "learning_rate": 4.141464389382392e-06, - "loss": 0.5494637489318848, - "mean_token_accuracy": 0.8121747970581055, - "num_tokens": 10903599.0, - "step": 1219 - }, - { - "epoch": 0.9270516717325228, - "grad_norm": 2.5180399417877197, - "learning_rate": 4.13988411314794e-06, - "loss": 0.6134277582168579, - "mean_token_accuracy": 0.7983006834983826, - "num_tokens": 10909791.0, - "step": 1220 - }, - { - "epoch": 0.9278115501519757, - "grad_norm": 1.1889166831970215, - "learning_rate": 4.13830268597969e-06, - "loss": 0.36713096499443054, - "mean_token_accuracy": 0.8416121006011963, - "num_tokens": 10925794.0, - "step": 1221 - }, - { - "epoch": 0.9285714285714286, - "grad_norm": 2.142422676086426, - "learning_rate": 4.136720108987552e-06, - "loss": 0.4427933096885681, - "mean_token_accuracy": 0.8427745699882507, - "num_tokens": 10931622.0, - "step": 1222 - }, - { - "epoch": 0.9293313069908815, - "grad_norm": 1.908564567565918, - "learning_rate": 4.1351363832822364e-06, - "loss": 0.5088109374046326, - "mean_token_accuracy": 0.8309272527694702, - "num_tokens": 10940843.0, - "step": 1223 - }, - { - "epoch": 0.9300911854103343, - "grad_norm": 1.2862322330474854, - "learning_rate": 4.133551509975264e-06, - "loss": 0.3963761329650879, - "mean_token_accuracy": 0.8602159023284912, - "num_tokens": 10954481.0, - "step": 1224 - }, - { - "epoch": 0.9308510638297872, - "grad_norm": 1.5876200199127197, - "learning_rate": 4.13196549017896e-06, - "loss": 0.4311184287071228, - "mean_token_accuracy": 0.8460899591445923, - "num_tokens": 10963501.0, - "step": 1225 - }, - { - "epoch": 0.9316109422492401, - "grad_norm": 2.459878444671631, - "learning_rate": 4.130378325006453e-06, - "loss": 0.5016295313835144, - "mean_token_accuracy": 0.8125218152999878, - "num_tokens": 10968850.0, - "step": 1226 - }, - { - "epoch": 0.932370820668693, - "grad_norm": 2.059718370437622, - "learning_rate": 4.128790015571679e-06, - "loss": 0.48982277512550354, - "mean_token_accuracy": 0.8327049016952515, - "num_tokens": 10976642.0, - "step": 1227 - }, - { - "epoch": 0.9331306990881459, - "grad_norm": 1.3719185590744019, - "learning_rate": 4.127200562989372e-06, - "loss": 0.38778752088546753, - "mean_token_accuracy": 0.8623501062393188, - "num_tokens": 10988703.0, - "step": 1228 - }, - { - "epoch": 0.9338905775075987, - "grad_norm": 1.302140712738037, - "learning_rate": 4.125609968375073e-06, - "loss": 0.4887842535972595, - "mean_token_accuracy": 0.8322232961654663, - "num_tokens": 11005981.0, - "step": 1229 - }, - { - "epoch": 0.9346504559270516, - "grad_norm": 1.819624423980713, - "learning_rate": 4.12401823284512e-06, - "loss": 0.49825209379196167, - "mean_token_accuracy": 0.8278916478157043, - "num_tokens": 11014145.0, - "step": 1230 - }, - { - "epoch": 0.9354103343465046, - "grad_norm": 1.2762807607650757, - "learning_rate": 4.122425357516658e-06, - "loss": 0.433994323015213, - "mean_token_accuracy": 0.853028416633606, - "num_tokens": 11029232.0, - "step": 1231 - }, - { - "epoch": 0.9361702127659575, - "grad_norm": 2.2171671390533447, - "learning_rate": 4.1208313435076255e-06, - "loss": 0.38436949253082275, - "mean_token_accuracy": 0.8616260290145874, - "num_tokens": 11034743.0, - "step": 1232 - }, - { - "epoch": 0.9369300911854104, - "grad_norm": 1.355879545211792, - "learning_rate": 4.119236191936764e-06, - "loss": 0.5378084182739258, - "mean_token_accuracy": 0.8256701231002808, - "num_tokens": 11048149.0, - "step": 1233 - }, - { - "epoch": 0.9376899696048632, - "grad_norm": 2.66812801361084, - "learning_rate": 4.117639903923611e-06, - "loss": 0.5236451625823975, - "mean_token_accuracy": 0.8431973457336426, - "num_tokens": 11052295.0, - "step": 1234 - }, - { - "epoch": 0.9384498480243161, - "grad_norm": 1.5740545988082886, - "learning_rate": 4.116042480588505e-06, - "loss": 0.44322824478149414, - "mean_token_accuracy": 0.8436908721923828, - "num_tokens": 11062066.0, - "step": 1235 - }, - { - "epoch": 0.939209726443769, - "grad_norm": 1.230706810951233, - "learning_rate": 4.114443923052577e-06, - "loss": 0.3325323462486267, - "mean_token_accuracy": 0.8674666881561279, - "num_tokens": 11074300.0, - "step": 1236 - }, - { - "epoch": 0.9399696048632219, - "grad_norm": 1.9870070219039917, - "learning_rate": 4.112844232437757e-06, - "loss": 0.5711548328399658, - "mean_token_accuracy": 0.8081738948822021, - "num_tokens": 11082297.0, - "step": 1237 - }, - { - "epoch": 0.9407294832826748, - "grad_norm": 1.3020970821380615, - "learning_rate": 4.11124340986677e-06, - "loss": 0.4187922477722168, - "mean_token_accuracy": 0.8566171526908875, - "num_tokens": 11096810.0, - "step": 1238 - }, - { - "epoch": 0.9414893617021277, - "grad_norm": 2.1399197578430176, - "learning_rate": 4.109641456463135e-06, - "loss": 0.5293116569519043, - "mean_token_accuracy": 0.8176157474517822, - "num_tokens": 11102761.0, - "step": 1239 - }, - { - "epoch": 0.9422492401215805, - "grad_norm": 1.3503763675689697, - "learning_rate": 4.108038373351163e-06, - "loss": 0.4907652735710144, - "mean_token_accuracy": 0.8204987049102783, - "num_tokens": 11118480.0, - "step": 1240 - }, - { - "epoch": 0.9430091185410334, - "grad_norm": 1.9571399688720703, - "learning_rate": 4.106434161655962e-06, - "loss": 0.4709656536579132, - "mean_token_accuracy": 0.8371885418891907, - "num_tokens": 11126265.0, - "step": 1241 - }, - { - "epoch": 0.9437689969604863, - "grad_norm": 2.1277313232421875, - "learning_rate": 4.104828822503427e-06, - "loss": 0.4010283350944519, - "mean_token_accuracy": 0.8586333990097046, - "num_tokens": 11133022.0, - "step": 1242 - }, - { - "epoch": 0.9445288753799392, - "grad_norm": 1.6745036840438843, - "learning_rate": 4.103222357020248e-06, - "loss": 0.562545657157898, - "mean_token_accuracy": 0.8052060604095459, - "num_tokens": 11145255.0, - "step": 1243 - }, - { - "epoch": 0.9452887537993921, - "grad_norm": 2.3616299629211426, - "learning_rate": 4.101614766333904e-06, - "loss": 0.5878340601921082, - "mean_token_accuracy": 0.796745777130127, - "num_tokens": 11152020.0, - "step": 1244 - }, - { - "epoch": 0.9460486322188449, - "grad_norm": 1.6182078123092651, - "learning_rate": 4.100006051572664e-06, - "loss": 0.5357589721679688, - "mean_token_accuracy": 0.8089962005615234, - "num_tokens": 11163112.0, - "step": 1245 - }, - { - "epoch": 0.9468085106382979, - "grad_norm": 1.911770224571228, - "learning_rate": 4.098396213865587e-06, - "loss": 0.49805426597595215, - "mean_token_accuracy": 0.8289647102355957, - "num_tokens": 11171768.0, - "step": 1246 - }, - { - "epoch": 0.9475683890577508, - "grad_norm": 1.649155616760254, - "learning_rate": 4.096785254342518e-06, - "loss": 0.5756166577339172, - "mean_token_accuracy": 0.807680606842041, - "num_tokens": 11183527.0, - "step": 1247 - }, - { - "epoch": 0.9483282674772037, - "grad_norm": 1.8922761678695679, - "learning_rate": 4.095173174134091e-06, - "loss": 0.44688963890075684, - "mean_token_accuracy": 0.8375608921051025, - "num_tokens": 11191494.0, - "step": 1248 - }, - { - "epoch": 0.9490881458966566, - "grad_norm": 2.9044547080993652, - "learning_rate": 4.093559974371725e-06, - "loss": 0.48609739542007446, - "mean_token_accuracy": 0.8404892086982727, - "num_tokens": 11195837.0, - "step": 1249 - }, - { - "epoch": 0.9498480243161094, - "grad_norm": 2.287506580352783, - "learning_rate": 4.091945656187626e-06, - "loss": 0.5260225534439087, - "mean_token_accuracy": 0.8181945085525513, - "num_tokens": 11202174.0, - "step": 1250 - }, - { - "epoch": 0.9506079027355623, - "grad_norm": 1.7908886671066284, - "learning_rate": 4.090330220714785e-06, - "loss": 0.4207724928855896, - "mean_token_accuracy": 0.8616912364959717, - "num_tokens": 11209995.0, - "step": 1251 - }, - { - "epoch": 0.9513677811550152, - "grad_norm": 2.905418634414673, - "learning_rate": 4.0887136690869774e-06, - "loss": 0.4209241271018982, - "mean_token_accuracy": 0.8561323285102844, - "num_tokens": 11213799.0, - "step": 1252 - }, - { - "epoch": 0.9521276595744681, - "grad_norm": 2.814150333404541, - "learning_rate": 4.08709600243876e-06, - "loss": 0.36855608224868774, - "mean_token_accuracy": 0.8764539361000061, - "num_tokens": 11217643.0, - "step": 1253 - }, - { - "epoch": 0.952887537993921, - "grad_norm": 1.9385707378387451, - "learning_rate": 4.0854772219054735e-06, - "loss": 0.531031608581543, - "mean_token_accuracy": 0.80600905418396, - "num_tokens": 11225871.0, - "step": 1254 - }, - { - "epoch": 0.9536474164133738, - "grad_norm": 2.103058099746704, - "learning_rate": 4.083857328623243e-06, - "loss": 0.4576364755630493, - "mean_token_accuracy": 0.8447524905204773, - "num_tokens": 11231829.0, - "step": 1255 - }, - { - "epoch": 0.9544072948328267, - "grad_norm": 1.7518818378448486, - "learning_rate": 4.082236323728969e-06, - "loss": 0.5386767983436584, - "mean_token_accuracy": 0.8055596351623535, - "num_tokens": 11240977.0, - "step": 1256 - }, - { - "epoch": 0.9551671732522796, - "grad_norm": 1.8434966802597046, - "learning_rate": 4.0806142083603365e-06, - "loss": 0.5415925979614258, - "mean_token_accuracy": 0.809962272644043, - "num_tokens": 11249616.0, - "step": 1257 - }, - { - "epoch": 0.9559270516717325, - "grad_norm": 1.7341015338897705, - "learning_rate": 4.078990983655807e-06, - "loss": 0.4621101915836334, - "mean_token_accuracy": 0.8330386877059937, - "num_tokens": 11258616.0, - "step": 1258 - }, - { - "epoch": 0.9566869300911854, - "grad_norm": 1.8589727878570557, - "learning_rate": 4.077366650754624e-06, - "loss": 0.4031238555908203, - "mean_token_accuracy": 0.842434287071228, - "num_tokens": 11266006.0, - "step": 1259 - }, - { - "epoch": 0.9574468085106383, - "grad_norm": 1.657175898551941, - "learning_rate": 4.075741210796806e-06, - "loss": 0.41686388850212097, - "mean_token_accuracy": 0.8443650007247925, - "num_tokens": 11275601.0, - "step": 1260 - }, - { - "epoch": 0.9582066869300911, - "grad_norm": 2.4303717613220215, - "learning_rate": 4.07411466492315e-06, - "loss": 0.4554435610771179, - "mean_token_accuracy": 0.853043794631958, - "num_tokens": 11280650.0, - "step": 1261 - }, - { - "epoch": 0.958966565349544, - "grad_norm": 2.3653745651245117, - "learning_rate": 4.072487014275228e-06, - "loss": 0.4304995536804199, - "mean_token_accuracy": 0.8462260961532593, - "num_tokens": 11285637.0, - "step": 1262 - }, - { - "epoch": 0.959726443768997, - "grad_norm": 1.6689718961715698, - "learning_rate": 4.070858259995388e-06, - "loss": 0.5290807485580444, - "mean_token_accuracy": 0.8176917433738708, - "num_tokens": 11299110.0, - "step": 1263 - }, - { - "epoch": 0.9604863221884499, - "grad_norm": 2.103879451751709, - "learning_rate": 4.069228403226751e-06, - "loss": 0.4620879888534546, - "mean_token_accuracy": 0.835270345211029, - "num_tokens": 11305564.0, - "step": 1264 - }, - { - "epoch": 0.9612462006079028, - "grad_norm": 2.139012575149536, - "learning_rate": 4.067597445113216e-06, - "loss": 0.5143396258354187, - "mean_token_accuracy": 0.8191739320755005, - "num_tokens": 11311870.0, - "step": 1265 - }, - { - "epoch": 0.9620060790273556, - "grad_norm": 1.3971210718154907, - "learning_rate": 4.06596538679945e-06, - "loss": 0.472080260515213, - "mean_token_accuracy": 0.8321092128753662, - "num_tokens": 11323970.0, - "step": 1266 - }, - { - "epoch": 0.9627659574468085, - "grad_norm": 1.4965174198150635, - "learning_rate": 4.064332229430895e-06, - "loss": 0.359701007604599, - "mean_token_accuracy": 0.8903120160102844, - "num_tokens": 11333412.0, - "step": 1267 - }, - { - "epoch": 0.9635258358662614, - "grad_norm": 1.1898726224899292, - "learning_rate": 4.062697974153764e-06, - "loss": 0.3423798084259033, - "mean_token_accuracy": 0.8661491870880127, - "num_tokens": 11347657.0, - "step": 1268 - }, - { - "epoch": 0.9642857142857143, - "grad_norm": 1.4952168464660645, - "learning_rate": 4.06106262211504e-06, - "loss": 0.4214417338371277, - "mean_token_accuracy": 0.8362159729003906, - "num_tokens": 11357786.0, - "step": 1269 - }, - { - "epoch": 0.9650455927051672, - "grad_norm": 1.7949583530426025, - "learning_rate": 4.059426174462476e-06, - "loss": 0.59087735414505, - "mean_token_accuracy": 0.7965556979179382, - "num_tokens": 11370561.0, - "step": 1270 - }, - { - "epoch": 0.96580547112462, - "grad_norm": 1.8973214626312256, - "learning_rate": 4.057788632344594e-06, - "loss": 0.47525322437286377, - "mean_token_accuracy": 0.8317365050315857, - "num_tokens": 11378507.0, - "step": 1271 - }, - { - "epoch": 0.9665653495440729, - "grad_norm": 1.8665250539779663, - "learning_rate": 4.056149996910683e-06, - "loss": 0.3537125587463379, - "mean_token_accuracy": 0.8921569585800171, - "num_tokens": 11385186.0, - "step": 1272 - }, - { - "epoch": 0.9673252279635258, - "grad_norm": 1.5072317123413086, - "learning_rate": 4.054510269310803e-06, - "loss": 0.5145624876022339, - "mean_token_accuracy": 0.8265488147735596, - "num_tokens": 11397125.0, - "step": 1273 - }, - { - "epoch": 0.9680851063829787, - "grad_norm": 1.520525574684143, - "learning_rate": 4.052869450695776e-06, - "loss": 0.44322293996810913, - "mean_token_accuracy": 0.8403642177581787, - "num_tokens": 11409919.0, - "step": 1274 - }, - { - "epoch": 0.9688449848024316, - "grad_norm": 1.3764475584030151, - "learning_rate": 4.051227542217192e-06, - "loss": 0.5774400234222412, - "mean_token_accuracy": 0.804118275642395, - "num_tokens": 11425900.0, - "step": 1275 - }, - { - "epoch": 0.9696048632218845, - "grad_norm": 1.3922648429870605, - "learning_rate": 4.049584545027406e-06, - "loss": 0.42727944254875183, - "mean_token_accuracy": 0.8654505014419556, - "num_tokens": 11438787.0, - "step": 1276 - }, - { - "epoch": 0.9703647416413373, - "grad_norm": 1.8505840301513672, - "learning_rate": 4.047940460279537e-06, - "loss": 0.490803062915802, - "mean_token_accuracy": 0.8340574502944946, - "num_tokens": 11447997.0, - "step": 1277 - }, - { - "epoch": 0.9711246200607903, - "grad_norm": 2.28271222114563, - "learning_rate": 4.046295289127466e-06, - "loss": 0.588828444480896, - "mean_token_accuracy": 0.833497166633606, - "num_tokens": 11454072.0, - "step": 1278 - }, - { - "epoch": 0.9718844984802432, - "grad_norm": 2.4242560863494873, - "learning_rate": 4.044649032725836e-06, - "loss": 0.5128831267356873, - "mean_token_accuracy": 0.8225122690200806, - "num_tokens": 11460211.0, - "step": 1279 - }, - { - "epoch": 0.9726443768996961, - "grad_norm": 2.1738455295562744, - "learning_rate": 4.0430016922300566e-06, - "loss": 0.441631942987442, - "mean_token_accuracy": 0.841723620891571, - "num_tokens": 11466814.0, - "step": 1280 - }, - { - "epoch": 0.973404255319149, - "grad_norm": 2.541599988937378, - "learning_rate": 4.0413532687962926e-06, - "loss": 0.5062629580497742, - "mean_token_accuracy": 0.8013502359390259, - "num_tokens": 11472371.0, - "step": 1281 - }, - { - "epoch": 0.9741641337386018, - "grad_norm": 2.8011014461517334, - "learning_rate": 4.039703763581472e-06, - "loss": 0.5061966776847839, - "mean_token_accuracy": 0.829810380935669, - "num_tokens": 11476672.0, - "step": 1282 - }, - { - "epoch": 0.9749240121580547, - "grad_norm": 2.4505462646484375, - "learning_rate": 4.038053177743279e-06, - "loss": 0.43407535552978516, - "mean_token_accuracy": 0.8428469896316528, - "num_tokens": 11481297.0, - "step": 1283 - }, - { - "epoch": 0.9756838905775076, - "grad_norm": 2.1618378162384033, - "learning_rate": 4.036401512440161e-06, - "loss": 0.6056663393974304, - "mean_token_accuracy": 0.7977457642555237, - "num_tokens": 11488657.0, - "step": 1284 - }, - { - "epoch": 0.9764437689969605, - "grad_norm": 1.9192147254943848, - "learning_rate": 4.034748768831319e-06, - "loss": 0.524390697479248, - "mean_token_accuracy": 0.8120636940002441, - "num_tokens": 11496485.0, - "step": 1285 - }, - { - "epoch": 0.9772036474164134, - "grad_norm": 2.766435384750366, - "learning_rate": 4.033094948076713e-06, - "loss": 0.5494908690452576, - "mean_token_accuracy": 0.8141890168190002, - "num_tokens": 11501341.0, - "step": 1286 - }, - { - "epoch": 0.9779635258358662, - "grad_norm": 1.3519539833068848, - "learning_rate": 4.031440051337056e-06, - "loss": 0.4339691400527954, - "mean_token_accuracy": 0.8400131464004517, - "num_tokens": 11512843.0, - "step": 1287 - }, - { - "epoch": 0.9787234042553191, - "grad_norm": 1.2492141723632812, - "learning_rate": 4.02978407977382e-06, - "loss": 0.4433518052101135, - "mean_token_accuracy": 0.8432940244674683, - "num_tokens": 11530227.0, - "step": 1288 - }, - { - "epoch": 0.979483282674772, - "grad_norm": 1.6597715616226196, - "learning_rate": 4.02812703454923e-06, - "loss": 0.602222204208374, - "mean_token_accuracy": 0.786965548992157, - "num_tokens": 11543955.0, - "step": 1289 - }, - { - "epoch": 0.9802431610942249, - "grad_norm": 1.6621816158294678, - "learning_rate": 4.026468916826262e-06, - "loss": 0.35662174224853516, - "mean_token_accuracy": 0.8716133832931519, - "num_tokens": 11552064.0, - "step": 1290 - }, - { - "epoch": 0.9810030395136778, - "grad_norm": 4.539844989776611, - "learning_rate": 4.024809727768648e-06, - "loss": 0.543423593044281, - "mean_token_accuracy": 0.8293194770812988, - "num_tokens": 11555595.0, - "step": 1291 - }, - { - "epoch": 0.9817629179331308, - "grad_norm": 1.4026556015014648, - "learning_rate": 4.023149468540871e-06, - "loss": 0.4301237165927887, - "mean_token_accuracy": 0.8358224630355835, - "num_tokens": 11572275.0, - "step": 1292 - }, - { - "epoch": 0.9825227963525835, - "grad_norm": 1.611262321472168, - "learning_rate": 4.021488140308165e-06, - "loss": 0.5378580689430237, - "mean_token_accuracy": 0.8173760771751404, - "num_tokens": 11584299.0, - "step": 1293 - }, - { - "epoch": 0.9832826747720365, - "grad_norm": 4.138631820678711, - "learning_rate": 4.019825744236514e-06, - "loss": 0.40272149443626404, - "mean_token_accuracy": 0.8648844957351685, - "num_tokens": 11586705.0, - "step": 1294 - }, - { - "epoch": 0.9840425531914894, - "grad_norm": 3.177703619003296, - "learning_rate": 4.018162281492651e-06, - "loss": 0.5320103168487549, - "mean_token_accuracy": 0.8250276446342468, - "num_tokens": 11590689.0, - "step": 1295 - }, - { - "epoch": 0.9848024316109423, - "grad_norm": 2.727597713470459, - "learning_rate": 4.016497753244058e-06, - "loss": 0.5662774443626404, - "mean_token_accuracy": 0.8074625730514526, - "num_tokens": 11596092.0, - "step": 1296 - }, - { - "epoch": 0.9855623100303952, - "grad_norm": 1.485139012336731, - "learning_rate": 4.014832160658966e-06, - "loss": 0.5414972305297852, - "mean_token_accuracy": 0.8082696199417114, - "num_tokens": 11613785.0, - "step": 1297 - }, - { - "epoch": 0.986322188449848, - "grad_norm": 2.4025990962982178, - "learning_rate": 4.013165504906352e-06, - "loss": 0.6556503772735596, - "mean_token_accuracy": 0.7785214781761169, - "num_tokens": 11620421.0, - "step": 1298 - }, - { - "epoch": 0.9870820668693009, - "grad_norm": 1.878273606300354, - "learning_rate": 4.011497787155938e-06, - "loss": 0.4221133887767792, - "mean_token_accuracy": 0.850035548210144, - "num_tokens": 11627998.0, - "step": 1299 - }, - { - "epoch": 0.9878419452887538, - "grad_norm": 2.0430715084075928, - "learning_rate": 4.009829008578192e-06, - "loss": 0.5205984711647034, - "mean_token_accuracy": 0.819183349609375, - "num_tokens": 11636279.0, - "step": 1300 - }, - { - "epoch": 0.9886018237082067, - "grad_norm": 3.4769439697265625, - "learning_rate": 4.00815917034433e-06, - "loss": 0.5449948310852051, - "mean_token_accuracy": 0.8240023851394653, - "num_tokens": 11639638.0, - "step": 1301 - }, - { - "epoch": 0.9893617021276596, - "grad_norm": 2.4783987998962402, - "learning_rate": 4.006488273626307e-06, - "loss": 0.4316832423210144, - "mean_token_accuracy": 0.8474695086479187, - "num_tokens": 11645463.0, - "step": 1302 - }, - { - "epoch": 0.9901215805471124, - "grad_norm": 1.881475567817688, - "learning_rate": 4.004816319596822e-06, - "loss": 0.5157331824302673, - "mean_token_accuracy": 0.826042652130127, - "num_tokens": 11653955.0, - "step": 1303 - }, - { - "epoch": 0.9908814589665653, - "grad_norm": 2.6569254398345947, - "learning_rate": 4.003143309429317e-06, - "loss": 0.46492767333984375, - "mean_token_accuracy": 0.8320850133895874, - "num_tokens": 11659357.0, - "step": 1304 - }, - { - "epoch": 0.9916413373860182, - "grad_norm": 2.4917593002319336, - "learning_rate": 4.0014692442979756e-06, - "loss": 0.459585040807724, - "mean_token_accuracy": 0.8457611799240112, - "num_tokens": 11664207.0, - "step": 1305 - }, - { - "epoch": 0.9924012158054711, - "grad_norm": 2.6885526180267334, - "learning_rate": 3.999794125377721e-06, - "loss": 0.4677402973175049, - "mean_token_accuracy": 0.8307361602783203, - "num_tokens": 11668879.0, - "step": 1306 - }, - { - "epoch": 0.993161094224924, - "grad_norm": 1.9737319946289062, - "learning_rate": 3.998117953844215e-06, - "loss": 0.44684839248657227, - "mean_token_accuracy": 0.8367687463760376, - "num_tokens": 11676081.0, - "step": 1307 - }, - { - "epoch": 0.993920972644377, - "grad_norm": 1.4333021640777588, - "learning_rate": 3.996440730873861e-06, - "loss": 0.526146650314331, - "mean_token_accuracy": 0.816251814365387, - "num_tokens": 11689333.0, - "step": 1308 - }, - { - "epoch": 0.9946808510638298, - "grad_norm": 1.3689230680465698, - "learning_rate": 3.9947624576437975e-06, - "loss": 0.40214329957962036, - "mean_token_accuracy": 0.8610327839851379, - "num_tokens": 11701540.0, - "step": 1309 - }, - { - "epoch": 0.9954407294832827, - "grad_norm": 1.2435375452041626, - "learning_rate": 3.9930831353319025e-06, - "loss": 0.4532913267612457, - "mean_token_accuracy": 0.8415389060974121, - "num_tokens": 11717920.0, - "step": 1310 - }, - { - "epoch": 0.9962006079027356, - "grad_norm": 1.9968011379241943, - "learning_rate": 3.9914027651167866e-06, - "loss": 0.46954160928726196, - "mean_token_accuracy": 0.8351103663444519, - "num_tokens": 11724999.0, - "step": 1311 - }, - { - "epoch": 0.9969604863221885, - "grad_norm": 1.9521311521530151, - "learning_rate": 3.989721348177801e-06, - "loss": 0.5068016052246094, - "mean_token_accuracy": 0.8220845460891724, - "num_tokens": 11732569.0, - "step": 1312 - }, - { - "epoch": 0.9977203647416414, - "grad_norm": 2.7332582473754883, - "learning_rate": 3.988038885695028e-06, - "loss": 0.4154692590236664, - "mean_token_accuracy": 0.8493857383728027, - "num_tokens": 11736759.0, - "step": 1313 - }, - { - "epoch": 0.9984802431610942, - "grad_norm": 1.8656952381134033, - "learning_rate": 3.986355378849284e-06, - "loss": 0.4151354134082794, - "mean_token_accuracy": 0.83440101146698, - "num_tokens": 11743827.0, - "step": 1314 - }, - { - "epoch": 0.9992401215805471, - "grad_norm": 1.304006576538086, - "learning_rate": 3.984670828822118e-06, - "loss": 0.4926128089427948, - "mean_token_accuracy": 0.8603005409240723, - "num_tokens": 11757707.0, - "step": 1315 - }, - { - "epoch": 1.0, - "grad_norm": 1.497079610824585, - "learning_rate": 3.982985236795815e-06, - "loss": 0.43342477083206177, - "mean_token_accuracy": 0.8550825119018555, - "num_tokens": 11769678.0, - "step": 1316 - }, - { - "epoch": 1.000759878419453, - "grad_norm": 2.870274543762207, - "learning_rate": 3.981298603953385e-06, - "loss": 0.3723528981208801, - "mean_token_accuracy": 0.8745899796485901, - "num_tokens": 11773290.0, - "step": 1317 - }, - { - "epoch": 1.0015197568389058, - "grad_norm": 1.3442503213882446, - "learning_rate": 3.979610931478574e-06, - "loss": 0.34688329696655273, - "mean_token_accuracy": 0.8749074935913086, - "num_tokens": 11786400.0, - "step": 1318 - }, - { - "epoch": 1.0022796352583587, - "grad_norm": 1.7272238731384277, - "learning_rate": 3.977922220555855e-06, - "loss": 0.28274932503700256, - "mean_token_accuracy": 0.896713137626648, - "num_tokens": 11793059.0, - "step": 1319 - }, - { - "epoch": 1.0030395136778116, - "grad_norm": 1.7362451553344727, - "learning_rate": 3.976232472370431e-06, - "loss": 0.5494794845581055, - "mean_token_accuracy": 0.8341718912124634, - "num_tokens": 11802593.0, - "step": 1320 - }, - { - "epoch": 1.0037993920972645, - "grad_norm": 1.3316494226455688, - "learning_rate": 3.97454168810823e-06, - "loss": 0.41505366563796997, - "mean_token_accuracy": 0.8581969738006592, - "num_tokens": 11813925.0, - "step": 1321 - }, - { - "epoch": 1.0045592705167172, - "grad_norm": 1.6152615547180176, - "learning_rate": 3.972849868955913e-06, - "loss": 0.44761013984680176, - "mean_token_accuracy": 0.8413045406341553, - "num_tokens": 11825709.0, - "step": 1322 - }, - { - "epoch": 1.0053191489361701, - "grad_norm": 2.1172471046447754, - "learning_rate": 3.97115701610086e-06, - "loss": 0.3903353810310364, - "mean_token_accuracy": 0.8662760257720947, - "num_tokens": 11832070.0, - "step": 1323 - }, - { - "epoch": 1.006079027355623, - "grad_norm": 1.5923868417739868, - "learning_rate": 3.969463130731183e-06, - "loss": 0.4491051137447357, - "mean_token_accuracy": 0.8677828311920166, - "num_tokens": 11843154.0, - "step": 1324 - }, - { - "epoch": 1.006838905775076, - "grad_norm": 1.6848995685577393, - "learning_rate": 3.967768214035716e-06, - "loss": 0.45765817165374756, - "mean_token_accuracy": 0.8401060104370117, - "num_tokens": 11854826.0, - "step": 1325 - }, - { - "epoch": 1.0075987841945289, - "grad_norm": 2.3739020824432373, - "learning_rate": 3.966072267204014e-06, - "loss": 0.4482722580432892, - "mean_token_accuracy": 0.8368916511535645, - "num_tokens": 11860559.0, - "step": 1326 - }, - { - "epoch": 1.0083586626139818, - "grad_norm": 1.5403034687042236, - "learning_rate": 3.964375291426361e-06, - "loss": 0.35589972138404846, - "mean_token_accuracy": 0.8728118538856506, - "num_tokens": 11871959.0, - "step": 1327 - }, - { - "epoch": 1.0091185410334347, - "grad_norm": 1.6750119924545288, - "learning_rate": 3.962677287893758e-06, - "loss": 0.35873427987098694, - "mean_token_accuracy": 0.9027186632156372, - "num_tokens": 11881818.0, - "step": 1328 - }, - { - "epoch": 1.0098784194528876, - "grad_norm": 1.5489170551300049, - "learning_rate": 3.9609782577979305e-06, - "loss": 0.3634672462940216, - "mean_token_accuracy": 0.8582607507705688, - "num_tokens": 11891084.0, - "step": 1329 - }, - { - "epoch": 1.0106382978723405, - "grad_norm": 2.43859601020813, - "learning_rate": 3.959278202331323e-06, - "loss": 0.3640799820423126, - "mean_token_accuracy": 0.88062584400177, - "num_tokens": 11896032.0, - "step": 1330 - }, - { - "epoch": 1.0113981762917934, - "grad_norm": 3.612184524536133, - "learning_rate": 3.9575771226870986e-06, - "loss": 0.3733130097389221, - "mean_token_accuracy": 0.8946067094802856, - "num_tokens": 11899479.0, - "step": 1331 - }, - { - "epoch": 1.012158054711246, - "grad_norm": 1.541355848312378, - "learning_rate": 3.955875020059141e-06, - "loss": 0.320593923330307, - "mean_token_accuracy": 0.9057406783103943, - "num_tokens": 11910179.0, - "step": 1332 - }, - { - "epoch": 1.012917933130699, - "grad_norm": 2.0565030574798584, - "learning_rate": 3.954171895642052e-06, - "loss": 0.3341682553291321, - "mean_token_accuracy": 0.8829344511032104, - "num_tokens": 11916489.0, - "step": 1333 - }, - { - "epoch": 1.013677811550152, - "grad_norm": 2.9732539653778076, - "learning_rate": 3.9524677506311505e-06, - "loss": 0.38488566875457764, - "mean_token_accuracy": 0.8752974271774292, - "num_tokens": 11920682.0, - "step": 1334 - }, - { - "epoch": 1.0144376899696048, - "grad_norm": 2.7697458267211914, - "learning_rate": 3.950762586222469e-06, - "loss": 0.39864760637283325, - "mean_token_accuracy": 0.8593167662620544, - "num_tokens": 11925233.0, - "step": 1335 - }, - { - "epoch": 1.0151975683890577, - "grad_norm": 2.2302119731903076, - "learning_rate": 3.949056403612758e-06, - "loss": 0.3985682725906372, - "mean_token_accuracy": 0.8677899837493896, - "num_tokens": 11932000.0, - "step": 1336 - }, - { - "epoch": 1.0159574468085106, - "grad_norm": 2.360572576522827, - "learning_rate": 3.947349203999485e-06, - "loss": 0.36940714716911316, - "mean_token_accuracy": 0.8760676383972168, - "num_tokens": 11937569.0, - "step": 1337 - }, - { - "epoch": 1.0167173252279635, - "grad_norm": 1.3383921384811401, - "learning_rate": 3.945640988580824e-06, - "loss": 0.40628793835639954, - "mean_token_accuracy": 0.866442084312439, - "num_tokens": 11955679.0, - "step": 1338 - }, - { - "epoch": 1.0174772036474165, - "grad_norm": 2.1502623558044434, - "learning_rate": 3.943931758555669e-06, - "loss": 0.4493565559387207, - "mean_token_accuracy": 0.8307522535324097, - "num_tokens": 11962734.0, - "step": 1339 - }, - { - "epoch": 1.0182370820668694, - "grad_norm": 2.4737331867218018, - "learning_rate": 3.942221515123624e-06, - "loss": 0.28508758544921875, - "mean_token_accuracy": 0.8967142105102539, - "num_tokens": 11967783.0, - "step": 1340 - }, - { - "epoch": 1.0189969604863223, - "grad_norm": 2.4525370597839355, - "learning_rate": 3.940510259485002e-06, - "loss": 0.40227818489074707, - "mean_token_accuracy": 0.8618967533111572, - "num_tokens": 11972918.0, - "step": 1341 - }, - { - "epoch": 1.0197568389057752, - "grad_norm": 1.7299731969833374, - "learning_rate": 3.938797992840828e-06, - "loss": 0.26339593529701233, - "mean_token_accuracy": 0.9004406929016113, - "num_tokens": 11981250.0, - "step": 1342 - }, - { - "epoch": 1.0205167173252279, - "grad_norm": 2.8756747245788574, - "learning_rate": 3.937084716392839e-06, - "loss": 0.47792482376098633, - "mean_token_accuracy": 0.8440839052200317, - "num_tokens": 11986356.0, - "step": 1343 - }, - { - "epoch": 1.0212765957446808, - "grad_norm": 2.104473114013672, - "learning_rate": 3.935370431343475e-06, - "loss": 0.36723971366882324, - "mean_token_accuracy": 0.8831232786178589, - "num_tokens": 11994495.0, - "step": 1344 - }, - { - "epoch": 1.0220364741641337, - "grad_norm": 1.9173074960708618, - "learning_rate": 3.933655138895889e-06, - "loss": 0.409319669008255, - "mean_token_accuracy": 0.8632645606994629, - "num_tokens": 12002060.0, - "step": 1345 - }, - { - "epoch": 1.0227963525835866, - "grad_norm": 2.958311080932617, - "learning_rate": 3.9319388402539395e-06, - "loss": 0.5390093922615051, - "mean_token_accuracy": 0.8204828500747681, - "num_tokens": 12007588.0, - "step": 1346 - }, - { - "epoch": 1.0235562310030395, - "grad_norm": 1.6470831632614136, - "learning_rate": 3.930221536622192e-06, - "loss": 0.4524633288383484, - "mean_token_accuracy": 0.8516575694084167, - "num_tokens": 12018831.0, - "step": 1347 - }, - { - "epoch": 1.0243161094224924, - "grad_norm": 1.3160780668258667, - "learning_rate": 3.928503229205913e-06, - "loss": 0.4180558919906616, - "mean_token_accuracy": 0.8495022058486938, - "num_tokens": 12033947.0, - "step": 1348 - }, - { - "epoch": 1.0250759878419453, - "grad_norm": 1.9686089754104614, - "learning_rate": 3.92678391921108e-06, - "loss": 0.41927334666252136, - "mean_token_accuracy": 0.8462997674942017, - "num_tokens": 12042005.0, - "step": 1349 - }, - { - "epoch": 1.0258358662613982, - "grad_norm": 2.351778507232666, - "learning_rate": 3.92506360784437e-06, - "loss": 0.2946245074272156, - "mean_token_accuracy": 0.9170923233032227, - "num_tokens": 12046579.0, - "step": 1350 - }, - { - "epoch": 1.0265957446808511, - "grad_norm": 2.0636913776397705, - "learning_rate": 3.923342296313162e-06, - "loss": 0.3422774076461792, - "mean_token_accuracy": 0.8809213638305664, - "num_tokens": 12053214.0, - "step": 1351 - }, - { - "epoch": 1.027355623100304, - "grad_norm": 1.7272592782974243, - "learning_rate": 3.92161998582554e-06, - "loss": 0.5864541530609131, - "mean_token_accuracy": 0.7986117601394653, - "num_tokens": 12068522.0, - "step": 1352 - }, - { - "epoch": 1.028115501519757, - "grad_norm": 0.8980231881141663, - "learning_rate": 3.919896677590289e-06, - "loss": 0.2964550256729126, - "mean_token_accuracy": 0.8911845088005066, - "num_tokens": 12093834.0, - "step": 1353 - }, - { - "epoch": 1.0288753799392096, - "grad_norm": 1.6031712293624878, - "learning_rate": 3.918172372816892e-06, - "loss": 0.37254488468170166, - "mean_token_accuracy": 0.8615843057632446, - "num_tokens": 12104393.0, - "step": 1354 - }, - { - "epoch": 1.0296352583586625, - "grad_norm": 1.282134771347046, - "learning_rate": 3.916447072715531e-06, - "loss": 0.3522927761077881, - "mean_token_accuracy": 0.8713657259941101, - "num_tokens": 12118671.0, - "step": 1355 - }, - { - "epoch": 1.0303951367781155, - "grad_norm": 2.1986680030822754, - "learning_rate": 3.914720778497091e-06, - "loss": 0.3716316223144531, - "mean_token_accuracy": 0.8661249279975891, - "num_tokens": 12125178.0, - "step": 1356 - }, - { - "epoch": 1.0311550151975684, - "grad_norm": 1.5937882661819458, - "learning_rate": 3.91299349137315e-06, - "loss": 0.48067355155944824, - "mean_token_accuracy": 0.8284252882003784, - "num_tokens": 12136785.0, - "step": 1357 - }, - { - "epoch": 1.0319148936170213, - "grad_norm": 1.6743099689483643, - "learning_rate": 3.9112652125559845e-06, - "loss": 0.4461551308631897, - "mean_token_accuracy": 0.8381845355033875, - "num_tokens": 12150066.0, - "step": 1358 - }, - { - "epoch": 1.0326747720364742, - "grad_norm": 2.2346715927124023, - "learning_rate": 3.909535943258567e-06, - "loss": 0.3148220181465149, - "mean_token_accuracy": 0.8797591924667358, - "num_tokens": 12155506.0, - "step": 1359 - }, - { - "epoch": 1.033434650455927, - "grad_norm": 1.9608992338180542, - "learning_rate": 3.907805684694567e-06, - "loss": 0.32598960399627686, - "mean_token_accuracy": 0.8819410800933838, - "num_tokens": 12163261.0, - "step": 1360 - }, - { - "epoch": 1.03419452887538, - "grad_norm": 2.413477897644043, - "learning_rate": 3.906074438078343e-06, - "loss": 0.38179588317871094, - "mean_token_accuracy": 0.8739585876464844, - "num_tokens": 12169254.0, - "step": 1361 - }, - { - "epoch": 1.034954407294833, - "grad_norm": 2.0258278846740723, - "learning_rate": 3.904342204624955e-06, - "loss": 0.33240315318107605, - "mean_token_accuracy": 0.8808181285858154, - "num_tokens": 12175379.0, - "step": 1362 - }, - { - "epoch": 1.0357142857142858, - "grad_norm": 2.4111437797546387, - "learning_rate": 3.9026089855501475e-06, - "loss": 0.412802517414093, - "mean_token_accuracy": 0.8504396677017212, - "num_tokens": 12182007.0, - "step": 1363 - }, - { - "epoch": 1.0364741641337385, - "grad_norm": 2.0424840450286865, - "learning_rate": 3.900874782070362e-06, - "loss": 0.2914797067642212, - "mean_token_accuracy": 0.8731886148452759, - "num_tokens": 12187743.0, - "step": 1364 - }, - { - "epoch": 1.0372340425531914, - "grad_norm": 2.9248716831207275, - "learning_rate": 3.899139595402729e-06, - "loss": 0.34071338176727295, - "mean_token_accuracy": 0.8736443519592285, - "num_tokens": 12191830.0, - "step": 1365 - }, - { - "epoch": 1.0379939209726443, - "grad_norm": 2.240220785140991, - "learning_rate": 3.8974034267650695e-06, - "loss": 0.23049014806747437, - "mean_token_accuracy": 0.9000070691108704, - "num_tokens": 12196460.0, - "step": 1366 - }, - { - "epoch": 1.0387537993920972, - "grad_norm": 1.5038460493087769, - "learning_rate": 3.895666277375892e-06, - "loss": 0.32255327701568604, - "mean_token_accuracy": 0.873004674911499, - "num_tokens": 12206230.0, - "step": 1367 - }, - { - "epoch": 1.0395136778115501, - "grad_norm": 1.2339142560958862, - "learning_rate": 3.893928148454398e-06, - "loss": 0.4069131314754486, - "mean_token_accuracy": 0.8461740016937256, - "num_tokens": 12226502.0, - "step": 1368 - }, - { - "epoch": 1.040273556231003, - "grad_norm": 2.531553268432617, - "learning_rate": 3.89218904122047e-06, - "loss": 0.43681037425994873, - "mean_token_accuracy": 0.8497104048728943, - "num_tokens": 12232241.0, - "step": 1369 - }, - { - "epoch": 1.041033434650456, - "grad_norm": 3.8404815196990967, - "learning_rate": 3.890448956894682e-06, - "loss": 0.3241814970970154, - "mean_token_accuracy": 0.884732723236084, - "num_tokens": 12235126.0, - "step": 1370 - }, - { - "epoch": 1.0417933130699089, - "grad_norm": 2.9608030319213867, - "learning_rate": 3.888707896698293e-06, - "loss": 0.4641021490097046, - "mean_token_accuracy": 0.8496800661087036, - "num_tokens": 12240630.0, - "step": 1371 - }, - { - "epoch": 1.0425531914893618, - "grad_norm": 2.1166417598724365, - "learning_rate": 3.886965861853243e-06, - "loss": 0.42038479447364807, - "mean_token_accuracy": 0.8512747287750244, - "num_tokens": 12247969.0, - "step": 1372 - }, - { - "epoch": 1.0433130699088147, - "grad_norm": 2.5918161869049072, - "learning_rate": 3.885222853582163e-06, - "loss": 0.2871917188167572, - "mean_token_accuracy": 0.9129709601402283, - "num_tokens": 12252161.0, - "step": 1373 - }, - { - "epoch": 1.0440729483282676, - "grad_norm": 2.4261348247528076, - "learning_rate": 3.88347887310836e-06, - "loss": 0.4003123342990875, - "mean_token_accuracy": 0.8570356369018555, - "num_tokens": 12258135.0, - "step": 1374 - }, - { - "epoch": 1.0448328267477203, - "grad_norm": 1.3439548015594482, - "learning_rate": 3.881733921655829e-06, - "loss": 0.3278140425682068, - "mean_token_accuracy": 0.8831373453140259, - "num_tokens": 12272849.0, - "step": 1375 - }, - { - "epoch": 1.0455927051671732, - "grad_norm": 1.527989387512207, - "learning_rate": 3.879988000449243e-06, - "loss": 0.33789363503456116, - "mean_token_accuracy": 0.8825669884681702, - "num_tokens": 12283281.0, - "step": 1376 - }, - { - "epoch": 1.046352583586626, - "grad_norm": 1.6755503416061401, - "learning_rate": 3.878241110713957e-06, - "loss": 0.4816160798072815, - "mean_token_accuracy": 0.8193758726119995, - "num_tokens": 12295422.0, - "step": 1377 - }, - { - "epoch": 1.047112462006079, - "grad_norm": 2.8110361099243164, - "learning_rate": 3.876493253676004e-06, - "loss": 0.38662949204444885, - "mean_token_accuracy": 0.8611986637115479, - "num_tokens": 12299806.0, - "step": 1378 - }, - { - "epoch": 1.047872340425532, - "grad_norm": 1.86097252368927, - "learning_rate": 3.8747444305621e-06, - "loss": 0.27612629532814026, - "mean_token_accuracy": 0.8984048366546631, - "num_tokens": 12306599.0, - "step": 1379 - }, - { - "epoch": 1.0486322188449848, - "grad_norm": 2.361828565597534, - "learning_rate": 3.872994642599635e-06, - "loss": 0.469953715801239, - "mean_token_accuracy": 0.8464452028274536, - "num_tokens": 12314249.0, - "step": 1380 - }, - { - "epoch": 1.0493920972644377, - "grad_norm": 1.9524794816970825, - "learning_rate": 3.871243891016676e-06, - "loss": 0.5419625043869019, - "mean_token_accuracy": 0.8468329906463623, - "num_tokens": 12324987.0, - "step": 1381 - }, - { - "epoch": 1.0501519756838906, - "grad_norm": 1.6931511163711548, - "learning_rate": 3.869492177041971e-06, - "loss": 0.3791416883468628, - "mean_token_accuracy": 0.8692882061004639, - "num_tokens": 12336864.0, - "step": 1382 - }, - { - "epoch": 1.0509118541033435, - "grad_norm": 1.909692406654358, - "learning_rate": 3.867739501904938e-06, - "loss": 0.27974557876586914, - "mean_token_accuracy": 0.9004636406898499, - "num_tokens": 12343093.0, - "step": 1383 - }, - { - "epoch": 1.0516717325227964, - "grad_norm": 1.415162205696106, - "learning_rate": 3.8659858668356735e-06, - "loss": 0.38928335905075073, - "mean_token_accuracy": 0.8491984009742737, - "num_tokens": 12356613.0, - "step": 1384 - }, - { - "epoch": 1.0524316109422491, - "grad_norm": 1.8195741176605225, - "learning_rate": 3.864231273064944e-06, - "loss": 0.3798758089542389, - "mean_token_accuracy": 0.8728072047233582, - "num_tokens": 12364860.0, - "step": 1385 - }, - { - "epoch": 1.053191489361702, - "grad_norm": 1.8481454849243164, - "learning_rate": 3.862475721824193e-06, - "loss": 0.269635945558548, - "mean_token_accuracy": 0.899247407913208, - "num_tokens": 12371841.0, - "step": 1386 - }, - { - "epoch": 1.053951367781155, - "grad_norm": 1.7838784456253052, - "learning_rate": 3.8607192143455325e-06, - "loss": 0.36971768736839294, - "mean_token_accuracy": 0.8833638429641724, - "num_tokens": 12380685.0, - "step": 1387 - }, - { - "epoch": 1.0547112462006079, - "grad_norm": 1.333358645439148, - "learning_rate": 3.858961751861748e-06, - "loss": 0.4039418399333954, - "mean_token_accuracy": 0.8541078567504883, - "num_tokens": 12394072.0, - "step": 1388 - }, - { - "epoch": 1.0554711246200608, - "grad_norm": 2.1600265502929688, - "learning_rate": 3.857203335606294e-06, - "loss": 0.38211894035339355, - "mean_token_accuracy": 0.8549972772598267, - "num_tokens": 12400449.0, - "step": 1389 - }, - { - "epoch": 1.0562310030395137, - "grad_norm": 2.914902687072754, - "learning_rate": 3.855443966813295e-06, - "loss": 0.2237374186515808, - "mean_token_accuracy": 0.9253600835800171, - "num_tokens": 12403758.0, - "step": 1390 - }, - { - "epoch": 1.0569908814589666, - "grad_norm": 2.2361080646514893, - "learning_rate": 3.853683646717543e-06, - "loss": 0.3359566926956177, - "mean_token_accuracy": 0.898173451423645, - "num_tokens": 12410374.0, - "step": 1391 - }, - { - "epoch": 1.0577507598784195, - "grad_norm": 2.3639304637908936, - "learning_rate": 3.8519223765544985e-06, - "loss": 0.3844943046569824, - "mean_token_accuracy": 0.863599419593811, - "num_tokens": 12416016.0, - "step": 1392 - }, - { - "epoch": 1.0585106382978724, - "grad_norm": 2.202971935272217, - "learning_rate": 3.85016015756029e-06, - "loss": 0.3546281158924103, - "mean_token_accuracy": 0.8907540440559387, - "num_tokens": 12422026.0, - "step": 1393 - }, - { - "epoch": 1.0592705167173253, - "grad_norm": 1.1279661655426025, - "learning_rate": 3.848396990971709e-06, - "loss": 0.31522464752197266, - "mean_token_accuracy": 0.8662257194519043, - "num_tokens": 12439964.0, - "step": 1394 - }, - { - "epoch": 1.0600303951367782, - "grad_norm": 2.4731740951538086, - "learning_rate": 3.846632878026214e-06, - "loss": 0.456442266702652, - "mean_token_accuracy": 0.8516958951950073, - "num_tokens": 12446231.0, - "step": 1395 - }, - { - "epoch": 1.060790273556231, - "grad_norm": 1.7631878852844238, - "learning_rate": 3.844867819961928e-06, - "loss": 0.487227201461792, - "mean_token_accuracy": 0.8466947078704834, - "num_tokens": 12459989.0, - "step": 1396 - }, - { - "epoch": 1.0615501519756838, - "grad_norm": 2.4468278884887695, - "learning_rate": 3.843101818017637e-06, - "loss": 0.3367291986942291, - "mean_token_accuracy": 0.8734689950942993, - "num_tokens": 12465741.0, - "step": 1397 - }, - { - "epoch": 1.0623100303951367, - "grad_norm": 1.9045145511627197, - "learning_rate": 3.841334873432789e-06, - "loss": 0.4652615487575531, - "mean_token_accuracy": 0.8333107233047485, - "num_tokens": 12474963.0, - "step": 1398 - }, - { - "epoch": 1.0630699088145896, - "grad_norm": 1.6816917657852173, - "learning_rate": 3.839566987447492e-06, - "loss": 0.4144279956817627, - "mean_token_accuracy": 0.8472539186477661, - "num_tokens": 12485521.0, - "step": 1399 - }, - { - "epoch": 1.0638297872340425, - "grad_norm": 1.8990092277526855, - "learning_rate": 3.837798161302518e-06, - "loss": 0.4040985405445099, - "mean_token_accuracy": 0.8514704704284668, - "num_tokens": 12493495.0, - "step": 1400 - }, - { - "epoch": 1.0645896656534954, - "grad_norm": 2.27785325050354, - "learning_rate": 3.836028396239297e-06, - "loss": 0.43425723910331726, - "mean_token_accuracy": 0.8795069456100464, - "num_tokens": 12499789.0, - "step": 1401 - }, - { - "epoch": 1.0653495440729484, - "grad_norm": 2.5130882263183594, - "learning_rate": 3.8342576934999184e-06, - "loss": 0.33892524242401123, - "mean_token_accuracy": 0.8717449903488159, - "num_tokens": 12504885.0, - "step": 1402 - }, - { - "epoch": 1.0661094224924013, - "grad_norm": 2.650040864944458, - "learning_rate": 3.832486054327131e-06, - "loss": 0.4200317859649658, - "mean_token_accuracy": 0.8616159558296204, - "num_tokens": 12509783.0, - "step": 1403 - }, - { - "epoch": 1.0668693009118542, - "grad_norm": 2.9176881313323975, - "learning_rate": 3.830713479964335e-06, - "loss": 0.37018489837646484, - "mean_token_accuracy": 0.8676021695137024, - "num_tokens": 12514441.0, - "step": 1404 - }, - { - "epoch": 1.067629179331307, - "grad_norm": 1.6430318355560303, - "learning_rate": 3.828939971655595e-06, - "loss": 0.27539193630218506, - "mean_token_accuracy": 0.9077831506729126, - "num_tokens": 12523677.0, - "step": 1405 - }, - { - "epoch": 1.06838905775076, - "grad_norm": 1.3683708906173706, - "learning_rate": 3.827165530645627e-06, - "loss": 0.4085099697113037, - "mean_token_accuracy": 0.8579255938529968, - "num_tokens": 12540104.0, - "step": 1406 - }, - { - "epoch": 1.0691489361702127, - "grad_norm": 2.528465747833252, - "learning_rate": 3.825390158179802e-06, - "loss": 0.42462456226348877, - "mean_token_accuracy": 0.852813720703125, - "num_tokens": 12548239.0, - "step": 1407 - }, - { - "epoch": 1.0699088145896656, - "grad_norm": 1.8288795948028564, - "learning_rate": 3.823613855504144e-06, - "loss": 0.412417471408844, - "mean_token_accuracy": 0.8622130751609802, - "num_tokens": 12557316.0, - "step": 1408 - }, - { - "epoch": 1.0706686930091185, - "grad_norm": 2.341794490814209, - "learning_rate": 3.82183662386533e-06, - "loss": 0.2996668815612793, - "mean_token_accuracy": 0.8964041471481323, - "num_tokens": 12562377.0, - "step": 1409 - }, - { - "epoch": 1.0714285714285714, - "grad_norm": 2.555877208709717, - "learning_rate": 3.82005846451069e-06, - "loss": 0.4184221625328064, - "mean_token_accuracy": 0.8678828477859497, - "num_tokens": 12568516.0, - "step": 1410 - }, - { - "epoch": 1.0721884498480243, - "grad_norm": 2.081308126449585, - "learning_rate": 3.8182793786882065e-06, - "loss": 0.4376835823059082, - "mean_token_accuracy": 0.8409077525138855, - "num_tokens": 12576598.0, - "step": 1411 - }, - { - "epoch": 1.0729483282674772, - "grad_norm": 2.0272316932678223, - "learning_rate": 3.816499367646508e-06, - "loss": 0.3630060851573944, - "mean_token_accuracy": 0.8762413263320923, - "num_tokens": 12584587.0, - "step": 1412 - }, - { - "epoch": 1.0737082066869301, - "grad_norm": 2.6382484436035156, - "learning_rate": 3.814718432634877e-06, - "loss": 0.4244990348815918, - "mean_token_accuracy": 0.8509312272071838, - "num_tokens": 12590028.0, - "step": 1413 - }, - { - "epoch": 1.074468085106383, - "grad_norm": 2.429800271987915, - "learning_rate": 3.8129365749032398e-06, - "loss": 0.36990004777908325, - "mean_token_accuracy": 0.8749774098396301, - "num_tokens": 12594984.0, - "step": 1414 - }, - { - "epoch": 1.075227963525836, - "grad_norm": 3.5939090251922607, - "learning_rate": 3.8111537957021736e-06, - "loss": 0.4245661199092865, - "mean_token_accuracy": 0.8481623530387878, - "num_tokens": 12598494.0, - "step": 1415 - }, - { - "epoch": 1.0759878419452888, - "grad_norm": 2.705955982208252, - "learning_rate": 3.809370096282903e-06, - "loss": 0.41851678490638733, - "mean_token_accuracy": 0.8548051714897156, - "num_tokens": 12603876.0, - "step": 1416 - }, - { - "epoch": 1.0767477203647418, - "grad_norm": 1.7812079191207886, - "learning_rate": 3.807585477897296e-06, - "loss": 0.47113919258117676, - "mean_token_accuracy": 0.8346904516220093, - "num_tokens": 12613402.0, - "step": 1417 - }, - { - "epoch": 1.0775075987841944, - "grad_norm": 1.4335212707519531, - "learning_rate": 3.8057999417978654e-06, - "loss": 0.3802063465118408, - "mean_token_accuracy": 0.8563423156738281, - "num_tokens": 12626865.0, - "step": 1418 - }, - { - "epoch": 1.0782674772036474, - "grad_norm": 1.9171305894851685, - "learning_rate": 3.8040134892377702e-06, - "loss": 0.20898357033729553, - "mean_token_accuracy": 0.9189738035202026, - "num_tokens": 12632593.0, - "step": 1419 - }, - { - "epoch": 1.0790273556231003, - "grad_norm": 1.4996821880340576, - "learning_rate": 3.802226121470811e-06, - "loss": 0.4203261137008667, - "mean_token_accuracy": 0.8479211330413818, - "num_tokens": 12646395.0, - "step": 1420 - }, - { - "epoch": 1.0797872340425532, - "grad_norm": 2.2007253170013428, - "learning_rate": 3.800437839751432e-06, - "loss": 0.40370577573776245, - "mean_token_accuracy": 0.8427679538726807, - "num_tokens": 12653508.0, - "step": 1421 - }, - { - "epoch": 1.080547112462006, - "grad_norm": 1.7266581058502197, - "learning_rate": 3.7986486453347183e-06, - "loss": 0.46750491857528687, - "mean_token_accuracy": 0.8429205417633057, - "num_tokens": 12666329.0, - "step": 1422 - }, - { - "epoch": 1.081306990881459, - "grad_norm": 1.4716318845748901, - "learning_rate": 3.796858539476394e-06, - "loss": 0.3330317735671997, - "mean_token_accuracy": 0.879012942314148, - "num_tokens": 12676741.0, - "step": 1423 - }, - { - "epoch": 1.082066869300912, - "grad_norm": 2.652127265930176, - "learning_rate": 3.795067523432826e-06, - "loss": 0.35365715622901917, - "mean_token_accuracy": 0.8796792030334473, - "num_tokens": 12681479.0, - "step": 1424 - }, - { - "epoch": 1.0828267477203648, - "grad_norm": 1.2937829494476318, - "learning_rate": 3.793275598461017e-06, - "loss": 0.25272446870803833, - "mean_token_accuracy": 0.9231734275817871, - "num_tokens": 12694238.0, - "step": 1425 - }, - { - "epoch": 1.0835866261398177, - "grad_norm": 1.3831220865249634, - "learning_rate": 3.7914827658186104e-06, - "loss": 0.4935331344604492, - "mean_token_accuracy": 0.8417420387268066, - "num_tokens": 12712857.0, - "step": 1426 - }, - { - "epoch": 1.0843465045592706, - "grad_norm": 3.059525728225708, - "learning_rate": 3.7896890267638832e-06, - "loss": 0.2592190206050873, - "mean_token_accuracy": 0.9040263295173645, - "num_tokens": 12716766.0, - "step": 1427 - }, - { - "epoch": 1.0851063829787233, - "grad_norm": 2.8399202823638916, - "learning_rate": 3.787894382555752e-06, - "loss": 0.32098138332366943, - "mean_token_accuracy": 0.8838302493095398, - "num_tokens": 12720774.0, - "step": 1428 - }, - { - "epoch": 1.0858662613981762, - "grad_norm": 2.618479013442993, - "learning_rate": 3.7860988344537664e-06, - "loss": 0.425255686044693, - "mean_token_accuracy": 0.8564130067825317, - "num_tokens": 12726506.0, - "step": 1429 - }, - { - "epoch": 1.0866261398176291, - "grad_norm": 1.3108669519424438, - "learning_rate": 3.7843023837181126e-06, - "loss": 0.40220165252685547, - "mean_token_accuracy": 0.8588873147964478, - "num_tokens": 12742814.0, - "step": 1430 - }, - { - "epoch": 1.087386018237082, - "grad_norm": 2.2083566188812256, - "learning_rate": 3.782505031609607e-06, - "loss": 0.318379282951355, - "mean_token_accuracy": 0.8887606859207153, - "num_tokens": 12748388.0, - "step": 1431 - }, - { - "epoch": 1.088145896656535, - "grad_norm": 1.922358751296997, - "learning_rate": 3.7807067793897006e-06, - "loss": 0.2519589364528656, - "mean_token_accuracy": 0.8936764001846313, - "num_tokens": 12754761.0, - "step": 1432 - }, - { - "epoch": 1.0889057750759878, - "grad_norm": 1.7367439270019531, - "learning_rate": 3.778907628320477e-06, - "loss": 0.3970367908477783, - "mean_token_accuracy": 0.858735203742981, - "num_tokens": 12764016.0, - "step": 1433 - }, - { - "epoch": 1.0896656534954408, - "grad_norm": 2.1931066513061523, - "learning_rate": 3.77710757966465e-06, - "loss": 0.5250554084777832, - "mean_token_accuracy": 0.8356746435165405, - "num_tokens": 12772272.0, - "step": 1434 - }, - { - "epoch": 1.0904255319148937, - "grad_norm": 1.718337893486023, - "learning_rate": 3.775306634685562e-06, - "loss": 0.283231645822525, - "mean_token_accuracy": 0.9009919166564941, - "num_tokens": 12780706.0, - "step": 1435 - }, - { - "epoch": 1.0911854103343466, - "grad_norm": 2.1985926628112793, - "learning_rate": 3.773504794647187e-06, - "loss": 0.3913170397281647, - "mean_token_accuracy": 0.8909255266189575, - "num_tokens": 12787052.0, - "step": 1436 - }, - { - "epoch": 1.0919452887537995, - "grad_norm": 2.8687937259674072, - "learning_rate": 3.771702060814123e-06, - "loss": 0.3135771155357361, - "mean_token_accuracy": 0.9016125202178955, - "num_tokens": 12791854.0, - "step": 1437 - }, - { - "epoch": 1.0927051671732522, - "grad_norm": 4.203946590423584, - "learning_rate": 3.7698984344516e-06, - "loss": 0.3642737865447998, - "mean_token_accuracy": 0.8842349052429199, - "num_tokens": 12794969.0, - "step": 1438 - }, - { - "epoch": 1.093465045592705, - "grad_norm": 1.5134642124176025, - "learning_rate": 3.7680939168254733e-06, - "loss": 0.3732057213783264, - "mean_token_accuracy": 0.8671083450317383, - "num_tokens": 12808480.0, - "step": 1439 - }, - { - "epoch": 1.094224924012158, - "grad_norm": 3.2103970050811768, - "learning_rate": 3.7662885092022206e-06, - "loss": 0.3556194603443146, - "mean_token_accuracy": 0.8786529302597046, - "num_tokens": 12812654.0, - "step": 1440 - }, - { - "epoch": 1.094984802431611, - "grad_norm": 2.2774064540863037, - "learning_rate": 3.7644822128489476e-06, - "loss": 0.38409674167633057, - "mean_token_accuracy": 0.866563081741333, - "num_tokens": 12819854.0, - "step": 1441 - }, - { - "epoch": 1.0957446808510638, - "grad_norm": 1.8250885009765625, - "learning_rate": 3.7626750290333824e-06, - "loss": 0.3812350034713745, - "mean_token_accuracy": 0.8676212430000305, - "num_tokens": 12830338.0, - "step": 1442 - }, - { - "epoch": 1.0965045592705167, - "grad_norm": 1.8337891101837158, - "learning_rate": 3.7608669590238765e-06, - "loss": 0.3892471194267273, - "mean_token_accuracy": 0.8616238832473755, - "num_tokens": 12840340.0, - "step": 1443 - }, - { - "epoch": 1.0972644376899696, - "grad_norm": 1.5300254821777344, - "learning_rate": 3.7590580040894025e-06, - "loss": 0.35288217663764954, - "mean_token_accuracy": 0.8625509738922119, - "num_tokens": 12853144.0, - "step": 1444 - }, - { - "epoch": 1.0980243161094225, - "grad_norm": 2.152683734893799, - "learning_rate": 3.7572481654995554e-06, - "loss": 0.4004772901535034, - "mean_token_accuracy": 0.858427107334137, - "num_tokens": 12859970.0, - "step": 1445 - }, - { - "epoch": 1.0987841945288754, - "grad_norm": 1.532832145690918, - "learning_rate": 3.755437444524548e-06, - "loss": 0.46820127964019775, - "mean_token_accuracy": 0.8585472106933594, - "num_tokens": 12875243.0, - "step": 1446 - }, - { - "epoch": 1.0995440729483283, - "grad_norm": 1.6485342979431152, - "learning_rate": 3.7536258424352164e-06, - "loss": 0.46329325437545776, - "mean_token_accuracy": 0.8376060724258423, - "num_tokens": 12886383.0, - "step": 1447 - }, - { - "epoch": 1.1003039513677813, - "grad_norm": 2.402256488800049, - "learning_rate": 3.75181336050301e-06, - "loss": 0.43916207551956177, - "mean_token_accuracy": 0.8448786735534668, - "num_tokens": 12892613.0, - "step": 1448 - }, - { - "epoch": 1.101063829787234, - "grad_norm": 1.3893651962280273, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.3919021785259247, - "mean_token_accuracy": 0.8495820760726929, - "num_tokens": 12905523.0, - "step": 1449 - }, - { - "epoch": 1.1018237082066868, - "grad_norm": 1.5519827604293823, - "learning_rate": 3.7481857621988734e-06, - "loss": 0.4710700809955597, - "mean_token_accuracy": 0.8387632369995117, - "num_tokens": 12918236.0, - "step": 1450 - }, - { - "epoch": 1.1025835866261398, - "grad_norm": 2.0141353607177734, - "learning_rate": 3.74637064837293e-06, - "loss": 0.30866751074790955, - "mean_token_accuracy": 0.9059321880340576, - "num_tokens": 12924391.0, - "step": 1451 - }, - { - "epoch": 1.1033434650455927, - "grad_norm": 1.2201496362686157, - "learning_rate": 3.7445546597960882e-06, - "loss": 0.3938257396221161, - "mean_token_accuracy": 0.8726630210876465, - "num_tokens": 12943338.0, - "step": 1452 - }, - { - "epoch": 1.1041033434650456, - "grad_norm": 2.29434871673584, - "learning_rate": 3.742737797742878e-06, - "loss": 0.4347776174545288, - "mean_token_accuracy": 0.840569257736206, - "num_tokens": 12950636.0, - "step": 1453 - }, - { - "epoch": 1.1048632218844985, - "grad_norm": 2.3875105381011963, - "learning_rate": 3.7409200634884425e-06, - "loss": 0.48353564739227295, - "mean_token_accuracy": 0.8207056522369385, - "num_tokens": 12957635.0, - "step": 1454 - }, - { - "epoch": 1.1056231003039514, - "grad_norm": 2.3539648056030273, - "learning_rate": 3.7391014583085384e-06, - "loss": 0.3532431721687317, - "mean_token_accuracy": 0.8903788924217224, - "num_tokens": 12963032.0, - "step": 1455 - }, - { - "epoch": 1.1063829787234043, - "grad_norm": 1.5611135959625244, - "learning_rate": 3.737281983479534e-06, - "loss": 0.4734863042831421, - "mean_token_accuracy": 0.8413879871368408, - "num_tokens": 12977170.0, - "step": 1456 - }, - { - "epoch": 1.1071428571428572, - "grad_norm": 1.474320411682129, - "learning_rate": 3.735461640278404e-06, - "loss": 0.41854286193847656, - "mean_token_accuracy": 0.8499876856803894, - "num_tokens": 12993750.0, - "step": 1457 - }, - { - "epoch": 1.1079027355623101, - "grad_norm": 2.6873273849487305, - "learning_rate": 3.733640429982738e-06, - "loss": 0.47637903690338135, - "mean_token_accuracy": 0.83599853515625, - "num_tokens": 12999058.0, - "step": 1458 - }, - { - "epoch": 1.108662613981763, - "grad_norm": 1.4575026035308838, - "learning_rate": 3.731818353870729e-06, - "loss": 0.38441652059555054, - "mean_token_accuracy": 0.8582364320755005, - "num_tokens": 13013864.0, - "step": 1459 - }, - { - "epoch": 1.1094224924012157, - "grad_norm": 1.7722690105438232, - "learning_rate": 3.729995413221183e-06, - "loss": 0.4224998950958252, - "mean_token_accuracy": 0.8511888384819031, - "num_tokens": 13023714.0, - "step": 1460 - }, - { - "epoch": 1.1101823708206686, - "grad_norm": 2.625760555267334, - "learning_rate": 3.7281716093135068e-06, - "loss": 0.3487582802772522, - "mean_token_accuracy": 0.8834779262542725, - "num_tokens": 13028608.0, - "step": 1461 - }, - { - "epoch": 1.1109422492401215, - "grad_norm": 1.2554056644439697, - "learning_rate": 3.726346943427719e-06, - "loss": 0.33312469720840454, - "mean_token_accuracy": 0.8704153299331665, - "num_tokens": 13044901.0, - "step": 1462 - }, - { - "epoch": 1.1117021276595744, - "grad_norm": 2.1109910011291504, - "learning_rate": 3.7245214168444388e-06, - "loss": 0.387290894985199, - "mean_token_accuracy": 0.860816240310669, - "num_tokens": 13051452.0, - "step": 1463 - }, - { - "epoch": 1.1124620060790273, - "grad_norm": 3.159201145172119, - "learning_rate": 3.722695030844891e-06, - "loss": 0.37690871953964233, - "mean_token_accuracy": 0.8717561960220337, - "num_tokens": 13055131.0, - "step": 1464 - }, - { - "epoch": 1.1132218844984803, - "grad_norm": 1.3810011148452759, - "learning_rate": 3.7208677867109042e-06, - "loss": 0.36598485708236694, - "mean_token_accuracy": 0.8683375120162964, - "num_tokens": 13069798.0, - "step": 1465 - }, - { - "epoch": 1.1139817629179332, - "grad_norm": 2.500849485397339, - "learning_rate": 3.7190396857249087e-06, - "loss": 0.2781746983528137, - "mean_token_accuracy": 0.9026005268096924, - "num_tokens": 13075127.0, - "step": 1466 - }, - { - "epoch": 1.114741641337386, - "grad_norm": 1.7445712089538574, - "learning_rate": 3.7172107291699356e-06, - "loss": 0.5055314302444458, - "mean_token_accuracy": 0.8252174258232117, - "num_tokens": 13084843.0, - "step": 1467 - }, - { - "epoch": 1.115501519756839, - "grad_norm": 1.6386256217956543, - "learning_rate": 3.7153809183296174e-06, - "loss": 0.38478314876556396, - "mean_token_accuracy": 0.8600847721099854, - "num_tokens": 13096517.0, - "step": 1468 - }, - { - "epoch": 1.1162613981762919, - "grad_norm": 2.3818395137786865, - "learning_rate": 3.713550254488185e-06, - "loss": 0.40308547019958496, - "mean_token_accuracy": 0.8628184795379639, - "num_tokens": 13102324.0, - "step": 1469 - }, - { - "epoch": 1.1170212765957448, - "grad_norm": 1.73163640499115, - "learning_rate": 3.7117187389304703e-06, - "loss": 0.5035421848297119, - "mean_token_accuracy": 0.8229597210884094, - "num_tokens": 13113763.0, - "step": 1470 - }, - { - "epoch": 1.1177811550151975, - "grad_norm": 3.147177219390869, - "learning_rate": 3.7098863729418997e-06, - "loss": 0.557449221611023, - "mean_token_accuracy": 0.8266849517822266, - "num_tokens": 13118849.0, - "step": 1471 - }, - { - "epoch": 1.1185410334346504, - "grad_norm": 1.5061391592025757, - "learning_rate": 3.7080531578085e-06, - "loss": 0.3759554922580719, - "mean_token_accuracy": 0.8541903495788574, - "num_tokens": 13131337.0, - "step": 1472 - }, - { - "epoch": 1.1193009118541033, - "grad_norm": 2.172346353530884, - "learning_rate": 3.7062190948168906e-06, - "loss": 0.41491609811782837, - "mean_token_accuracy": 0.8531454801559448, - "num_tokens": 13139767.0, - "step": 1473 - }, - { - "epoch": 1.1200607902735562, - "grad_norm": 2.1527154445648193, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4309239387512207, - "mean_token_accuracy": 0.8327745199203491, - "num_tokens": 13147210.0, - "step": 1474 - }, - { - "epoch": 1.1208206686930091, - "grad_norm": 1.8342832326889038, - "learning_rate": 3.7025484304085035e-06, - "loss": 0.34393298625946045, - "mean_token_accuracy": 0.8948153257369995, - "num_tokens": 13154831.0, - "step": 1475 - }, - { - "epoch": 1.121580547112462, - "grad_norm": 2.509291172027588, - "learning_rate": 3.7007118315679384e-06, - "loss": 0.4479471445083618, - "mean_token_accuracy": 0.8280234336853027, - "num_tokens": 13161040.0, - "step": 1476 - }, - { - "epoch": 1.122340425531915, - "grad_norm": 2.914710521697998, - "learning_rate": 3.6988743900215895e-06, - "loss": 0.3724832832813263, - "mean_token_accuracy": 0.863893985748291, - "num_tokens": 13164975.0, - "step": 1477 - }, - { - "epoch": 1.1231003039513678, - "grad_norm": 3.274808645248413, - "learning_rate": 3.6970361070590443e-06, - "loss": 0.4088161885738373, - "mean_token_accuracy": 0.8474822044372559, - "num_tokens": 13168826.0, - "step": 1478 - }, - { - "epoch": 1.1238601823708207, - "grad_norm": 2.861546277999878, - "learning_rate": 3.695196983970481e-06, - "loss": 0.45837992429733276, - "mean_token_accuracy": 0.8579759001731873, - "num_tokens": 13173794.0, - "step": 1479 - }, - { - "epoch": 1.1246200607902737, - "grad_norm": 1.9491597414016724, - "learning_rate": 3.6933570220466654e-06, - "loss": 0.4333910346031189, - "mean_token_accuracy": 0.8444236516952515, - "num_tokens": 13181598.0, - "step": 1480 - }, - { - "epoch": 1.1253799392097266, - "grad_norm": 1.329848051071167, - "learning_rate": 3.6915162225789546e-06, - "loss": 0.36404621601104736, - "mean_token_accuracy": 0.8694117069244385, - "num_tokens": 13196381.0, - "step": 1481 - }, - { - "epoch": 1.1261398176291793, - "grad_norm": 1.8854197263717651, - "learning_rate": 3.6896745868592924e-06, - "loss": 0.4085756838321686, - "mean_token_accuracy": 0.855188250541687, - "num_tokens": 13205236.0, - "step": 1482 - }, - { - "epoch": 1.1268996960486322, - "grad_norm": 3.01684832572937, - "learning_rate": 3.6878321161802106e-06, - "loss": 0.28105655312538147, - "mean_token_accuracy": 0.9009426236152649, - "num_tokens": 13209380.0, - "step": 1483 - }, - { - "epoch": 1.127659574468085, - "grad_norm": 1.8051308393478394, - "learning_rate": 3.685988811834823e-06, - "loss": 0.3314531147480011, - "mean_token_accuracy": 0.8805814385414124, - "num_tokens": 13217714.0, - "step": 1484 - }, - { - "epoch": 1.128419452887538, - "grad_norm": 1.61757493019104, - "learning_rate": 3.684144675116836e-06, - "loss": 0.4543863534927368, - "mean_token_accuracy": 0.8400536775588989, - "num_tokens": 13229330.0, - "step": 1485 - }, - { - "epoch": 1.1291793313069909, - "grad_norm": 1.602686882019043, - "learning_rate": 3.682299707320532e-06, - "loss": 0.3653204143047333, - "mean_token_accuracy": 0.8655825853347778, - "num_tokens": 13242872.0, - "step": 1486 - }, - { - "epoch": 1.1299392097264438, - "grad_norm": 2.3093113899230957, - "learning_rate": 3.680453909740782e-06, - "loss": 0.4383693039417267, - "mean_token_accuracy": 0.839782178401947, - "num_tokens": 13248976.0, - "step": 1487 - }, - { - "epoch": 1.1306990881458967, - "grad_norm": 1.180559754371643, - "learning_rate": 3.6786072836730376e-06, - "loss": 0.5354755520820618, - "mean_token_accuracy": 0.8151205778121948, - "num_tokens": 13272896.0, - "step": 1488 - }, - { - "epoch": 1.1314589665653496, - "grad_norm": 1.9554040431976318, - "learning_rate": 3.6767598304133325e-06, - "loss": 0.4485316872596741, - "mean_token_accuracy": 0.8399936556816101, - "num_tokens": 13280757.0, - "step": 1489 - }, - { - "epoch": 1.1322188449848025, - "grad_norm": 2.236471176147461, - "learning_rate": 3.674911551258279e-06, - "loss": 0.45594364404678345, - "mean_token_accuracy": 0.8552400469779968, - "num_tokens": 13287328.0, - "step": 1490 - }, - { - "epoch": 1.1329787234042552, - "grad_norm": 2.5228686332702637, - "learning_rate": 3.673062447505072e-06, - "loss": 0.4048641622066498, - "mean_token_accuracy": 0.8617376685142517, - "num_tokens": 13292716.0, - "step": 1491 - }, - { - "epoch": 1.1337386018237081, - "grad_norm": 1.1274473667144775, - "learning_rate": 3.6712125204514836e-06, - "loss": 0.3848876357078552, - "mean_token_accuracy": 0.8672975301742554, - "num_tokens": 13313403.0, - "step": 1492 - }, - { - "epoch": 1.134498480243161, - "grad_norm": 2.349541425704956, - "learning_rate": 3.6693617713958633e-06, - "loss": 0.3166058361530304, - "mean_token_accuracy": 0.8896721601486206, - "num_tokens": 13318720.0, - "step": 1493 - }, - { - "epoch": 1.135258358662614, - "grad_norm": 2.2438278198242188, - "learning_rate": 3.6675102016371387e-06, - "loss": 0.5418218970298767, - "mean_token_accuracy": 0.8256527185440063, - "num_tokens": 13325360.0, - "step": 1494 - }, - { - "epoch": 1.1360182370820668, - "grad_norm": 2.21268892288208, - "learning_rate": 3.665657812474812e-06, - "loss": 0.48603951930999756, - "mean_token_accuracy": 0.8273470401763916, - "num_tokens": 13333217.0, - "step": 1495 - }, - { - "epoch": 1.1367781155015197, - "grad_norm": 2.6105997562408447, - "learning_rate": 3.6638046052089614e-06, - "loss": 0.31221291422843933, - "mean_token_accuracy": 0.888375997543335, - "num_tokens": 13338413.0, - "step": 1496 - }, - { - "epoch": 1.1375379939209727, - "grad_norm": 3.655658483505249, - "learning_rate": 3.661950581140239e-06, - "loss": 0.3609023988246918, - "mean_token_accuracy": 0.8838576078414917, - "num_tokens": 13341499.0, - "step": 1497 - }, - { - "epoch": 1.1382978723404256, - "grad_norm": 2.242009162902832, - "learning_rate": 3.660095741569871e-06, - "loss": 0.40022802352905273, - "mean_token_accuracy": 0.8559960722923279, - "num_tokens": 13347917.0, - "step": 1498 - }, - { - "epoch": 1.1390577507598785, - "grad_norm": 1.7958979606628418, - "learning_rate": 3.658240087799655e-06, - "loss": 0.499157190322876, - "mean_token_accuracy": 0.8423802256584167, - "num_tokens": 13361570.0, - "step": 1499 - }, - { - "epoch": 1.1398176291793314, - "grad_norm": 2.5406908988952637, - "learning_rate": 3.6563836211319593e-06, - "loss": 0.4090137481689453, - "mean_token_accuracy": 0.8769663572311401, - "num_tokens": 13367183.0, - "step": 1500 - }, - { - "epoch": 1.1405775075987843, - "grad_norm": 1.9861716032028198, - "learning_rate": 3.654526342869724e-06, - "loss": 0.5125207304954529, - "mean_token_accuracy": 0.8315266370773315, - "num_tokens": 13376767.0, - "step": 1501 - }, - { - "epoch": 1.141337386018237, - "grad_norm": 1.731188178062439, - "learning_rate": 3.65266825431646e-06, - "loss": 0.39452576637268066, - "mean_token_accuracy": 0.8585706353187561, - "num_tokens": 13388437.0, - "step": 1502 - }, - { - "epoch": 1.1420972644376899, - "grad_norm": 1.5203773975372314, - "learning_rate": 3.6508093567762425e-06, - "loss": 0.39466819167137146, - "mean_token_accuracy": 0.8584027886390686, - "num_tokens": 13399727.0, - "step": 1503 - }, - { - "epoch": 1.1428571428571428, - "grad_norm": 2.606462001800537, - "learning_rate": 3.6489496515537204e-06, - "loss": 0.4521079361438751, - "mean_token_accuracy": 0.8413360118865967, - "num_tokens": 13408426.0, - "step": 1504 - }, - { - "epoch": 1.1436170212765957, - "grad_norm": 2.6207993030548096, - "learning_rate": 3.647089139954104e-06, - "loss": 0.4709353446960449, - "mean_token_accuracy": 0.8397113084793091, - "num_tokens": 13413506.0, - "step": 1505 - }, - { - "epoch": 1.1443768996960486, - "grad_norm": 1.7214165925979614, - "learning_rate": 3.6452278232831734e-06, - "loss": 0.45506367087364197, - "mean_token_accuracy": 0.8466023206710815, - "num_tokens": 13424592.0, - "step": 1506 - }, - { - "epoch": 1.1451367781155015, - "grad_norm": 1.7111759185791016, - "learning_rate": 3.643365702847272e-06, - "loss": 0.5016278624534607, - "mean_token_accuracy": 0.8196234703063965, - "num_tokens": 13434421.0, - "step": 1507 - }, - { - "epoch": 1.1458966565349544, - "grad_norm": 1.7528148889541626, - "learning_rate": 3.641502779953307e-06, - "loss": 0.5020896196365356, - "mean_token_accuracy": 0.826249361038208, - "num_tokens": 13445286.0, - "step": 1508 - }, - { - "epoch": 1.1466565349544073, - "grad_norm": 1.3470909595489502, - "learning_rate": 3.639639055908751e-06, - "loss": 0.45765724778175354, - "mean_token_accuracy": 0.8380560278892517, - "num_tokens": 13465030.0, - "step": 1509 - }, - { - "epoch": 1.1474164133738602, - "grad_norm": 2.4846835136413574, - "learning_rate": 3.6377745320216346e-06, - "loss": 0.46488267183303833, - "mean_token_accuracy": 0.8393925428390503, - "num_tokens": 13470883.0, - "step": 1510 - }, - { - "epoch": 1.1481762917933132, - "grad_norm": 1.770201563835144, - "learning_rate": 3.635909209600555e-06, - "loss": 0.5262179374694824, - "mean_token_accuracy": 0.8201162815093994, - "num_tokens": 13482558.0, - "step": 1511 - }, - { - "epoch": 1.148936170212766, - "grad_norm": 1.5955098867416382, - "learning_rate": 3.6340430899546656e-06, - "loss": 0.430621862411499, - "mean_token_accuracy": 0.8488553762435913, - "num_tokens": 13493003.0, - "step": 1512 - }, - { - "epoch": 1.1496960486322187, - "grad_norm": 2.846176862716675, - "learning_rate": 3.632176174393682e-06, - "loss": 0.23461638391017914, - "mean_token_accuracy": 0.9218817353248596, - "num_tokens": 13496566.0, - "step": 1513 - }, - { - "epoch": 1.1504559270516717, - "grad_norm": 1.9606610536575317, - "learning_rate": 3.630308464227877e-06, - "loss": 0.4940161108970642, - "mean_token_accuracy": 0.8474864959716797, - "num_tokens": 13504843.0, - "step": 1514 - }, - { - "epoch": 1.1512158054711246, - "grad_norm": 1.1588608026504517, - "learning_rate": 3.628439960768082e-06, - "loss": 0.32650992274284363, - "mean_token_accuracy": 0.8797246217727661, - "num_tokens": 13521513.0, - "step": 1515 - }, - { - "epoch": 1.1519756838905775, - "grad_norm": 1.3566495180130005, - "learning_rate": 3.6265706653256837e-06, - "loss": 0.4359064996242523, - "mean_token_accuracy": 0.8379859328269958, - "num_tokens": 13540608.0, - "step": 1516 - }, - { - "epoch": 1.1527355623100304, - "grad_norm": 1.4728609323501587, - "learning_rate": 3.624700579212626e-06, - "loss": 0.29939693212509155, - "mean_token_accuracy": 0.8831408023834229, - "num_tokens": 13550641.0, - "step": 1517 - }, - { - "epoch": 1.1534954407294833, - "grad_norm": 2.162325382232666, - "learning_rate": 3.6228297037414077e-06, - "loss": 0.4097636938095093, - "mean_token_accuracy": 0.8575425148010254, - "num_tokens": 13556931.0, - "step": 1518 - }, - { - "epoch": 1.1542553191489362, - "grad_norm": 1.754439353942871, - "learning_rate": 3.6209580402250816e-06, - "loss": 0.400202214717865, - "mean_token_accuracy": 0.8569821119308472, - "num_tokens": 13565491.0, - "step": 1519 - }, - { - "epoch": 1.155015197568389, - "grad_norm": 1.5250083208084106, - "learning_rate": 3.619085589977251e-06, - "loss": 0.43330419063568115, - "mean_token_accuracy": 0.8492985963821411, - "num_tokens": 13577147.0, - "step": 1520 - }, - { - "epoch": 1.155775075987842, - "grad_norm": 1.9108905792236328, - "learning_rate": 3.617212354312076e-06, - "loss": 0.30567464232444763, - "mean_token_accuracy": 0.8850164413452148, - "num_tokens": 13584366.0, - "step": 1521 - }, - { - "epoch": 1.156534954407295, - "grad_norm": 2.2574243545532227, - "learning_rate": 3.615338334544265e-06, - "loss": 0.4391738772392273, - "mean_token_accuracy": 0.839765727519989, - "num_tokens": 13591816.0, - "step": 1522 - }, - { - "epoch": 1.1572948328267478, - "grad_norm": 2.1235218048095703, - "learning_rate": 3.6134635319890763e-06, - "loss": 0.45043107867240906, - "mean_token_accuracy": 0.8385299444198608, - "num_tokens": 13599736.0, - "step": 1523 - }, - { - "epoch": 1.1580547112462005, - "grad_norm": 2.2274110317230225, - "learning_rate": 3.611587947962319e-06, - "loss": 0.3623226284980774, - "mean_token_accuracy": 0.8724044561386108, - "num_tokens": 13605354.0, - "step": 1524 - }, - { - "epoch": 1.1588145896656534, - "grad_norm": 3.414236545562744, - "learning_rate": 3.6097115837803504e-06, - "loss": 0.30060696601867676, - "mean_token_accuracy": 0.8971061706542969, - "num_tokens": 13608851.0, - "step": 1525 - }, - { - "epoch": 1.1595744680851063, - "grad_norm": 2.496264696121216, - "learning_rate": 3.6078344407600744e-06, - "loss": 0.3567180037498474, - "mean_token_accuracy": 0.8596180081367493, - "num_tokens": 13614339.0, - "step": 1526 - }, - { - "epoch": 1.1603343465045592, - "grad_norm": 2.0191843509674072, - "learning_rate": 3.6059565202189433e-06, - "loss": 0.43206095695495605, - "mean_token_accuracy": 0.8464000821113586, - "num_tokens": 13622395.0, - "step": 1527 - }, - { - "epoch": 1.1610942249240122, - "grad_norm": 1.5475906133651733, - "learning_rate": 3.604077823474954e-06, - "loss": 0.4535648226737976, - "mean_token_accuracy": 0.8391586542129517, - "num_tokens": 13635356.0, - "step": 1528 - }, - { - "epoch": 1.161854103343465, - "grad_norm": 2.1348211765289307, - "learning_rate": 3.6021983518466468e-06, - "loss": 0.2733963429927826, - "mean_token_accuracy": 0.9007417559623718, - "num_tokens": 13640641.0, - "step": 1529 - }, - { - "epoch": 1.162613981762918, - "grad_norm": 2.8452792167663574, - "learning_rate": 3.600318106653108e-06, - "loss": 0.29591235518455505, - "mean_token_accuracy": 0.8934413194656372, - "num_tokens": 13644995.0, - "step": 1530 - }, - { - "epoch": 1.1633738601823709, - "grad_norm": 2.342907190322876, - "learning_rate": 3.5984370892139663e-06, - "loss": 0.4675130248069763, - "mean_token_accuracy": 0.8352028131484985, - "num_tokens": 13652695.0, - "step": 1531 - }, - { - "epoch": 1.1641337386018238, - "grad_norm": 2.3480238914489746, - "learning_rate": 3.5965553008493924e-06, - "loss": 0.3114515542984009, - "mean_token_accuracy": 0.8845353126525879, - "num_tokens": 13658101.0, - "step": 1532 - }, - { - "epoch": 1.1648936170212765, - "grad_norm": 1.8608155250549316, - "learning_rate": 3.594672742880097e-06, - "loss": 0.3864145278930664, - "mean_token_accuracy": 0.867354154586792, - "num_tokens": 13666042.0, - "step": 1533 - }, - { - "epoch": 1.1656534954407296, - "grad_norm": 1.4756088256835938, - "learning_rate": 3.5927894166273324e-06, - "loss": 0.3671600818634033, - "mean_token_accuracy": 0.8695988655090332, - "num_tokens": 13678253.0, - "step": 1534 - }, - { - "epoch": 1.1664133738601823, - "grad_norm": 2.8831355571746826, - "learning_rate": 3.5909053234128893e-06, - "loss": 0.267184317111969, - "mean_token_accuracy": 0.9008115530014038, - "num_tokens": 13681790.0, - "step": 1535 - }, - { - "epoch": 1.1671732522796352, - "grad_norm": 2.1984763145446777, - "learning_rate": 3.5890204645590964e-06, - "loss": 0.4431505799293518, - "mean_token_accuracy": 0.8623673915863037, - "num_tokens": 13688444.0, - "step": 1536 - }, - { - "epoch": 1.167933130699088, - "grad_norm": 1.8271523714065552, - "learning_rate": 3.5871348413888207e-06, - "loss": 0.3861040771007538, - "mean_token_accuracy": 0.8624277114868164, - "num_tokens": 13696872.0, - "step": 1537 - }, - { - "epoch": 1.168693009118541, - "grad_norm": 1.6313756704330444, - "learning_rate": 3.585248455225466e-06, - "loss": 0.3775154948234558, - "mean_token_accuracy": 0.8624461889266968, - "num_tokens": 13706167.0, - "step": 1538 - }, - { - "epoch": 1.169452887537994, - "grad_norm": 2.4377901554107666, - "learning_rate": 3.5833613073929684e-06, - "loss": 0.2308957427740097, - "mean_token_accuracy": 0.920600175857544, - "num_tokens": 13710367.0, - "step": 1539 - }, - { - "epoch": 1.1702127659574468, - "grad_norm": 2.2621750831604004, - "learning_rate": 3.5814733992158025e-06, - "loss": 0.33167219161987305, - "mean_token_accuracy": 0.8963261842727661, - "num_tokens": 13716384.0, - "step": 1540 - }, - { - "epoch": 1.1709726443768997, - "grad_norm": 1.3178150653839111, - "learning_rate": 3.579584732018975e-06, - "loss": 0.3276631832122803, - "mean_token_accuracy": 0.8853521347045898, - "num_tokens": 13731031.0, - "step": 1541 - }, - { - "epoch": 1.1717325227963526, - "grad_norm": 2.177750587463379, - "learning_rate": 3.577695307128024e-06, - "loss": 0.48177266120910645, - "mean_token_accuracy": 0.830329418182373, - "num_tokens": 13737925.0, - "step": 1542 - }, - { - "epoch": 1.1724924012158056, - "grad_norm": 2.2268829345703125, - "learning_rate": 3.5758051258690223e-06, - "loss": 0.48843517899513245, - "mean_token_accuracy": 0.8310644030570984, - "num_tokens": 13746039.0, - "step": 1543 - }, - { - "epoch": 1.1732522796352582, - "grad_norm": 1.498701572418213, - "learning_rate": 3.5739141895685708e-06, - "loss": 0.4542962312698364, - "mean_token_accuracy": 0.8500330448150635, - "num_tokens": 13765002.0, - "step": 1544 - }, - { - "epoch": 1.1740121580547112, - "grad_norm": 1.786670446395874, - "learning_rate": 3.5720224995538023e-06, - "loss": 0.27367928624153137, - "mean_token_accuracy": 0.8916142582893372, - "num_tokens": 13774113.0, - "step": 1545 - }, - { - "epoch": 1.174772036474164, - "grad_norm": 2.0311272144317627, - "learning_rate": 3.5701300571523757e-06, - "loss": 0.559987485408783, - "mean_token_accuracy": 0.8266973495483398, - "num_tokens": 13783912.0, - "step": 1546 - }, - { - "epoch": 1.175531914893617, - "grad_norm": 1.8732186555862427, - "learning_rate": 3.5682368636924825e-06, - "loss": 0.5184751152992249, - "mean_token_accuracy": 0.8450918197631836, - "num_tokens": 13792728.0, - "step": 1547 - }, - { - "epoch": 1.1762917933130699, - "grad_norm": 1.4410661458969116, - "learning_rate": 3.566342920502837e-06, - "loss": 0.383536696434021, - "mean_token_accuracy": 0.8672217726707458, - "num_tokens": 13813590.0, - "step": 1548 - }, - { - "epoch": 1.1770516717325228, - "grad_norm": 3.06056547164917, - "learning_rate": 3.564448228912682e-06, - "loss": 0.3941686153411865, - "mean_token_accuracy": 0.8696402311325073, - "num_tokens": 13817704.0, - "step": 1549 - }, - { - "epoch": 1.1778115501519757, - "grad_norm": 1.6150329113006592, - "learning_rate": 3.562552790251785e-06, - "loss": 0.41606605052948, - "mean_token_accuracy": 0.8488572835922241, - "num_tokens": 13831303.0, - "step": 1550 - }, - { - "epoch": 1.1785714285714286, - "grad_norm": 2.1199934482574463, - "learning_rate": 3.5606566058504377e-06, - "loss": 0.3974752426147461, - "mean_token_accuracy": 0.8686345219612122, - "num_tokens": 13837613.0, - "step": 1551 - }, - { - "epoch": 1.1793313069908815, - "grad_norm": 1.5683876276016235, - "learning_rate": 3.558759677039455e-06, - "loss": 0.35225993394851685, - "mean_token_accuracy": 0.8710784316062927, - "num_tokens": 13846779.0, - "step": 1552 - }, - { - "epoch": 1.1800911854103344, - "grad_norm": 1.4644675254821777, - "learning_rate": 3.5568620051501755e-06, - "loss": 0.38400042057037354, - "mean_token_accuracy": 0.8548328876495361, - "num_tokens": 13860713.0, - "step": 1553 - }, - { - "epoch": 1.1808510638297873, - "grad_norm": 1.461491346359253, - "learning_rate": 3.5549635915144578e-06, - "loss": 0.4572640061378479, - "mean_token_accuracy": 0.8506045937538147, - "num_tokens": 13877289.0, - "step": 1554 - }, - { - "epoch": 1.18161094224924, - "grad_norm": 2.6364715099334717, - "learning_rate": 3.553064437464682e-06, - "loss": 0.3954341411590576, - "mean_token_accuracy": 0.8561649322509766, - "num_tokens": 13882064.0, - "step": 1555 - }, - { - "epoch": 1.182370820668693, - "grad_norm": 2.027273654937744, - "learning_rate": 3.551164544333745e-06, - "loss": 0.47625732421875, - "mean_token_accuracy": 0.8349384069442749, - "num_tokens": 13890306.0, - "step": 1556 - }, - { - "epoch": 1.1831306990881458, - "grad_norm": 2.8427743911743164, - "learning_rate": 3.549263913455069e-06, - "loss": 0.4273033142089844, - "mean_token_accuracy": 0.8541387319564819, - "num_tokens": 13894882.0, - "step": 1557 - }, - { - "epoch": 1.1838905775075987, - "grad_norm": 1.6298975944519043, - "learning_rate": 3.5473625461625884e-06, - "loss": 0.4378639757633209, - "mean_token_accuracy": 0.8634963631629944, - "num_tokens": 13906152.0, - "step": 1558 - }, - { - "epoch": 1.1846504559270516, - "grad_norm": 2.4098947048187256, - "learning_rate": 3.5454604437907535e-06, - "loss": 0.47236716747283936, - "mean_token_accuracy": 0.8646864891052246, - "num_tokens": 13911803.0, - "step": 1559 - }, - { - "epoch": 1.1854103343465046, - "grad_norm": 1.5972497463226318, - "learning_rate": 3.543557607674537e-06, - "loss": 0.3001407980918884, - "mean_token_accuracy": 0.8927055597305298, - "num_tokens": 13921304.0, - "step": 1560 - }, - { - "epoch": 1.1861702127659575, - "grad_norm": 2.1140005588531494, - "learning_rate": 3.54165403914942e-06, - "loss": 0.41898271441459656, - "mean_token_accuracy": 0.8542245626449585, - "num_tokens": 13929434.0, - "step": 1561 - }, - { - "epoch": 1.1869300911854104, - "grad_norm": 1.8733803033828735, - "learning_rate": 3.539749739551401e-06, - "loss": 0.35469961166381836, - "mean_token_accuracy": 0.8805290460586548, - "num_tokens": 13937781.0, - "step": 1562 - }, - { - "epoch": 1.1876899696048633, - "grad_norm": 2.2805802822113037, - "learning_rate": 3.53784471021699e-06, - "loss": 0.44496792554855347, - "mean_token_accuracy": 0.8454172611236572, - "num_tokens": 13944394.0, - "step": 1563 - }, - { - "epoch": 1.1884498480243162, - "grad_norm": 0.9728449583053589, - "learning_rate": 3.535938952483211e-06, - "loss": 0.3156968355178833, - "mean_token_accuracy": 0.8739837408065796, - "num_tokens": 13966712.0, - "step": 1564 - }, - { - "epoch": 1.189209726443769, - "grad_norm": 3.025338888168335, - "learning_rate": 3.534032467687597e-06, - "loss": 0.30036938190460205, - "mean_token_accuracy": 0.9058252573013306, - "num_tokens": 13970183.0, - "step": 1565 - }, - { - "epoch": 1.1899696048632218, - "grad_norm": 2.0659425258636475, - "learning_rate": 3.532125257168193e-06, - "loss": 0.30619731545448303, - "mean_token_accuracy": 0.9041587710380554, - "num_tokens": 13976657.0, - "step": 1566 - }, - { - "epoch": 1.1907294832826747, - "grad_norm": 3.2036776542663574, - "learning_rate": 3.5302173222635526e-06, - "loss": 0.4145944118499756, - "mean_token_accuracy": 0.8502328395843506, - "num_tokens": 13981198.0, - "step": 1567 - }, - { - "epoch": 1.1914893617021276, - "grad_norm": 1.7767539024353027, - "learning_rate": 3.5283086643127396e-06, - "loss": 0.437128484249115, - "mean_token_accuracy": 0.8965631723403931, - "num_tokens": 13990259.0, - "step": 1568 - }, - { - "epoch": 1.1922492401215805, - "grad_norm": 1.7777384519577026, - "learning_rate": 3.5263992846553203e-06, - "loss": 0.33831220865249634, - "mean_token_accuracy": 0.8734279870986938, - "num_tokens": 13999363.0, - "step": 1569 - }, - { - "epoch": 1.1930091185410334, - "grad_norm": 1.6710708141326904, - "learning_rate": 3.5244891846313733e-06, - "loss": 0.4005590081214905, - "mean_token_accuracy": 0.8820298314094543, - "num_tokens": 14008719.0, - "step": 1570 - }, - { - "epoch": 1.1937689969604863, - "grad_norm": 1.0378777980804443, - "learning_rate": 3.5225783655814798e-06, - "loss": 0.3174915313720703, - "mean_token_accuracy": 0.8894162774085999, - "num_tokens": 14025806.0, - "step": 1571 - }, - { - "epoch": 1.1945288753799392, - "grad_norm": 1.2647521495819092, - "learning_rate": 3.520666828846726e-06, - "loss": 0.4173050820827484, - "mean_token_accuracy": 0.8437265157699585, - "num_tokens": 14046445.0, - "step": 1572 - }, - { - "epoch": 1.1952887537993921, - "grad_norm": 2.8625528812408447, - "learning_rate": 3.518754575768702e-06, - "loss": 0.37182557582855225, - "mean_token_accuracy": 0.8660947680473328, - "num_tokens": 14051197.0, - "step": 1573 - }, - { - "epoch": 1.196048632218845, - "grad_norm": 1.1213171482086182, - "learning_rate": 3.516841607689501e-06, - "loss": 0.332731157541275, - "mean_token_accuracy": 0.8573278784751892, - "num_tokens": 14070817.0, - "step": 1574 - }, - { - "epoch": 1.196808510638298, - "grad_norm": 1.197508692741394, - "learning_rate": 3.5149279259517165e-06, - "loss": 0.34058472514152527, - "mean_token_accuracy": 0.8603571653366089, - "num_tokens": 14085301.0, - "step": 1575 - }, - { - "epoch": 1.1975683890577509, - "grad_norm": 4.019949913024902, - "learning_rate": 3.5130135318984454e-06, - "loss": 0.3094622492790222, - "mean_token_accuracy": 0.8905094861984253, - "num_tokens": 14088107.0, - "step": 1576 - }, - { - "epoch": 1.1983282674772036, - "grad_norm": 2.591181755065918, - "learning_rate": 3.5110984268732827e-06, - "loss": 0.3407078981399536, - "mean_token_accuracy": 0.880385160446167, - "num_tokens": 14092887.0, - "step": 1577 - }, - { - "epoch": 1.1990881458966565, - "grad_norm": 1.3069331645965576, - "learning_rate": 3.509182612220322e-06, - "loss": 0.3761988878250122, - "mean_token_accuracy": 0.862013041973114, - "num_tokens": 14109216.0, - "step": 1578 - }, - { - "epoch": 1.1998480243161094, - "grad_norm": 1.7802022695541382, - "learning_rate": 3.507266089284157e-06, - "loss": 0.3824652135372162, - "mean_token_accuracy": 0.8707721829414368, - "num_tokens": 14119645.0, - "step": 1579 - }, - { - "epoch": 1.2006079027355623, - "grad_norm": 2.7937185764312744, - "learning_rate": 3.5053488594098763e-06, - "loss": 0.33828890323638916, - "mean_token_accuracy": 0.8765541315078735, - "num_tokens": 14124628.0, - "step": 1580 - }, - { - "epoch": 1.2013677811550152, - "grad_norm": 1.892671823501587, - "learning_rate": 3.5034309239430664e-06, - "loss": 0.3476094603538513, - "mean_token_accuracy": 0.9053795337677002, - "num_tokens": 14131756.0, - "step": 1581 - }, - { - "epoch": 1.202127659574468, - "grad_norm": 1.6857695579528809, - "learning_rate": 3.501512284229807e-06, - "loss": 0.5397108793258667, - "mean_token_accuracy": 0.8173421025276184, - "num_tokens": 14143024.0, - "step": 1582 - }, - { - "epoch": 1.202887537993921, - "grad_norm": 2.501737117767334, - "learning_rate": 3.4995929416166756e-06, - "loss": 0.4192458391189575, - "mean_token_accuracy": 0.8558136224746704, - "num_tokens": 14149499.0, - "step": 1583 - }, - { - "epoch": 1.203647416413374, - "grad_norm": 2.0133907794952393, - "learning_rate": 3.4976728974507387e-06, - "loss": 0.4791576564311981, - "mean_token_accuracy": 0.8253597021102905, - "num_tokens": 14158381.0, - "step": 1584 - }, - { - "epoch": 1.2044072948328268, - "grad_norm": 2.984611988067627, - "learning_rate": 3.4957521530795576e-06, - "loss": 0.3040750026702881, - "mean_token_accuracy": 0.8902391791343689, - "num_tokens": 14162419.0, - "step": 1585 - }, - { - "epoch": 1.2051671732522795, - "grad_norm": 1.518591284751892, - "learning_rate": 3.493830709851185e-06, - "loss": 0.35539618134498596, - "mean_token_accuracy": 0.8737183809280396, - "num_tokens": 14173048.0, - "step": 1586 - }, - { - "epoch": 1.2059270516717326, - "grad_norm": 2.628758192062378, - "learning_rate": 3.4919085691141636e-06, - "loss": 0.33340200781822205, - "mean_token_accuracy": 0.8705098628997803, - "num_tokens": 14178255.0, - "step": 1587 - }, - { - "epoch": 1.2066869300911853, - "grad_norm": 2.5565974712371826, - "learning_rate": 3.4899857322175252e-06, - "loss": 0.44939476251602173, - "mean_token_accuracy": 0.8315504193305969, - "num_tokens": 14183808.0, - "step": 1588 - }, - { - "epoch": 1.2074468085106382, - "grad_norm": 1.7521045207977295, - "learning_rate": 3.4880622005107916e-06, - "loss": 0.3168621063232422, - "mean_token_accuracy": 0.8824669122695923, - "num_tokens": 14192186.0, - "step": 1589 - }, - { - "epoch": 1.2082066869300911, - "grad_norm": 1.9816104173660278, - "learning_rate": 3.486137975343971e-06, - "loss": 0.3892582058906555, - "mean_token_accuracy": 0.8524188995361328, - "num_tokens": 14200512.0, - "step": 1590 - }, - { - "epoch": 1.208966565349544, - "grad_norm": 1.459800124168396, - "learning_rate": 3.484213058067559e-06, - "loss": 0.45930033922195435, - "mean_token_accuracy": 0.8408471345901489, - "num_tokens": 14215232.0, - "step": 1591 - }, - { - "epoch": 1.209726443768997, - "grad_norm": 2.015493154525757, - "learning_rate": 3.482287450032536e-06, - "loss": 0.5514016151428223, - "mean_token_accuracy": 0.8456779718399048, - "num_tokens": 14225402.0, - "step": 1592 - }, - { - "epoch": 1.2104863221884499, - "grad_norm": 3.4511911869049072, - "learning_rate": 3.4803611525903687e-06, - "loss": 0.4772771894931793, - "mean_token_accuracy": 0.8558698892593384, - "num_tokens": 14229038.0, - "step": 1593 - }, - { - "epoch": 1.2112462006079028, - "grad_norm": 2.2247982025146484, - "learning_rate": 3.4784341670930067e-06, - "loss": 0.4042825996875763, - "mean_token_accuracy": 0.8635870218276978, - "num_tokens": 14237057.0, - "step": 1594 - }, - { - "epoch": 1.2120060790273557, - "grad_norm": 2.0534820556640625, - "learning_rate": 3.4765064948928813e-06, - "loss": 0.34057414531707764, - "mean_token_accuracy": 0.8800770044326782, - "num_tokens": 14243013.0, - "step": 1595 - }, - { - "epoch": 1.2127659574468086, - "grad_norm": 2.594703197479248, - "learning_rate": 3.474578137342909e-06, - "loss": 0.4997410774230957, - "mean_token_accuracy": 0.8302106261253357, - "num_tokens": 14251210.0, - "step": 1596 - }, - { - "epoch": 1.2135258358662613, - "grad_norm": 2.517833948135376, - "learning_rate": 3.4726490957964836e-06, - "loss": 0.3630390465259552, - "mean_token_accuracy": 0.8679884672164917, - "num_tokens": 14255893.0, - "step": 1597 - }, - { - "epoch": 1.2142857142857142, - "grad_norm": 1.5177065134048462, - "learning_rate": 3.4707193716074816e-06, - "loss": 0.36218544840812683, - "mean_token_accuracy": 0.879178524017334, - "num_tokens": 14268143.0, - "step": 1598 - }, - { - "epoch": 1.215045592705167, - "grad_norm": 2.215291738510132, - "learning_rate": 3.4687889661302577e-06, - "loss": 0.4166645407676697, - "mean_token_accuracy": 0.8495793342590332, - "num_tokens": 14276794.0, - "step": 1599 - }, - { - "epoch": 1.21580547112462, - "grad_norm": 1.534294843673706, - "learning_rate": 3.466857880719645e-06, - "loss": 0.2635883092880249, - "mean_token_accuracy": 0.8971712589263916, - "num_tokens": 14287000.0, - "step": 1600 - }, - { - "epoch": 1.216565349544073, - "grad_norm": 1.2338658571243286, - "learning_rate": 3.464926116730953e-06, - "loss": 0.339110404253006, - "mean_token_accuracy": 0.895592987537384, - "num_tokens": 14303217.0, - "step": 1601 - }, - { - "epoch": 1.2173252279635258, - "grad_norm": 1.8717178106307983, - "learning_rate": 3.462993675519968e-06, - "loss": 0.41204726696014404, - "mean_token_accuracy": 0.8560728430747986, - "num_tokens": 14311372.0, - "step": 1602 - }, - { - "epoch": 1.2180851063829787, - "grad_norm": 2.844160795211792, - "learning_rate": 3.4610605584429526e-06, - "loss": 0.4129520058631897, - "mean_token_accuracy": 0.8555002212524414, - "num_tokens": 14316244.0, - "step": 1603 - }, - { - "epoch": 1.2188449848024316, - "grad_norm": 1.099926471710205, - "learning_rate": 3.4591267668566412e-06, - "loss": 0.35783132910728455, - "mean_token_accuracy": 0.8693175315856934, - "num_tokens": 14338414.0, - "step": 1604 - }, - { - "epoch": 1.2196048632218845, - "grad_norm": 1.6448384523391724, - "learning_rate": 3.457192302118244e-06, - "loss": 0.42060258984565735, - "mean_token_accuracy": 0.8557323217391968, - "num_tokens": 14349143.0, - "step": 1605 - }, - { - "epoch": 1.2203647416413375, - "grad_norm": 2.097529888153076, - "learning_rate": 3.455257165585444e-06, - "loss": 0.5227499008178711, - "mean_token_accuracy": 0.828961968421936, - "num_tokens": 14360032.0, - "step": 1606 - }, - { - "epoch": 1.2211246200607904, - "grad_norm": 1.602988600730896, - "learning_rate": 3.453321358616393e-06, - "loss": 0.3537187874317169, - "mean_token_accuracy": 0.8776708841323853, - "num_tokens": 14370005.0, - "step": 1607 - }, - { - "epoch": 1.221884498480243, - "grad_norm": 2.358971357345581, - "learning_rate": 3.4513848825697145e-06, - "loss": 0.3448919653892517, - "mean_token_accuracy": 0.8887944221496582, - "num_tokens": 14375718.0, - "step": 1608 - }, - { - "epoch": 1.222644376899696, - "grad_norm": 1.72306227684021, - "learning_rate": 3.4494477388045035e-06, - "loss": 0.36985084414482117, - "mean_token_accuracy": 0.859595537185669, - "num_tokens": 14385016.0, - "step": 1609 - }, - { - "epoch": 1.2234042553191489, - "grad_norm": 1.5494085550308228, - "learning_rate": 3.4475099286803204e-06, - "loss": 0.49003708362579346, - "mean_token_accuracy": 0.8701964616775513, - "num_tokens": 14399277.0, - "step": 1610 - }, - { - "epoch": 1.2241641337386018, - "grad_norm": 2.6874046325683594, - "learning_rate": 3.445571453557196e-06, - "loss": 0.3424490690231323, - "mean_token_accuracy": 0.8835943937301636, - "num_tokens": 14404182.0, - "step": 1611 - }, - { - "epoch": 1.2249240121580547, - "grad_norm": 2.2163190841674805, - "learning_rate": 3.443632314795627e-06, - "loss": 0.40944457054138184, - "mean_token_accuracy": 0.8649888038635254, - "num_tokens": 14410158.0, - "step": 1612 - }, - { - "epoch": 1.2256838905775076, - "grad_norm": 2.7961158752441406, - "learning_rate": 3.4416925137565756e-06, - "loss": 0.17890746891498566, - "mean_token_accuracy": 0.9439430832862854, - "num_tokens": 14413285.0, - "step": 1613 - }, - { - "epoch": 1.2264437689969605, - "grad_norm": 1.421451210975647, - "learning_rate": 3.439752051801467e-06, - "loss": 0.33948683738708496, - "mean_token_accuracy": 0.8754585981369019, - "num_tokens": 14424674.0, - "step": 1614 - }, - { - "epoch": 1.2272036474164134, - "grad_norm": 2.105196237564087, - "learning_rate": 3.4378109302921946e-06, - "loss": 0.40009379386901855, - "mean_token_accuracy": 0.8600341081619263, - "num_tokens": 14432400.0, - "step": 1615 - }, - { - "epoch": 1.2279635258358663, - "grad_norm": 2.004122734069824, - "learning_rate": 3.4358691505911105e-06, - "loss": 0.46013444662094116, - "mean_token_accuracy": 0.8400925993919373, - "num_tokens": 14440741.0, - "step": 1616 - }, - { - "epoch": 1.2287234042553192, - "grad_norm": 1.8407535552978516, - "learning_rate": 3.4339267140610317e-06, - "loss": 0.38828906416893005, - "mean_token_accuracy": 0.8582802414894104, - "num_tokens": 14448698.0, - "step": 1617 - }, - { - "epoch": 1.2294832826747721, - "grad_norm": 2.4285924434661865, - "learning_rate": 3.4319836220652334e-06, - "loss": 0.3109283447265625, - "mean_token_accuracy": 0.8888344764709473, - "num_tokens": 14453674.0, - "step": 1618 - }, - { - "epoch": 1.2302431610942248, - "grad_norm": 1.6322550773620605, - "learning_rate": 3.430039875967454e-06, - "loss": 0.5222204327583313, - "mean_token_accuracy": 0.825019121170044, - "num_tokens": 14465736.0, - "step": 1619 - }, - { - "epoch": 1.2310030395136777, - "grad_norm": 2.307573080062866, - "learning_rate": 3.428095477131888e-06, - "loss": 0.29477375745773315, - "mean_token_accuracy": 0.8899064660072327, - "num_tokens": 14471266.0, - "step": 1620 - }, - { - "epoch": 1.2317629179331306, - "grad_norm": 1.8044531345367432, - "learning_rate": 3.4261504269231904e-06, - "loss": 0.4883342981338501, - "mean_token_accuracy": 0.8310165405273438, - "num_tokens": 14481679.0, - "step": 1621 - }, - { - "epoch": 1.2325227963525835, - "grad_norm": 2.7585411071777344, - "learning_rate": 3.4242047267064714e-06, - "loss": 0.45369645953178406, - "mean_token_accuracy": 0.8432134985923767, - "num_tokens": 14487299.0, - "step": 1622 - }, - { - "epoch": 1.2332826747720365, - "grad_norm": 2.687490701675415, - "learning_rate": 3.4222583778472997e-06, - "loss": 0.5627540349960327, - "mean_token_accuracy": 0.8186438083648682, - "num_tokens": 14494254.0, - "step": 1623 - }, - { - "epoch": 1.2340425531914894, - "grad_norm": 2.622443199157715, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.28697147965431213, - "mean_token_accuracy": 0.8861737847328186, - "num_tokens": 14498632.0, - "step": 1624 - }, - { - "epoch": 1.2348024316109423, - "grad_norm": 2.6943359375, - "learning_rate": 3.4183637396661372e-06, - "loss": 0.25273287296295166, - "mean_token_accuracy": 0.9104914665222168, - "num_tokens": 14502797.0, - "step": 1625 - }, - { - "epoch": 1.2355623100303952, - "grad_norm": 2.428189992904663, - "learning_rate": 3.4164154530775552e-06, - "loss": 0.4213451147079468, - "mean_token_accuracy": 0.851524293422699, - "num_tokens": 14508503.0, - "step": 1626 - }, - { - "epoch": 1.236322188449848, - "grad_norm": 2.1722824573516846, - "learning_rate": 3.4144665233133318e-06, - "loss": 0.35238856077194214, - "mean_token_accuracy": 0.8730837106704712, - "num_tokens": 14516126.0, - "step": 1627 - }, - { - "epoch": 1.237082066869301, - "grad_norm": 2.291365146636963, - "learning_rate": 3.4125169517413005e-06, - "loss": 0.43963465094566345, - "mean_token_accuracy": 0.8525444865226746, - "num_tokens": 14522507.0, - "step": 1628 - }, - { - "epoch": 1.237841945288754, - "grad_norm": 1.6181648969650269, - "learning_rate": 3.410566739729746e-06, - "loss": 0.2799680233001709, - "mean_token_accuracy": 0.8915654420852661, - "num_tokens": 14531025.0, - "step": 1629 - }, - { - "epoch": 1.2386018237082066, - "grad_norm": 1.4039218425750732, - "learning_rate": 3.408615888647402e-06, - "loss": 0.29756587743759155, - "mean_token_accuracy": 0.8951715230941772, - "num_tokens": 14543770.0, - "step": 1630 - }, - { - "epoch": 1.2393617021276595, - "grad_norm": 2.148325204849243, - "learning_rate": 3.4066643998634506e-06, - "loss": 0.3983418345451355, - "mean_token_accuracy": 0.8635951280593872, - "num_tokens": 14550896.0, - "step": 1631 - }, - { - "epoch": 1.2401215805471124, - "grad_norm": 1.5225859880447388, - "learning_rate": 3.4047122747475227e-06, - "loss": 0.3247569799423218, - "mean_token_accuracy": 0.8727027177810669, - "num_tokens": 14562181.0, - "step": 1632 - }, - { - "epoch": 1.2408814589665653, - "grad_norm": 3.99835467338562, - "learning_rate": 3.402759514669694e-06, - "loss": 0.4317352771759033, - "mean_token_accuracy": 0.8488142490386963, - "num_tokens": 14565521.0, - "step": 1633 - }, - { - "epoch": 1.2416413373860182, - "grad_norm": 1.7306902408599854, - "learning_rate": 3.4008061210004872e-06, - "loss": 0.389854371547699, - "mean_token_accuracy": 0.8553084135055542, - "num_tokens": 14574633.0, - "step": 1634 - }, - { - "epoch": 1.2424012158054711, - "grad_norm": 2.3614673614501953, - "learning_rate": 3.3988520951108683e-06, - "loss": 0.3150152564048767, - "mean_token_accuracy": 0.8865959644317627, - "num_tokens": 14580240.0, - "step": 1635 - }, - { - "epoch": 1.243161094224924, - "grad_norm": 1.5625747442245483, - "learning_rate": 3.3968974383722497e-06, - "loss": 0.43160033226013184, - "mean_token_accuracy": 0.840155839920044, - "num_tokens": 14594255.0, - "step": 1636 - }, - { - "epoch": 1.243920972644377, - "grad_norm": 1.871620535850525, - "learning_rate": 3.3949421521564825e-06, - "loss": 0.49550193548202515, - "mean_token_accuracy": 0.8315126299858093, - "num_tokens": 14605416.0, - "step": 1637 - }, - { - "epoch": 1.2446808510638299, - "grad_norm": 2.111304759979248, - "learning_rate": 3.392986237835863e-06, - "loss": 0.2794899046421051, - "mean_token_accuracy": 0.9049773216247559, - "num_tokens": 14611711.0, - "step": 1638 - }, - { - "epoch": 1.2454407294832828, - "grad_norm": 3.7479894161224365, - "learning_rate": 3.391029696783127e-06, - "loss": 0.469397634267807, - "mean_token_accuracy": 0.8352956771850586, - "num_tokens": 14615536.0, - "step": 1639 - }, - { - "epoch": 1.2462006079027357, - "grad_norm": 3.277726650238037, - "learning_rate": 3.389072530371451e-06, - "loss": 0.35431790351867676, - "mean_token_accuracy": 0.8822286128997803, - "num_tokens": 14619390.0, - "step": 1640 - }, - { - "epoch": 1.2469604863221884, - "grad_norm": 1.9583072662353516, - "learning_rate": 3.3871147399744482e-06, - "loss": 0.3708694577217102, - "mean_token_accuracy": 0.8720351457595825, - "num_tokens": 14626573.0, - "step": 1641 - }, - { - "epoch": 1.2477203647416413, - "grad_norm": 1.8734042644500732, - "learning_rate": 3.385156326966173e-06, - "loss": 0.48163774609565735, - "mean_token_accuracy": 0.8479621410369873, - "num_tokens": 14636382.0, - "step": 1642 - }, - { - "epoch": 1.2484802431610942, - "grad_norm": 2.0085532665252686, - "learning_rate": 3.383197292721114e-06, - "loss": 0.4893198311328888, - "mean_token_accuracy": 0.838238000869751, - "num_tokens": 14645083.0, - "step": 1643 - }, - { - "epoch": 1.249240121580547, - "grad_norm": 2.0874593257904053, - "learning_rate": 3.3812376386141966e-06, - "loss": 0.4610505700111389, - "mean_token_accuracy": 0.8441368341445923, - "num_tokens": 14654048.0, - "step": 1644 - }, - { - "epoch": 1.25, - "grad_norm": 1.6887420415878296, - "learning_rate": 3.379277366020782e-06, - "loss": 0.3628596067428589, - "mean_token_accuracy": 0.8838590383529663, - "num_tokens": 14662317.0, - "step": 1645 - }, - { - "epoch": 1.250759878419453, - "grad_norm": 2.389002561569214, - "learning_rate": 3.3773164763166653e-06, - "loss": 0.21903495490550995, - "mean_token_accuracy": 0.9249413013458252, - "num_tokens": 14666394.0, - "step": 1646 - }, - { - "epoch": 1.2515197568389058, - "grad_norm": 1.7091087102890015, - "learning_rate": 3.3753549708780736e-06, - "loss": 0.37802332639694214, - "mean_token_accuracy": 0.8644627332687378, - "num_tokens": 14676214.0, - "step": 1647 - }, - { - "epoch": 1.2522796352583587, - "grad_norm": 2.5717999935150146, - "learning_rate": 3.3733928510816677e-06, - "loss": 0.4236462116241455, - "mean_token_accuracy": 0.8519910573959351, - "num_tokens": 14681681.0, - "step": 1648 - }, - { - "epoch": 1.2530395136778116, - "grad_norm": 1.958856463432312, - "learning_rate": 3.3714301183045382e-06, - "loss": 0.3923419415950775, - "mean_token_accuracy": 0.8720202445983887, - "num_tokens": 14690419.0, - "step": 1649 - }, - { - "epoch": 1.2537993920972643, - "grad_norm": 1.5900038480758667, - "learning_rate": 3.369466773924207e-06, - "loss": 0.4182325601577759, - "mean_token_accuracy": 0.8515387177467346, - "num_tokens": 14699790.0, - "step": 1650 - }, - { - "epoch": 1.2545592705167175, - "grad_norm": 1.260547161102295, - "learning_rate": 3.3675028193186243e-06, - "loss": 0.3915718197822571, - "mean_token_accuracy": 0.8536830544471741, - "num_tokens": 14717502.0, - "step": 1651 - }, - { - "epoch": 1.2553191489361701, - "grad_norm": 1.8152283430099487, - "learning_rate": 3.365538255866169e-06, - "loss": 0.424524188041687, - "mean_token_accuracy": 0.8434420824050903, - "num_tokens": 14726591.0, - "step": 1652 - }, - { - "epoch": 1.256079027355623, - "grad_norm": 1.3357285261154175, - "learning_rate": 3.3635730849456484e-06, - "loss": 0.2949739396572113, - "mean_token_accuracy": 0.8868321180343628, - "num_tokens": 14739911.0, - "step": 1653 - }, - { - "epoch": 1.256838905775076, - "grad_norm": 1.1770358085632324, - "learning_rate": 3.3616073079362925e-06, - "loss": 0.29939576983451843, - "mean_token_accuracy": 0.8923654556274414, - "num_tokens": 14755521.0, - "step": 1654 - }, - { - "epoch": 1.2575987841945289, - "grad_norm": 2.059162139892578, - "learning_rate": 3.3596409262177633e-06, - "loss": 0.4562555253505707, - "mean_token_accuracy": 0.8585271239280701, - "num_tokens": 14764173.0, - "step": 1655 - }, - { - "epoch": 1.2583586626139818, - "grad_norm": 1.430752158164978, - "learning_rate": 3.357673941170139e-06, - "loss": 0.35301265120506287, - "mean_token_accuracy": 0.8920517563819885, - "num_tokens": 14775596.0, - "step": 1656 - }, - { - "epoch": 1.2591185410334347, - "grad_norm": 1.6066302061080933, - "learning_rate": 3.3557063541739283e-06, - "loss": 0.41129636764526367, - "mean_token_accuracy": 0.8512256145477295, - "num_tokens": 14786289.0, - "step": 1657 - }, - { - "epoch": 1.2598784194528876, - "grad_norm": 1.5471590757369995, - "learning_rate": 3.353738166610058e-06, - "loss": 0.3935067057609558, - "mean_token_accuracy": 0.8514131903648376, - "num_tokens": 14798672.0, - "step": 1658 - }, - { - "epoch": 1.2606382978723405, - "grad_norm": 1.3455181121826172, - "learning_rate": 3.35176937985988e-06, - "loss": 0.3486790657043457, - "mean_token_accuracy": 0.8644362688064575, - "num_tokens": 14811603.0, - "step": 1659 - }, - { - "epoch": 1.2613981762917934, - "grad_norm": 1.891432762145996, - "learning_rate": 3.349799995305162e-06, - "loss": 0.3325638175010681, - "mean_token_accuracy": 0.8844645023345947, - "num_tokens": 14819256.0, - "step": 1660 - }, - { - "epoch": 1.262158054711246, - "grad_norm": 2.600614309310913, - "learning_rate": 3.3478300143280946e-06, - "loss": 0.30310919880867004, - "mean_token_accuracy": 0.9103429317474365, - "num_tokens": 14823706.0, - "step": 1661 - }, - { - "epoch": 1.2629179331306992, - "grad_norm": 3.8636202812194824, - "learning_rate": 3.3458594383112868e-06, - "loss": 0.28377676010131836, - "mean_token_accuracy": 0.9047091007232666, - "num_tokens": 14826688.0, - "step": 1662 - }, - { - "epoch": 1.263677811550152, - "grad_norm": 2.3100268840789795, - "learning_rate": 3.343888268637765e-06, - "loss": 0.4723394513130188, - "mean_token_accuracy": 0.8306777477264404, - "num_tokens": 14835471.0, - "step": 1663 - }, - { - "epoch": 1.2644376899696048, - "grad_norm": 1.7582160234451294, - "learning_rate": 3.341916506690971e-06, - "loss": 0.48168784379959106, - "mean_token_accuracy": 0.8281306028366089, - "num_tokens": 14846513.0, - "step": 1664 - }, - { - "epoch": 1.2651975683890577, - "grad_norm": 2.166055917739868, - "learning_rate": 3.3399441538547638e-06, - "loss": 0.4626024067401886, - "mean_token_accuracy": 0.8377980589866638, - "num_tokens": 14853408.0, - "step": 1665 - }, - { - "epoch": 1.2659574468085106, - "grad_norm": 2.23038911819458, - "learning_rate": 3.337971211513417e-06, - "loss": 0.38434159755706787, - "mean_token_accuracy": 0.8708412647247314, - "num_tokens": 14859919.0, - "step": 1666 - }, - { - "epoch": 1.2667173252279635, - "grad_norm": 2.092505693435669, - "learning_rate": 3.3359976810516164e-06, - "loss": 0.35072219371795654, - "mean_token_accuracy": 0.8761640191078186, - "num_tokens": 14865624.0, - "step": 1667 - }, - { - "epoch": 1.2674772036474165, - "grad_norm": 1.8255130052566528, - "learning_rate": 3.3340235638544633e-06, - "loss": 0.4404270648956299, - "mean_token_accuracy": 0.836356520652771, - "num_tokens": 14874181.0, - "step": 1668 - }, - { - "epoch": 1.2682370820668694, - "grad_norm": 1.9889036417007446, - "learning_rate": 3.332048861307467e-06, - "loss": 0.4199368357658386, - "mean_token_accuracy": 0.8508217334747314, - "num_tokens": 14882275.0, - "step": 1669 - }, - { - "epoch": 1.2689969604863223, - "grad_norm": 4.050281047821045, - "learning_rate": 3.330073574796551e-06, - "loss": 0.4271625280380249, - "mean_token_accuracy": 0.8471108675003052, - "num_tokens": 14893633.0, - "step": 1670 - }, - { - "epoch": 1.2697568389057752, - "grad_norm": 1.998838186264038, - "learning_rate": 3.328097705708047e-06, - "loss": 0.34743767976760864, - "mean_token_accuracy": 0.8771528005599976, - "num_tokens": 14899859.0, - "step": 1671 - }, - { - "epoch": 1.2705167173252279, - "grad_norm": 1.7989062070846558, - "learning_rate": 3.3261212554286977e-06, - "loss": 0.5267184376716614, - "mean_token_accuracy": 0.8323302268981934, - "num_tokens": 14911131.0, - "step": 1672 - }, - { - "epoch": 1.2712765957446808, - "grad_norm": 1.312070369720459, - "learning_rate": 3.324144225345649e-06, - "loss": 0.4675425887107849, - "mean_token_accuracy": 0.8157106637954712, - "num_tokens": 14928955.0, - "step": 1673 - }, - { - "epoch": 1.2720364741641337, - "grad_norm": 2.0547919273376465, - "learning_rate": 3.3221666168464584e-06, - "loss": 0.33704331517219543, - "mean_token_accuracy": 0.8621441125869751, - "num_tokens": 14935536.0, - "step": 1674 - }, - { - "epoch": 1.2727963525835866, - "grad_norm": 2.810413122177124, - "learning_rate": 3.320188431319088e-06, - "loss": 0.4007563292980194, - "mean_token_accuracy": 0.8649672269821167, - "num_tokens": 14940219.0, - "step": 1675 - }, - { - "epoch": 1.2735562310030395, - "grad_norm": 1.3516674041748047, - "learning_rate": 3.318209670151904e-06, - "loss": 0.3457040786743164, - "mean_token_accuracy": 0.8698287010192871, - "num_tokens": 14952904.0, - "step": 1676 - }, - { - "epoch": 1.2743161094224924, - "grad_norm": 2.440643310546875, - "learning_rate": 3.3162303347336765e-06, - "loss": 0.5195086002349854, - "mean_token_accuracy": 0.8348199129104614, - "num_tokens": 14958623.0, - "step": 1677 - }, - { - "epoch": 1.2750759878419453, - "grad_norm": 1.3264343738555908, - "learning_rate": 3.3142504264535808e-06, - "loss": 0.2990425229072571, - "mean_token_accuracy": 0.8961933851242065, - "num_tokens": 14971494.0, - "step": 1678 - }, - { - "epoch": 1.2758358662613982, - "grad_norm": 1.3106894493103027, - "learning_rate": 3.3122699467011913e-06, - "loss": 0.291853666305542, - "mean_token_accuracy": 0.893449068069458, - "num_tokens": 14985239.0, - "step": 1679 - }, - { - "epoch": 1.2765957446808511, - "grad_norm": 2.5387396812438965, - "learning_rate": 3.3102888968664857e-06, - "loss": 0.4336916208267212, - "mean_token_accuracy": 0.8447890877723694, - "num_tokens": 14991453.0, - "step": 1680 - }, - { - "epoch": 1.2773556231003038, - "grad_norm": 2.7052135467529297, - "learning_rate": 3.308307278339842e-06, - "loss": 0.3279378116130829, - "mean_token_accuracy": 0.8935879468917847, - "num_tokens": 14995428.0, - "step": 1681 - }, - { - "epoch": 1.278115501519757, - "grad_norm": 1.6251261234283447, - "learning_rate": 3.306325092512034e-06, - "loss": 0.32066458463668823, - "mean_token_accuracy": 0.8909799456596375, - "num_tokens": 15004841.0, - "step": 1682 - }, - { - "epoch": 1.2788753799392096, - "grad_norm": 2.3014605045318604, - "learning_rate": 3.3043423407742374e-06, - "loss": 0.3523373603820801, - "mean_token_accuracy": 0.8810735940933228, - "num_tokens": 15010742.0, - "step": 1683 - }, - { - "epoch": 1.2796352583586625, - "grad_norm": 2.9563019275665283, - "learning_rate": 3.3023590245180237e-06, - "loss": 0.39715707302093506, - "mean_token_accuracy": 0.8779881000518799, - "num_tokens": 15015357.0, - "step": 1684 - }, - { - "epoch": 1.2803951367781155, - "grad_norm": 1.5787957906723022, - "learning_rate": 3.300375145135361e-06, - "loss": 0.44630166888237, - "mean_token_accuracy": 0.8400174975395203, - "num_tokens": 15031360.0, - "step": 1685 - }, - { - "epoch": 1.2811550151975684, - "grad_norm": 1.6753438711166382, - "learning_rate": 3.2983907040186112e-06, - "loss": 0.3235800862312317, - "mean_token_accuracy": 0.8938044309616089, - "num_tokens": 15040276.0, - "step": 1686 - }, - { - "epoch": 1.2819148936170213, - "grad_norm": 1.7331148386001587, - "learning_rate": 3.296405702560532e-06, - "loss": 0.39061424136161804, - "mean_token_accuracy": 0.8599754571914673, - "num_tokens": 15049725.0, - "step": 1687 - }, - { - "epoch": 1.2826747720364742, - "grad_norm": 2.2029430866241455, - "learning_rate": 3.294420142154274e-06, - "loss": 0.43598297238349915, - "mean_token_accuracy": 0.8663698434829712, - "num_tokens": 15058182.0, - "step": 1688 - }, - { - "epoch": 1.283434650455927, - "grad_norm": 2.943964958190918, - "learning_rate": 3.29243402419338e-06, - "loss": 0.405210942029953, - "mean_token_accuracy": 0.854996919631958, - "num_tokens": 15062920.0, - "step": 1689 - }, - { - "epoch": 1.28419452887538, - "grad_norm": 1.9343379735946655, - "learning_rate": 3.2904473500717826e-06, - "loss": 0.35011449456214905, - "mean_token_accuracy": 0.8745867013931274, - "num_tokens": 15070298.0, - "step": 1690 - }, - { - "epoch": 1.284954407294833, - "grad_norm": 2.559859037399292, - "learning_rate": 3.2884601211838087e-06, - "loss": 0.38816407322883606, - "mean_token_accuracy": 0.854763388633728, - "num_tokens": 15075667.0, - "step": 1691 - }, - { - "epoch": 1.2857142857142856, - "grad_norm": 1.4357839822769165, - "learning_rate": 3.2864723389241697e-06, - "loss": 0.4512745141983032, - "mean_token_accuracy": 0.8398592472076416, - "num_tokens": 15090291.0, - "step": 1692 - }, - { - "epoch": 1.2864741641337387, - "grad_norm": 1.7643728256225586, - "learning_rate": 3.284484004687969e-06, - "loss": 0.3536742627620697, - "mean_token_accuracy": 0.8726381063461304, - "num_tokens": 15099325.0, - "step": 1693 - }, - { - "epoch": 1.2872340425531914, - "grad_norm": 1.853173017501831, - "learning_rate": 3.2824951198706958e-06, - "loss": 0.36579740047454834, - "mean_token_accuracy": 0.8988048434257507, - "num_tokens": 15107090.0, - "step": 1694 - }, - { - "epoch": 1.2879939209726443, - "grad_norm": 1.6526862382888794, - "learning_rate": 3.280505685868226e-06, - "loss": 0.3853636682033539, - "mean_token_accuracy": 0.8743607997894287, - "num_tokens": 15117818.0, - "step": 1695 - }, - { - "epoch": 1.2887537993920972, - "grad_norm": 2.790398597717285, - "learning_rate": 3.278515704076821e-06, - "loss": 0.2707311511039734, - "mean_token_accuracy": 0.9034668803215027, - "num_tokens": 15121641.0, - "step": 1696 - }, - { - "epoch": 1.2895136778115501, - "grad_norm": 1.69557523727417, - "learning_rate": 3.276525175893126e-06, - "loss": 0.3707970082759857, - "mean_token_accuracy": 0.8617855906486511, - "num_tokens": 15130414.0, - "step": 1697 - }, - { - "epoch": 1.290273556231003, - "grad_norm": 1.1360478401184082, - "learning_rate": 3.274534102714172e-06, - "loss": 0.3368082344532013, - "mean_token_accuracy": 0.8781654834747314, - "num_tokens": 15148307.0, - "step": 1698 - }, - { - "epoch": 1.291033434650456, - "grad_norm": 1.5894653797149658, - "learning_rate": 3.272542485937369e-06, - "loss": 0.3870658278465271, - "mean_token_accuracy": 0.8830926418304443, - "num_tokens": 15161841.0, - "step": 1699 - }, - { - "epoch": 1.2917933130699089, - "grad_norm": 2.3735709190368652, - "learning_rate": 3.270550326960511e-06, - "loss": 0.3873991370201111, - "mean_token_accuracy": 0.8729057908058167, - "num_tokens": 15167733.0, - "step": 1700 - }, - { - "epoch": 1.2925531914893618, - "grad_norm": 1.3739598989486694, - "learning_rate": 3.268557627181772e-06, - "loss": 0.30831626057624817, - "mean_token_accuracy": 0.8695719242095947, - "num_tokens": 15180861.0, - "step": 1701 - }, - { - "epoch": 1.2933130699088147, - "grad_norm": 1.7526969909667969, - "learning_rate": 3.2665643879997054e-06, - "loss": 0.4716024398803711, - "mean_token_accuracy": 0.8303275108337402, - "num_tokens": 15191642.0, - "step": 1702 - }, - { - "epoch": 1.2940729483282674, - "grad_norm": 2.7866084575653076, - "learning_rate": 3.2645706108132426e-06, - "loss": 0.33337634801864624, - "mean_token_accuracy": 0.8790726065635681, - "num_tokens": 15196038.0, - "step": 1703 - }, - { - "epoch": 1.2948328267477205, - "grad_norm": 2.319765090942383, - "learning_rate": 3.2625762970216944e-06, - "loss": 0.3999716639518738, - "mean_token_accuracy": 0.8693568706512451, - "num_tokens": 15202075.0, - "step": 1704 - }, - { - "epoch": 1.2955927051671732, - "grad_norm": 3.18292498588562, - "learning_rate": 3.2605814480247454e-06, - "loss": 0.4579541087150574, - "mean_token_accuracy": 0.8516187071800232, - "num_tokens": 15206886.0, - "step": 1705 - }, - { - "epoch": 1.296352583586626, - "grad_norm": 2.1816933155059814, - "learning_rate": 3.258586065222459e-06, - "loss": 0.5198885202407837, - "mean_token_accuracy": 0.8170592784881592, - "num_tokens": 15214088.0, - "step": 1706 - }, - { - "epoch": 1.297112462006079, - "grad_norm": 1.9076340198516846, - "learning_rate": 3.2565901500152702e-06, - "loss": 0.49752360582351685, - "mean_token_accuracy": 0.8681992292404175, - "num_tokens": 15226046.0, - "step": 1707 - }, - { - "epoch": 1.297872340425532, - "grad_norm": 2.0223331451416016, - "learning_rate": 3.2545937038039904e-06, - "loss": 0.4515793025493622, - "mean_token_accuracy": 0.8429619073867798, - "num_tokens": 15234993.0, - "step": 1708 - }, - { - "epoch": 1.2986322188449848, - "grad_norm": 2.5089669227600098, - "learning_rate": 3.2525967279898017e-06, - "loss": 0.43628376722335815, - "mean_token_accuracy": 0.8493682146072388, - "num_tokens": 15240575.0, - "step": 1709 - }, - { - "epoch": 1.2993920972644377, - "grad_norm": 2.8347091674804688, - "learning_rate": 3.2505992239742582e-06, - "loss": 0.25112441182136536, - "mean_token_accuracy": 0.908825159072876, - "num_tokens": 15244085.0, - "step": 1710 - }, - { - "epoch": 1.3001519756838906, - "grad_norm": 2.3157572746276855, - "learning_rate": 3.2486011931592863e-06, - "loss": 0.482818067073822, - "mean_token_accuracy": 0.8305923938751221, - "num_tokens": 15250377.0, - "step": 1711 - }, - { - "epoch": 1.3009118541033435, - "grad_norm": 3.169052839279175, - "learning_rate": 3.2466026369471804e-06, - "loss": 0.3493242561817169, - "mean_token_accuracy": 0.86913001537323, - "num_tokens": 15255041.0, - "step": 1712 - }, - { - "epoch": 1.3016717325227964, - "grad_norm": 1.4475083351135254, - "learning_rate": 3.2446035567406033e-06, - "loss": 0.4177290201187134, - "mean_token_accuracy": 0.8497589826583862, - "num_tokens": 15266946.0, - "step": 1713 - }, - { - "epoch": 1.3024316109422491, - "grad_norm": 1.6473008394241333, - "learning_rate": 3.2426039539425875e-06, - "loss": 0.5272886753082275, - "mean_token_accuracy": 0.8440133333206177, - "num_tokens": 15279263.0, - "step": 1714 - }, - { - "epoch": 1.3031914893617023, - "grad_norm": 2.3996543884277344, - "learning_rate": 3.240603829956531e-06, - "loss": 0.4272066652774811, - "mean_token_accuracy": 0.8495640754699707, - "num_tokens": 15285213.0, - "step": 1715 - }, - { - "epoch": 1.303951367781155, - "grad_norm": 1.63034987449646, - "learning_rate": 3.238603186186198e-06, - "loss": 0.4034635126590729, - "mean_token_accuracy": 0.8638584613800049, - "num_tokens": 15295974.0, - "step": 1716 - }, - { - "epoch": 1.3047112462006079, - "grad_norm": 2.153608798980713, - "learning_rate": 3.2366020240357166e-06, - "loss": 0.30712565779685974, - "mean_token_accuracy": 0.8863866329193115, - "num_tokens": 15302220.0, - "step": 1717 - }, - { - "epoch": 1.3054711246200608, - "grad_norm": 2.9814558029174805, - "learning_rate": 3.2346003449095803e-06, - "loss": 0.3922840356826782, - "mean_token_accuracy": 0.868030309677124, - "num_tokens": 15306747.0, - "step": 1718 - }, - { - "epoch": 1.3062310030395137, - "grad_norm": 3.3417985439300537, - "learning_rate": 3.2325981502126434e-06, - "loss": 0.30750396847724915, - "mean_token_accuracy": 0.9065356850624084, - "num_tokens": 15310309.0, - "step": 1719 - }, - { - "epoch": 1.3069908814589666, - "grad_norm": 2.237682819366455, - "learning_rate": 3.2305954413501252e-06, - "loss": 0.35068294405937195, - "mean_token_accuracy": 0.8887614011764526, - "num_tokens": 15316463.0, - "step": 1720 - }, - { - "epoch": 1.3077507598784195, - "grad_norm": 1.9526605606079102, - "learning_rate": 3.228592219727602e-06, - "loss": 0.42061835527420044, - "mean_token_accuracy": 0.8456839323043823, - "num_tokens": 15323984.0, - "step": 1721 - }, - { - "epoch": 1.3085106382978724, - "grad_norm": 1.6454212665557861, - "learning_rate": 3.226588486751012e-06, - "loss": 0.5189976692199707, - "mean_token_accuracy": 0.8187375068664551, - "num_tokens": 15338807.0, - "step": 1722 - }, - { - "epoch": 1.3092705167173253, - "grad_norm": 1.4521609544754028, - "learning_rate": 3.2245842438266526e-06, - "loss": 0.329673171043396, - "mean_token_accuracy": 0.853867769241333, - "num_tokens": 15350400.0, - "step": 1723 - }, - { - "epoch": 1.3100303951367782, - "grad_norm": 1.8750989437103271, - "learning_rate": 3.222579492361179e-06, - "loss": 0.4635341167449951, - "mean_token_accuracy": 0.8393422365188599, - "num_tokens": 15360557.0, - "step": 1724 - }, - { - "epoch": 1.310790273556231, - "grad_norm": 1.2728849649429321, - "learning_rate": 3.220574233761603e-06, - "loss": 0.3255572021007538, - "mean_token_accuracy": 0.8989741802215576, - "num_tokens": 15376548.0, - "step": 1725 - }, - { - "epoch": 1.3115501519756838, - "grad_norm": 3.5155694484710693, - "learning_rate": 3.2185684694352913e-06, - "loss": 0.34204089641571045, - "mean_token_accuracy": 0.8781906366348267, - "num_tokens": 15380304.0, - "step": 1726 - }, - { - "epoch": 1.3123100303951367, - "grad_norm": 2.059800148010254, - "learning_rate": 3.216562200789968e-06, - "loss": 0.36288338899612427, - "mean_token_accuracy": 0.8595278263092041, - "num_tokens": 15387653.0, - "step": 1727 - }, - { - "epoch": 1.3130699088145896, - "grad_norm": 3.5388240814208984, - "learning_rate": 3.214555429233707e-06, - "loss": 0.5434849858283997, - "mean_token_accuracy": 0.8074631690979004, - "num_tokens": 15391662.0, - "step": 1728 - }, - { - "epoch": 1.3138297872340425, - "grad_norm": 2.8595592975616455, - "learning_rate": 3.2125481561749406e-06, - "loss": 0.5113687515258789, - "mean_token_accuracy": 0.8448649644851685, - "num_tokens": 15397536.0, - "step": 1729 - }, - { - "epoch": 1.3145896656534954, - "grad_norm": 2.50386905670166, - "learning_rate": 3.210540383022449e-06, - "loss": 0.5293697118759155, - "mean_token_accuracy": 0.8096445798873901, - "num_tokens": 15403478.0, - "step": 1730 - }, - { - "epoch": 1.3153495440729484, - "grad_norm": 1.880035400390625, - "learning_rate": 3.208532111185365e-06, - "loss": 0.5344835519790649, - "mean_token_accuracy": 0.8172965049743652, - "num_tokens": 15413812.0, - "step": 1731 - }, - { - "epoch": 1.3161094224924013, - "grad_norm": 1.3688768148422241, - "learning_rate": 3.2065233420731717e-06, - "loss": 0.2577427327632904, - "mean_token_accuracy": 0.9142681360244751, - "num_tokens": 15423583.0, - "step": 1732 - }, - { - "epoch": 1.3168693009118542, - "grad_norm": 1.7945705652236938, - "learning_rate": 3.2045140770956987e-06, - "loss": 0.3983926773071289, - "mean_token_accuracy": 0.8652000427246094, - "num_tokens": 15432473.0, - "step": 1733 - }, - { - "epoch": 1.3176291793313069, - "grad_norm": 1.8243350982666016, - "learning_rate": 3.2025043176631283e-06, - "loss": 0.48644185066223145, - "mean_token_accuracy": 0.8319193124771118, - "num_tokens": 15445463.0, - "step": 1734 - }, - { - "epoch": 1.31838905775076, - "grad_norm": 2.000094175338745, - "learning_rate": 3.2004940651859844e-06, - "loss": 0.43567317724227905, - "mean_token_accuracy": 0.8857482671737671, - "num_tokens": 15452382.0, - "step": 1735 - }, - { - "epoch": 1.3191489361702127, - "grad_norm": 2.379974365234375, - "learning_rate": 3.198483321075141e-06, - "loss": 0.5153506398200989, - "mean_token_accuracy": 0.8295865654945374, - "num_tokens": 15458740.0, - "step": 1736 - }, - { - "epoch": 1.3199088145896656, - "grad_norm": 1.6564184427261353, - "learning_rate": 3.196472086741815e-06, - "loss": 0.508430540561676, - "mean_token_accuracy": 0.8181540369987488, - "num_tokens": 15471844.0, - "step": 1737 - }, - { - "epoch": 1.3206686930091185, - "grad_norm": 2.006925344467163, - "learning_rate": 3.194460363597569e-06, - "loss": 0.34542378783226013, - "mean_token_accuracy": 0.8827437162399292, - "num_tokens": 15478414.0, - "step": 1738 - }, - { - "epoch": 1.3214285714285714, - "grad_norm": 3.589045763015747, - "learning_rate": 3.192448153054306e-06, - "loss": 0.4385780096054077, - "mean_token_accuracy": 0.8480287790298462, - "num_tokens": 15482063.0, - "step": 1739 - }, - { - "epoch": 1.3221884498480243, - "grad_norm": 1.9797427654266357, - "learning_rate": 3.190435456524275e-06, - "loss": 0.4330386519432068, - "mean_token_accuracy": 0.8458058834075928, - "num_tokens": 15489803.0, - "step": 1740 - }, - { - "epoch": 1.3229483282674772, - "grad_norm": 1.4777411222457886, - "learning_rate": 3.188422275420063e-06, - "loss": 0.3997895419597626, - "mean_token_accuracy": 0.8639512062072754, - "num_tokens": 15501103.0, - "step": 1741 - }, - { - "epoch": 1.3237082066869301, - "grad_norm": 2.882338523864746, - "learning_rate": 3.186408611154597e-06, - "loss": 0.2336438149213791, - "mean_token_accuracy": 0.9176726937294006, - "num_tokens": 15504854.0, - "step": 1742 - }, - { - "epoch": 1.324468085106383, - "grad_norm": 2.353503704071045, - "learning_rate": 3.184394465141146e-06, - "loss": 0.4107069671154022, - "mean_token_accuracy": 0.8677014112472534, - "num_tokens": 15510662.0, - "step": 1743 - }, - { - "epoch": 1.325227963525836, - "grad_norm": 2.6551976203918457, - "learning_rate": 3.1823798387933134e-06, - "loss": 0.3862302899360657, - "mean_token_accuracy": 0.8819445371627808, - "num_tokens": 15515681.0, - "step": 1744 - }, - { - "epoch": 1.3259878419452886, - "grad_norm": 1.478572964668274, - "learning_rate": 3.180364733525043e-06, - "loss": 0.43972986936569214, - "mean_token_accuracy": 0.832388162612915, - "num_tokens": 15529542.0, - "step": 1745 - }, - { - "epoch": 1.3267477203647418, - "grad_norm": 1.6003550291061401, - "learning_rate": 3.178349150750612e-06, - "loss": 0.3404902219772339, - "mean_token_accuracy": 0.8764007091522217, - "num_tokens": 15538865.0, - "step": 1746 - }, - { - "epoch": 1.3275075987841944, - "grad_norm": 2.130689859390259, - "learning_rate": 3.1763330918846347e-06, - "loss": 0.383136510848999, - "mean_token_accuracy": 0.8652247190475464, - "num_tokens": 15545567.0, - "step": 1747 - }, - { - "epoch": 1.3282674772036474, - "grad_norm": 2.395937442779541, - "learning_rate": 3.1743165583420586e-06, - "loss": 0.3870319128036499, - "mean_token_accuracy": 0.8618065118789673, - "num_tokens": 15551090.0, - "step": 1748 - }, - { - "epoch": 1.3290273556231003, - "grad_norm": 2.0841057300567627, - "learning_rate": 3.1722995515381644e-06, - "loss": 0.4838739335536957, - "mean_token_accuracy": 0.8548711538314819, - "num_tokens": 15558913.0, - "step": 1749 - }, - { - "epoch": 1.3297872340425532, - "grad_norm": 1.4237847328186035, - "learning_rate": 3.1702820728885657e-06, - "loss": 0.40350261330604553, - "mean_token_accuracy": 0.858984649181366, - "num_tokens": 15572045.0, - "step": 1750 - }, - { - "epoch": 1.330547112462006, - "grad_norm": 2.2641282081604004, - "learning_rate": 3.1682641238092064e-06, - "loss": 0.5117636919021606, - "mean_token_accuracy": 0.8078924417495728, - "num_tokens": 15579753.0, - "step": 1751 - }, - { - "epoch": 1.331306990881459, - "grad_norm": 1.0010309219360352, - "learning_rate": 3.1662457057163603e-06, - "loss": 0.3220978379249573, - "mean_token_accuracy": 0.8786559104919434, - "num_tokens": 15602823.0, - "step": 1752 - }, - { - "epoch": 1.332066869300912, - "grad_norm": 2.441230535507202, - "learning_rate": 3.164226820026632e-06, - "loss": 0.37529727816581726, - "mean_token_accuracy": 0.8886898756027222, - "num_tokens": 15608473.0, - "step": 1753 - }, - { - "epoch": 1.3328267477203648, - "grad_norm": 1.2960991859436035, - "learning_rate": 3.162207468156952e-06, - "loss": 0.3393767476081848, - "mean_token_accuracy": 0.8766993284225464, - "num_tokens": 15620893.0, - "step": 1754 - }, - { - "epoch": 1.3335866261398177, - "grad_norm": 2.0806996822357178, - "learning_rate": 3.16018765152458e-06, - "loss": 0.38034507632255554, - "mean_token_accuracy": 0.8854838609695435, - "num_tokens": 15627068.0, - "step": 1755 - }, - { - "epoch": 1.3343465045592704, - "grad_norm": 1.4316699504852295, - "learning_rate": 3.1581673715471007e-06, - "loss": 0.3665890693664551, - "mean_token_accuracy": 0.870919406414032, - "num_tokens": 15641070.0, - "step": 1756 - }, - { - "epoch": 1.3351063829787235, - "grad_norm": 1.3466622829437256, - "learning_rate": 3.1561466296424247e-06, - "loss": 0.37387198209762573, - "mean_token_accuracy": 0.8633951544761658, - "num_tokens": 15653777.0, - "step": 1757 - }, - { - "epoch": 1.3358662613981762, - "grad_norm": 1.8108628988265991, - "learning_rate": 3.154125427228786e-06, - "loss": 0.38428938388824463, - "mean_token_accuracy": 0.85402512550354, - "num_tokens": 15662494.0, - "step": 1758 - }, - { - "epoch": 1.3366261398176291, - "grad_norm": 1.3221700191497803, - "learning_rate": 3.152103765724743e-06, - "loss": 0.42825520038604736, - "mean_token_accuracy": 0.8435465097427368, - "num_tokens": 15677552.0, - "step": 1759 - }, - { - "epoch": 1.337386018237082, - "grad_norm": 2.6247692108154297, - "learning_rate": 3.150081646549174e-06, - "loss": 0.36186715960502625, - "mean_token_accuracy": 0.8767328262329102, - "num_tokens": 15682103.0, - "step": 1760 - }, - { - "epoch": 1.338145896656535, - "grad_norm": 2.1469814777374268, - "learning_rate": 3.1480590711212823e-06, - "loss": 0.3734385669231415, - "mean_token_accuracy": 0.8711104393005371, - "num_tokens": 15689182.0, - "step": 1761 - }, - { - "epoch": 1.3389057750759878, - "grad_norm": 2.1702585220336914, - "learning_rate": 3.1460360408605866e-06, - "loss": 0.2795315086841583, - "mean_token_accuracy": 0.8892190456390381, - "num_tokens": 15694272.0, - "step": 1762 - }, - { - "epoch": 1.3396656534954408, - "grad_norm": 1.918797254562378, - "learning_rate": 3.144012557186931e-06, - "loss": 0.4363473057746887, - "mean_token_accuracy": 0.8573931455612183, - "num_tokens": 15703532.0, - "step": 1763 - }, - { - "epoch": 1.3404255319148937, - "grad_norm": 2.5579960346221924, - "learning_rate": 3.14198862152047e-06, - "loss": 0.406247079372406, - "mean_token_accuracy": 0.8617593050003052, - "num_tokens": 15708652.0, - "step": 1764 - }, - { - "epoch": 1.3411854103343466, - "grad_norm": 2.3617870807647705, - "learning_rate": 3.1399642352816825e-06, - "loss": 0.2839522659778595, - "mean_token_accuracy": 0.8996064066886902, - "num_tokens": 15713598.0, - "step": 1765 - }, - { - "epoch": 1.3419452887537995, - "grad_norm": 1.248302936553955, - "learning_rate": 3.1379393998913594e-06, - "loss": 0.2922290861606598, - "mean_token_accuracy": 0.8948773145675659, - "num_tokens": 15726693.0, - "step": 1766 - }, - { - "epoch": 1.3427051671732522, - "grad_norm": 2.143599510192871, - "learning_rate": 3.135914116770609e-06, - "loss": 0.32176223397254944, - "mean_token_accuracy": 0.8808754682540894, - "num_tokens": 15731901.0, - "step": 1767 - }, - { - "epoch": 1.3434650455927053, - "grad_norm": 4.226369857788086, - "learning_rate": 3.1338883873408517e-06, - "loss": 0.4682556390762329, - "mean_token_accuracy": 0.8566025495529175, - "num_tokens": 15735029.0, - "step": 1768 - }, - { - "epoch": 1.344224924012158, - "grad_norm": 1.8695988655090332, - "learning_rate": 3.1318622130238237e-06, - "loss": 0.4297192394733429, - "mean_token_accuracy": 0.8419148921966553, - "num_tokens": 15744310.0, - "step": 1769 - }, - { - "epoch": 1.344984802431611, - "grad_norm": 2.4321305751800537, - "learning_rate": 3.1298355952415714e-06, - "loss": 0.36076444387435913, - "mean_token_accuracy": 0.8826035261154175, - "num_tokens": 15749337.0, - "step": 1770 - }, - { - "epoch": 1.3457446808510638, - "grad_norm": 1.5500011444091797, - "learning_rate": 3.127808535416454e-06, - "loss": 0.48664039373397827, - "mean_token_accuracy": 0.844344437122345, - "num_tokens": 15761096.0, - "step": 1771 - }, - { - "epoch": 1.3465045592705167, - "grad_norm": 2.1498289108276367, - "learning_rate": 3.1257810349711388e-06, - "loss": 0.4841752052307129, - "mean_token_accuracy": 0.8324567079544067, - "num_tokens": 15768646.0, - "step": 1772 - }, - { - "epoch": 1.3472644376899696, - "grad_norm": 1.2995187044143677, - "learning_rate": 3.1237530953286046e-06, - "loss": 0.492019385099411, - "mean_token_accuracy": 0.8285316228866577, - "num_tokens": 15788401.0, - "step": 1773 - }, - { - "epoch": 1.3480243161094225, - "grad_norm": 2.324819803237915, - "learning_rate": 3.121724717912138e-06, - "loss": 0.33166298270225525, - "mean_token_accuracy": 0.8856451511383057, - "num_tokens": 15794097.0, - "step": 1774 - }, - { - "epoch": 1.3487841945288754, - "grad_norm": 1.9611430168151855, - "learning_rate": 3.11969590414533e-06, - "loss": 0.3974284827709198, - "mean_token_accuracy": 0.8751305937767029, - "num_tokens": 15801065.0, - "step": 1775 - }, - { - "epoch": 1.3495440729483283, - "grad_norm": 1.7084417343139648, - "learning_rate": 3.1176666554520827e-06, - "loss": 0.38729435205459595, - "mean_token_accuracy": 0.8680770397186279, - "num_tokens": 15810353.0, - "step": 1776 - }, - { - "epoch": 1.3503039513677813, - "grad_norm": 1.7616240978240967, - "learning_rate": 3.1156369732566006e-06, - "loss": 0.4271578788757324, - "mean_token_accuracy": 0.843730092048645, - "num_tokens": 15821889.0, - "step": 1777 - }, - { - "epoch": 1.351063829787234, - "grad_norm": 2.030747413635254, - "learning_rate": 3.113606858983391e-06, - "loss": 0.361891508102417, - "mean_token_accuracy": 0.8522407412528992, - "num_tokens": 15830800.0, - "step": 1778 - }, - { - "epoch": 1.3518237082066868, - "grad_norm": 1.4842649698257446, - "learning_rate": 3.1115763140572686e-06, - "loss": 0.466334730386734, - "mean_token_accuracy": 0.8433995246887207, - "num_tokens": 15849422.0, - "step": 1779 - }, - { - "epoch": 1.3525835866261398, - "grad_norm": 1.6595379114151, - "learning_rate": 3.109545339903347e-06, - "loss": 0.4622533321380615, - "mean_token_accuracy": 0.8526314496994019, - "num_tokens": 15860431.0, - "step": 1780 - }, - { - "epoch": 1.3533434650455927, - "grad_norm": 2.1235809326171875, - "learning_rate": 3.107513937947041e-06, - "loss": 0.42694270610809326, - "mean_token_accuracy": 0.854864239692688, - "num_tokens": 15869044.0, - "step": 1781 - }, - { - "epoch": 1.3541033434650456, - "grad_norm": 1.5889263153076172, - "learning_rate": 3.1054821096140675e-06, - "loss": 0.41838499903678894, - "mean_token_accuracy": 0.8671513795852661, - "num_tokens": 15878598.0, - "step": 1782 - }, - { - "epoch": 1.3548632218844985, - "grad_norm": 2.2261741161346436, - "learning_rate": 3.1034498563304435e-06, - "loss": 0.4045066237449646, - "mean_token_accuracy": 0.843826949596405, - "num_tokens": 15885167.0, - "step": 1783 - }, - { - "epoch": 1.3556231003039514, - "grad_norm": 2.2569329738616943, - "learning_rate": 3.1014171795224794e-06, - "loss": 0.36677104234695435, - "mean_token_accuracy": 0.8747833967208862, - "num_tokens": 15891308.0, - "step": 1784 - }, - { - "epoch": 1.3563829787234043, - "grad_norm": 2.1027088165283203, - "learning_rate": 3.0993840806167884e-06, - "loss": 0.437946081161499, - "mean_token_accuracy": 0.8370785117149353, - "num_tokens": 15898952.0, - "step": 1785 - }, - { - "epoch": 1.3571428571428572, - "grad_norm": 1.8768929243087769, - "learning_rate": 3.0973505610402767e-06, - "loss": 0.4201734662055969, - "mean_token_accuracy": 0.8474810123443604, - "num_tokens": 15907340.0, - "step": 1786 - }, - { - "epoch": 1.35790273556231, - "grad_norm": 1.7216229438781738, - "learning_rate": 3.0953166222201474e-06, - "loss": 0.4225231409072876, - "mean_token_accuracy": 0.8437749147415161, - "num_tokens": 15917852.0, - "step": 1787 - }, - { - "epoch": 1.358662613981763, - "grad_norm": 2.6256966590881348, - "learning_rate": 3.093282265583895e-06, - "loss": 0.435439795255661, - "mean_token_accuracy": 0.8452040553092957, - "num_tokens": 15923739.0, - "step": 1788 - }, - { - "epoch": 1.3594224924012157, - "grad_norm": 2.90028977394104, - "learning_rate": 3.0912474925593124e-06, - "loss": 0.3730456829071045, - "mean_token_accuracy": 0.8766646385192871, - "num_tokens": 15927943.0, - "step": 1789 - }, - { - "epoch": 1.3601823708206686, - "grad_norm": 1.5966626405715942, - "learning_rate": 3.0892123045744787e-06, - "loss": 0.42150455713272095, - "mean_token_accuracy": 0.854656457901001, - "num_tokens": 15939922.0, - "step": 1790 - }, - { - "epoch": 1.3609422492401215, - "grad_norm": 1.8069748878479004, - "learning_rate": 3.0871767030577686e-06, - "loss": 0.4954872131347656, - "mean_token_accuracy": 0.8289790153503418, - "num_tokens": 15950095.0, - "step": 1791 - }, - { - "epoch": 1.3617021276595744, - "grad_norm": 2.0855250358581543, - "learning_rate": 3.085140689437846e-06, - "loss": 0.41999945044517517, - "mean_token_accuracy": 0.8517382144927979, - "num_tokens": 15957972.0, - "step": 1792 - }, - { - "epoch": 1.3624620060790273, - "grad_norm": 2.108659267425537, - "learning_rate": 3.0831042651436634e-06, - "loss": 0.3668023645877838, - "mean_token_accuracy": 0.8710855841636658, - "num_tokens": 15965614.0, - "step": 1793 - }, - { - "epoch": 1.3632218844984803, - "grad_norm": 1.3799632787704468, - "learning_rate": 3.0810674316044602e-06, - "loss": 0.351409375667572, - "mean_token_accuracy": 0.870837390422821, - "num_tokens": 15978854.0, - "step": 1794 - }, - { - "epoch": 1.3639817629179332, - "grad_norm": 1.540397047996521, - "learning_rate": 3.0790301902497664e-06, - "loss": 0.403600811958313, - "mean_token_accuracy": 0.8485002517700195, - "num_tokens": 15993324.0, - "step": 1795 - }, - { - "epoch": 1.364741641337386, - "grad_norm": 1.946882963180542, - "learning_rate": 3.076992542509396e-06, - "loss": 0.40118327736854553, - "mean_token_accuracy": 0.8607497811317444, - "num_tokens": 16001937.0, - "step": 1796 - }, - { - "epoch": 1.365501519756839, - "grad_norm": 2.0464305877685547, - "learning_rate": 3.0749544898134487e-06, - "loss": 0.31742292642593384, - "mean_token_accuracy": 0.8878391981124878, - "num_tokens": 16009277.0, - "step": 1797 - }, - { - "epoch": 1.3662613981762917, - "grad_norm": 2.091754913330078, - "learning_rate": 3.072916033592307e-06, - "loss": 0.31580421328544617, - "mean_token_accuracy": 0.8875244855880737, - "num_tokens": 16015756.0, - "step": 1798 - }, - { - "epoch": 1.3670212765957448, - "grad_norm": 3.4449212551116943, - "learning_rate": 3.0708771752766397e-06, - "loss": 0.4692591726779938, - "mean_token_accuracy": 0.8456202149391174, - "num_tokens": 16019912.0, - "step": 1799 - }, - { - "epoch": 1.3677811550151975, - "grad_norm": 1.600419521331787, - "learning_rate": 3.068837916297396e-06, - "loss": 0.40389442443847656, - "mean_token_accuracy": 0.8378961086273193, - "num_tokens": 16032637.0, - "step": 1800 - }, - { - "epoch": 1.3685410334346504, - "grad_norm": 1.5282686948776245, - "learning_rate": 3.0667982580858047e-06, - "loss": 0.379841685295105, - "mean_token_accuracy": 0.8752143383026123, - "num_tokens": 16045205.0, - "step": 1801 - }, - { - "epoch": 1.3693009118541033, - "grad_norm": 2.486079454421997, - "learning_rate": 3.0647582020733773e-06, - "loss": 0.41060030460357666, - "mean_token_accuracy": 0.8575131893157959, - "num_tokens": 16051189.0, - "step": 1802 - }, - { - "epoch": 1.3700607902735562, - "grad_norm": 1.9458621740341187, - "learning_rate": 3.062717749691904e-06, - "loss": 0.4442213773727417, - "mean_token_accuracy": 0.8451495170593262, - "num_tokens": 16059700.0, - "step": 1803 - }, - { - "epoch": 1.3708206686930091, - "grad_norm": 1.4333001375198364, - "learning_rate": 3.0606769023734535e-06, - "loss": 0.39132001996040344, - "mean_token_accuracy": 0.8609901666641235, - "num_tokens": 16072458.0, - "step": 1804 - }, - { - "epoch": 1.371580547112462, - "grad_norm": 1.490355372428894, - "learning_rate": 3.0586356615503693e-06, - "loss": 0.4108564257621765, - "mean_token_accuracy": 0.8871046304702759, - "num_tokens": 16083142.0, - "step": 1805 - }, - { - "epoch": 1.372340425531915, - "grad_norm": 1.7765129804611206, - "learning_rate": 3.056594028655274e-06, - "loss": 0.3850266635417938, - "mean_token_accuracy": 0.8923365473747253, - "num_tokens": 16092519.0, - "step": 1806 - }, - { - "epoch": 1.3731003039513678, - "grad_norm": 1.955661416053772, - "learning_rate": 3.0545520051210637e-06, - "loss": 0.4665378928184509, - "mean_token_accuracy": 0.837419867515564, - "num_tokens": 16100618.0, - "step": 1807 - }, - { - "epoch": 1.3738601823708207, - "grad_norm": 3.259265422821045, - "learning_rate": 3.052509592380909e-06, - "loss": 0.24722981452941895, - "mean_token_accuracy": 0.9106054306030273, - "num_tokens": 16103836.0, - "step": 1808 - }, - { - "epoch": 1.3746200607902734, - "grad_norm": 1.7995736598968506, - "learning_rate": 3.050466791868254e-06, - "loss": 0.4982220530509949, - "mean_token_accuracy": 0.8298169374465942, - "num_tokens": 16114727.0, - "step": 1809 - }, - { - "epoch": 1.3753799392097266, - "grad_norm": 1.9643093347549438, - "learning_rate": 3.048423605016815e-06, - "loss": 0.5076829195022583, - "mean_token_accuracy": 0.8303098678588867, - "num_tokens": 16129491.0, - "step": 1810 - }, - { - "epoch": 1.3761398176291793, - "grad_norm": 3.505594491958618, - "learning_rate": 3.0463800332605787e-06, - "loss": 0.27466052770614624, - "mean_token_accuracy": 0.9018045663833618, - "num_tokens": 16132640.0, - "step": 1811 - }, - { - "epoch": 1.3768996960486322, - "grad_norm": 1.798437237739563, - "learning_rate": 3.0443360780338034e-06, - "loss": 0.4004853069782257, - "mean_token_accuracy": 0.8569544553756714, - "num_tokens": 16143317.0, - "step": 1812 - }, - { - "epoch": 1.377659574468085, - "grad_norm": 2.276740789413452, - "learning_rate": 3.042291740771014e-06, - "loss": 0.3823797106742859, - "mean_token_accuracy": 0.8764113783836365, - "num_tokens": 16148898.0, - "step": 1813 - }, - { - "epoch": 1.378419452887538, - "grad_norm": 2.5051357746124268, - "learning_rate": 3.0402470229070057e-06, - "loss": 0.40365856885910034, - "mean_token_accuracy": 0.8809891939163208, - "num_tokens": 16153815.0, - "step": 1814 - }, - { - "epoch": 1.3791793313069909, - "grad_norm": 1.2379236221313477, - "learning_rate": 3.03820192587684e-06, - "loss": 0.3955119848251343, - "mean_token_accuracy": 0.8536627292633057, - "num_tokens": 16167783.0, - "step": 1815 - }, - { - "epoch": 1.3799392097264438, - "grad_norm": 2.2286343574523926, - "learning_rate": 3.036156451115846e-06, - "loss": 0.39647501707077026, - "mean_token_accuracy": 0.8621993064880371, - "num_tokens": 16174707.0, - "step": 1816 - }, - { - "epoch": 1.3806990881458967, - "grad_norm": 1.884639024734497, - "learning_rate": 3.034110600059616e-06, - "loss": 0.31612110137939453, - "mean_token_accuracy": 0.8942475318908691, - "num_tokens": 16181919.0, - "step": 1817 - }, - { - "epoch": 1.3814589665653496, - "grad_norm": 1.891312599182129, - "learning_rate": 3.0320643741440052e-06, - "loss": 0.46209126710891724, - "mean_token_accuracy": 0.8374713659286499, - "num_tokens": 16189276.0, - "step": 1818 - }, - { - "epoch": 1.3822188449848025, - "grad_norm": 2.507478713989258, - "learning_rate": 3.0300177748051375e-06, - "loss": 0.37601593136787415, - "mean_token_accuracy": 0.8633589148521423, - "num_tokens": 16194346.0, - "step": 1819 - }, - { - "epoch": 1.3829787234042552, - "grad_norm": 1.5046696662902832, - "learning_rate": 3.0279708034793907e-06, - "loss": 0.3284982144832611, - "mean_token_accuracy": 0.8792630434036255, - "num_tokens": 16205457.0, - "step": 1820 - }, - { - "epoch": 1.3837386018237083, - "grad_norm": 2.4244449138641357, - "learning_rate": 3.025923461603412e-06, - "loss": 0.40939009189605713, - "mean_token_accuracy": 0.8596426248550415, - "num_tokens": 16211866.0, - "step": 1821 - }, - { - "epoch": 1.384498480243161, - "grad_norm": 2.8656933307647705, - "learning_rate": 3.0238757506141013e-06, - "loss": 0.4397110044956207, - "mean_token_accuracy": 0.8597331047058105, - "num_tokens": 16216607.0, - "step": 1822 - }, - { - "epoch": 1.385258358662614, - "grad_norm": 2.0718610286712646, - "learning_rate": 3.0218276719486245e-06, - "loss": 0.49057573080062866, - "mean_token_accuracy": 0.8325331211090088, - "num_tokens": 16224014.0, - "step": 1823 - }, - { - "epoch": 1.3860182370820668, - "grad_norm": 1.054450273513794, - "learning_rate": 3.019779227044398e-06, - "loss": 0.3758106827735901, - "mean_token_accuracy": 0.8689473867416382, - "num_tokens": 16248627.0, - "step": 1824 - }, - { - "epoch": 1.3867781155015197, - "grad_norm": 2.1115148067474365, - "learning_rate": 3.0177304173391038e-06, - "loss": 0.502967119216919, - "mean_token_accuracy": 0.823198676109314, - "num_tokens": 16256255.0, - "step": 1825 - }, - { - "epoch": 1.3875379939209727, - "grad_norm": 2.207277297973633, - "learning_rate": 3.015681244270672e-06, - "loss": 0.3458971083164215, - "mean_token_accuracy": 0.8930196762084961, - "num_tokens": 16261823.0, - "step": 1826 - }, - { - "epoch": 1.3882978723404256, - "grad_norm": 1.289669156074524, - "learning_rate": 3.0136317092772923e-06, - "loss": 0.4422765374183655, - "mean_token_accuracy": 0.8358346819877625, - "num_tokens": 16280659.0, - "step": 1827 - }, - { - "epoch": 1.3890577507598785, - "grad_norm": 2.233865737915039, - "learning_rate": 3.0115818137974066e-06, - "loss": 0.3643006384372711, - "mean_token_accuracy": 0.8682862520217896, - "num_tokens": 16286356.0, - "step": 1828 - }, - { - "epoch": 1.3898176291793314, - "grad_norm": 1.0950042009353638, - "learning_rate": 3.0095315592697126e-06, - "loss": 0.34712421894073486, - "mean_token_accuracy": 0.8578766584396362, - "num_tokens": 16307298.0, - "step": 1829 - }, - { - "epoch": 1.3905775075987843, - "grad_norm": 1.1708037853240967, - "learning_rate": 3.007480947133155e-06, - "loss": 0.33152541518211365, - "mean_token_accuracy": 0.894973874092102, - "num_tokens": 16323232.0, - "step": 1830 - }, - { - "epoch": 1.391337386018237, - "grad_norm": 1.2226970195770264, - "learning_rate": 3.0054299788269343e-06, - "loss": 0.3915635943412781, - "mean_token_accuracy": 0.8575779795646667, - "num_tokens": 16339273.0, - "step": 1831 - }, - { - "epoch": 1.39209726443769, - "grad_norm": 1.2226042747497559, - "learning_rate": 3.0033786557904982e-06, - "loss": 0.45846253633499146, - "mean_token_accuracy": 0.8290432691574097, - "num_tokens": 16360145.0, - "step": 1832 - }, - { - "epoch": 1.3928571428571428, - "grad_norm": 2.0117406845092773, - "learning_rate": 3.001326979463545e-06, - "loss": 0.3837882876396179, - "mean_token_accuracy": 0.8941739797592163, - "num_tokens": 16366602.0, - "step": 1833 - }, - { - "epoch": 1.3936170212765957, - "grad_norm": 1.8419997692108154, - "learning_rate": 2.9992749512860177e-06, - "loss": 0.40777021646499634, - "mean_token_accuracy": 0.854655385017395, - "num_tokens": 16375611.0, - "step": 1834 - }, - { - "epoch": 1.3943768996960486, - "grad_norm": 1.9405122995376587, - "learning_rate": 2.9972225726981114e-06, - "loss": 0.46685922145843506, - "mean_token_accuracy": 0.8493201732635498, - "num_tokens": 16384878.0, - "step": 1835 - }, - { - "epoch": 1.3951367781155015, - "grad_norm": 1.2425674200057983, - "learning_rate": 2.995169845140264e-06, - "loss": 0.394692063331604, - "mean_token_accuracy": 0.851348876953125, - "num_tokens": 16404452.0, - "step": 1836 - }, - { - "epoch": 1.3958966565349544, - "grad_norm": 1.2215365171432495, - "learning_rate": 2.9931167700531575e-06, - "loss": 0.31412452459335327, - "mean_token_accuracy": 0.882760763168335, - "num_tokens": 16419358.0, - "step": 1837 - }, - { - "epoch": 1.3966565349544073, - "grad_norm": 1.912168025970459, - "learning_rate": 2.9910633488777198e-06, - "loss": 0.5065487623214722, - "mean_token_accuracy": 0.8524355292320251, - "num_tokens": 16430418.0, - "step": 1838 - }, - { - "epoch": 1.3974164133738602, - "grad_norm": 2.2173948287963867, - "learning_rate": 2.989009583055121e-06, - "loss": 0.4290938377380371, - "mean_token_accuracy": 0.8381836414337158, - "num_tokens": 16438267.0, - "step": 1839 - }, - { - "epoch": 1.3981762917933132, - "grad_norm": 1.8293484449386597, - "learning_rate": 2.9869554740267726e-06, - "loss": 0.41683733463287354, - "mean_token_accuracy": 0.8548779487609863, - "num_tokens": 16447382.0, - "step": 1840 - }, - { - "epoch": 1.398936170212766, - "grad_norm": 1.835015892982483, - "learning_rate": 2.9849010232343274e-06, - "loss": 0.5080599784851074, - "mean_token_accuracy": 0.8193596601486206, - "num_tokens": 16458541.0, - "step": 1841 - }, - { - "epoch": 1.3996960486322187, - "grad_norm": 2.031339645385742, - "learning_rate": 2.982846232119679e-06, - "loss": 0.5168882012367249, - "mean_token_accuracy": 0.8525956869125366, - "num_tokens": 16467747.0, - "step": 1842 - }, - { - "epoch": 1.4004559270516717, - "grad_norm": 1.5554167032241821, - "learning_rate": 2.9807911021249573e-06, - "loss": 0.35098958015441895, - "mean_token_accuracy": 0.888373851776123, - "num_tokens": 16479319.0, - "step": 1843 - }, - { - "epoch": 1.4012158054711246, - "grad_norm": 1.7183740139007568, - "learning_rate": 2.9787356346925327e-06, - "loss": 0.41263148188591003, - "mean_token_accuracy": 0.8478364944458008, - "num_tokens": 16489952.0, - "step": 1844 - }, - { - "epoch": 1.4019756838905775, - "grad_norm": 1.7743209600448608, - "learning_rate": 2.9766798312650112e-06, - "loss": 0.4211183190345764, - "mean_token_accuracy": 0.8641136884689331, - "num_tokens": 16498655.0, - "step": 1845 - }, - { - "epoch": 1.4027355623100304, - "grad_norm": 2.141300916671753, - "learning_rate": 2.9746236932852355e-06, - "loss": 0.49548980593681335, - "mean_token_accuracy": 0.8304252028465271, - "num_tokens": 16506348.0, - "step": 1846 - }, - { - "epoch": 1.4034954407294833, - "grad_norm": 2.341571807861328, - "learning_rate": 2.9725672221962804e-06, - "loss": 0.40804803371429443, - "mean_token_accuracy": 0.8545800447463989, - "num_tokens": 16513091.0, - "step": 1847 - }, - { - "epoch": 1.4042553191489362, - "grad_norm": 1.934428095817566, - "learning_rate": 2.9705104194414587e-06, - "loss": 0.30029812455177307, - "mean_token_accuracy": 0.9032052755355835, - "num_tokens": 16519455.0, - "step": 1848 - }, - { - "epoch": 1.405015197568389, - "grad_norm": 1.420804500579834, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.4384060502052307, - "mean_token_accuracy": 0.8465110063552856, - "num_tokens": 16533222.0, - "step": 1849 - }, - { - "epoch": 1.405775075987842, - "grad_norm": 2.1180737018585205, - "learning_rate": 2.9663958247086165e-06, - "loss": 0.3915565609931946, - "mean_token_accuracy": 0.8633890748023987, - "num_tokens": 16539489.0, - "step": 1850 - }, - { - "epoch": 1.4065349544072947, - "grad_norm": 1.408048152923584, - "learning_rate": 2.964338035618378e-06, - "loss": 0.46166157722473145, - "mean_token_accuracy": 0.8305013179779053, - "num_tokens": 16555785.0, - "step": 1851 - }, - { - "epoch": 1.4072948328267478, - "grad_norm": 1.3418530225753784, - "learning_rate": 2.9622799206378306e-06, - "loss": 0.5314373970031738, - "mean_token_accuracy": 0.81779944896698, - "num_tokens": 16578111.0, - "step": 1852 - }, - { - "epoch": 1.4080547112462005, - "grad_norm": 1.4634262323379517, - "learning_rate": 2.9602214812114414e-06, - "loss": 0.4859408140182495, - "mean_token_accuracy": 0.8261818885803223, - "num_tokens": 16591976.0, - "step": 1853 - }, - { - "epoch": 1.4088145896656534, - "grad_norm": 1.4840295314788818, - "learning_rate": 2.9581627187838997e-06, - "loss": 0.4079628586769104, - "mean_token_accuracy": 0.8549603223800659, - "num_tokens": 16603631.0, - "step": 1854 - }, - { - "epoch": 1.4095744680851063, - "grad_norm": 2.1474642753601074, - "learning_rate": 2.956103634800126e-06, - "loss": 0.32997995615005493, - "mean_token_accuracy": 0.8836915493011475, - "num_tokens": 16609875.0, - "step": 1855 - }, - { - "epoch": 1.4103343465045592, - "grad_norm": 2.627460241317749, - "learning_rate": 2.9540442307052643e-06, - "loss": 0.3229186236858368, - "mean_token_accuracy": 0.8852157592773438, - "num_tokens": 16614113.0, - "step": 1856 - }, - { - "epoch": 1.4110942249240122, - "grad_norm": 1.9569811820983887, - "learning_rate": 2.9519845079446824e-06, - "loss": 0.5057883858680725, - "mean_token_accuracy": 0.8585711717605591, - "num_tokens": 16624611.0, - "step": 1857 - }, - { - "epoch": 1.411854103343465, - "grad_norm": 2.0604090690612793, - "learning_rate": 2.949924467963975e-06, - "loss": 0.4681510329246521, - "mean_token_accuracy": 0.8390560150146484, - "num_tokens": 16632938.0, - "step": 1858 - }, - { - "epoch": 1.412613981762918, - "grad_norm": 2.5430450439453125, - "learning_rate": 2.9478641122089563e-06, - "loss": 0.3090999126434326, - "mean_token_accuracy": 0.8943990468978882, - "num_tokens": 16637135.0, - "step": 1859 - }, - { - "epoch": 1.4133738601823709, - "grad_norm": 1.3275387287139893, - "learning_rate": 2.945803442125663e-06, - "loss": 0.3592180013656616, - "mean_token_accuracy": 0.8678265810012817, - "num_tokens": 16650322.0, - "step": 1860 - }, - { - "epoch": 1.4141337386018238, - "grad_norm": 1.9070929288864136, - "learning_rate": 2.943742459160354e-06, - "loss": 0.5332518815994263, - "mean_token_accuracy": 0.8475706577301025, - "num_tokens": 16660240.0, - "step": 1861 - }, - { - "epoch": 1.4148936170212765, - "grad_norm": 2.8724546432495117, - "learning_rate": 2.9416811647595052e-06, - "loss": 0.5052884817123413, - "mean_token_accuracy": 0.8363175392150879, - "num_tokens": 16665481.0, - "step": 1862 - }, - { - "epoch": 1.4156534954407296, - "grad_norm": 4.203817844390869, - "learning_rate": 2.939619560369813e-06, - "loss": 0.546925961971283, - "mean_token_accuracy": 0.834044337272644, - "num_tokens": 16669615.0, - "step": 1863 - }, - { - "epoch": 1.4164133738601823, - "grad_norm": 1.6466281414031982, - "learning_rate": 2.9375576474381907e-06, - "loss": 0.3474533259868622, - "mean_token_accuracy": 0.8571163415908813, - "num_tokens": 16678893.0, - "step": 1864 - }, - { - "epoch": 1.4171732522796352, - "grad_norm": 1.8885842561721802, - "learning_rate": 2.9354954274117683e-06, - "loss": 0.3726021349430084, - "mean_token_accuracy": 0.8629094958305359, - "num_tokens": 16685939.0, - "step": 1865 - }, - { - "epoch": 1.417933130699088, - "grad_norm": 2.830599784851074, - "learning_rate": 2.9334329017378898e-06, - "loss": 0.4138668477535248, - "mean_token_accuracy": 0.8670746088027954, - "num_tokens": 16690012.0, - "step": 1866 - }, - { - "epoch": 1.418693009118541, - "grad_norm": 1.6838961839675903, - "learning_rate": 2.9313700718641167e-06, - "loss": 0.33954259753227234, - "mean_token_accuracy": 0.8660278916358948, - "num_tokens": 16700061.0, - "step": 1867 - }, - { - "epoch": 1.419452887537994, - "grad_norm": 2.8767011165618896, - "learning_rate": 2.9293069392382224e-06, - "loss": 0.4650302827358246, - "mean_token_accuracy": 0.8448452949523926, - "num_tokens": 16705072.0, - "step": 1868 - }, - { - "epoch": 1.4202127659574468, - "grad_norm": 1.5901305675506592, - "learning_rate": 2.927243505308192e-06, - "loss": 0.40838998556137085, - "mean_token_accuracy": 0.8560664653778076, - "num_tokens": 16714763.0, - "step": 1869 - }, - { - "epoch": 1.4209726443768997, - "grad_norm": 1.3293657302856445, - "learning_rate": 2.925179771522223e-06, - "loss": 0.34712862968444824, - "mean_token_accuracy": 0.8633697032928467, - "num_tokens": 16729575.0, - "step": 1870 - }, - { - "epoch": 1.4217325227963526, - "grad_norm": 1.7465964555740356, - "learning_rate": 2.9231157393287234e-06, - "loss": 0.48190903663635254, - "mean_token_accuracy": 0.8255834579467773, - "num_tokens": 16742529.0, - "step": 1871 - }, - { - "epoch": 1.4224924012158056, - "grad_norm": 1.865749716758728, - "learning_rate": 2.9210514101763116e-06, - "loss": 0.4912028908729553, - "mean_token_accuracy": 0.8309572339057922, - "num_tokens": 16753989.0, - "step": 1872 - }, - { - "epoch": 1.4232522796352582, - "grad_norm": 2.55780291557312, - "learning_rate": 2.9189867855138103e-06, - "loss": 0.4550635814666748, - "mean_token_accuracy": 0.8584091067314148, - "num_tokens": 16758906.0, - "step": 1873 - }, - { - "epoch": 1.4240121580547114, - "grad_norm": 1.867530107498169, - "learning_rate": 2.9169218667902562e-06, - "loss": 0.3524911105632782, - "mean_token_accuracy": 0.8715004920959473, - "num_tokens": 16765969.0, - "step": 1874 - }, - { - "epoch": 1.424772036474164, - "grad_norm": 1.8886862993240356, - "learning_rate": 2.9148566554548857e-06, - "loss": 0.37144535779953003, - "mean_token_accuracy": 0.8640961050987244, - "num_tokens": 16773935.0, - "step": 1875 - }, - { - "epoch": 1.425531914893617, - "grad_norm": 1.266065239906311, - "learning_rate": 2.912791152957145e-06, - "loss": 0.3341747522354126, - "mean_token_accuracy": 0.8929134607315063, - "num_tokens": 16787780.0, - "step": 1876 - }, - { - "epoch": 1.4262917933130699, - "grad_norm": 2.524888753890991, - "learning_rate": 2.9107253607466833e-06, - "loss": 0.33709171414375305, - "mean_token_accuracy": 0.8857531547546387, - "num_tokens": 16792753.0, - "step": 1877 - }, - { - "epoch": 1.4270516717325228, - "grad_norm": 1.9269018173217773, - "learning_rate": 2.908659280273354e-06, - "loss": 0.32599249482154846, - "mean_token_accuracy": 0.8777773380279541, - "num_tokens": 16799904.0, - "step": 1878 - }, - { - "epoch": 1.4278115501519757, - "grad_norm": 1.9844375848770142, - "learning_rate": 2.9065929129872097e-06, - "loss": 0.4086732268333435, - "mean_token_accuracy": 0.8505409955978394, - "num_tokens": 16807774.0, - "step": 1879 - }, - { - "epoch": 1.4285714285714286, - "grad_norm": 4.0958662033081055, - "learning_rate": 2.9045262603385073e-06, - "loss": 0.3838827610015869, - "mean_token_accuracy": 0.877601146697998, - "num_tokens": 16810908.0, - "step": 1880 - }, - { - "epoch": 1.4293313069908815, - "grad_norm": 1.7323768138885498, - "learning_rate": 2.902459323777704e-06, - "loss": 0.37459003925323486, - "mean_token_accuracy": 0.8655836582183838, - "num_tokens": 16819494.0, - "step": 1881 - }, - { - "epoch": 1.4300911854103344, - "grad_norm": 2.608043670654297, - "learning_rate": 2.900392104755455e-06, - "loss": 0.5798726677894592, - "mean_token_accuracy": 0.8382592797279358, - "num_tokens": 16827745.0, - "step": 1882 - }, - { - "epoch": 1.4308510638297873, - "grad_norm": 1.3262078762054443, - "learning_rate": 2.8983246047226137e-06, - "loss": 0.3724595904350281, - "mean_token_accuracy": 0.8651963472366333, - "num_tokens": 16844171.0, - "step": 1883 - }, - { - "epoch": 1.43161094224924, - "grad_norm": 1.7250545024871826, - "learning_rate": 2.8962568251302327e-06, - "loss": 0.3478979468345642, - "mean_token_accuracy": 0.8807886242866516, - "num_tokens": 16852838.0, - "step": 1884 - }, - { - "epoch": 1.4323708206686931, - "grad_norm": 2.114525318145752, - "learning_rate": 2.8941887674295573e-06, - "loss": 0.5156140327453613, - "mean_token_accuracy": 0.825178861618042, - "num_tokens": 16861087.0, - "step": 1885 - }, - { - "epoch": 1.4331306990881458, - "grad_norm": 2.400829792022705, - "learning_rate": 2.892120433072031e-06, - "loss": 0.2807392477989197, - "mean_token_accuracy": 0.8907361030578613, - "num_tokens": 16866557.0, - "step": 1886 - }, - { - "epoch": 1.4338905775075987, - "grad_norm": 2.490880012512207, - "learning_rate": 2.8900518235092908e-06, - "loss": 0.2615952491760254, - "mean_token_accuracy": 0.9152894020080566, - "num_tokens": 16871357.0, - "step": 1887 - }, - { - "epoch": 1.4346504559270516, - "grad_norm": 1.9058431386947632, - "learning_rate": 2.887982940193165e-06, - "loss": 0.43623363971710205, - "mean_token_accuracy": 0.84696364402771, - "num_tokens": 16879016.0, - "step": 1888 - }, - { - "epoch": 1.4354103343465046, - "grad_norm": 1.4520210027694702, - "learning_rate": 2.8859137845756785e-06, - "loss": 0.3961856961250305, - "mean_token_accuracy": 0.8518897294998169, - "num_tokens": 16892254.0, - "step": 1889 - }, - { - "epoch": 1.4361702127659575, - "grad_norm": 2.500274896621704, - "learning_rate": 2.8838443581090415e-06, - "loss": 0.41457289457321167, - "mean_token_accuracy": 0.8751448392868042, - "num_tokens": 16897156.0, - "step": 1890 - }, - { - "epoch": 1.4369300911854104, - "grad_norm": 2.9312057495117188, - "learning_rate": 2.8817746622456585e-06, - "loss": 0.45875269174575806, - "mean_token_accuracy": 0.8411039113998413, - "num_tokens": 16902291.0, - "step": 1891 - }, - { - "epoch": 1.4376899696048633, - "grad_norm": 2.367419481277466, - "learning_rate": 2.879704698438121e-06, - "loss": 0.3643629848957062, - "mean_token_accuracy": 0.8771071434020996, - "num_tokens": 16908128.0, - "step": 1892 - }, - { - "epoch": 1.4384498480243162, - "grad_norm": 1.9907705783843994, - "learning_rate": 2.8776344681392106e-06, - "loss": 0.3206835389137268, - "mean_token_accuracy": 0.879996657371521, - "num_tokens": 16914918.0, - "step": 1893 - }, - { - "epoch": 1.439209726443769, - "grad_norm": 3.536956310272217, - "learning_rate": 2.875563972801893e-06, - "loss": 0.3640141785144806, - "mean_token_accuracy": 0.8814959526062012, - "num_tokens": 16918187.0, - "step": 1894 - }, - { - "epoch": 1.4399696048632218, - "grad_norm": 1.3451156616210938, - "learning_rate": 2.8734932138793226e-06, - "loss": 0.3427346348762512, - "mean_token_accuracy": 0.8835382461547852, - "num_tokens": 16931135.0, - "step": 1895 - }, - { - "epoch": 1.4407294832826747, - "grad_norm": 2.0735955238342285, - "learning_rate": 2.871422192824837e-06, - "loss": 0.4265315532684326, - "mean_token_accuracy": 0.8452677726745605, - "num_tokens": 16937995.0, - "step": 1896 - }, - { - "epoch": 1.4414893617021276, - "grad_norm": 1.5124932527542114, - "learning_rate": 2.8693509110919597e-06, - "loss": 0.497121661901474, - "mean_token_accuracy": 0.815092921257019, - "num_tokens": 16952743.0, - "step": 1897 - }, - { - "epoch": 1.4422492401215805, - "grad_norm": 3.716669797897339, - "learning_rate": 2.867279370134395e-06, - "loss": 0.5452651381492615, - "mean_token_accuracy": 0.8150380849838257, - "num_tokens": 16956797.0, - "step": 1898 - }, - { - "epoch": 1.4430091185410334, - "grad_norm": 1.3571398258209229, - "learning_rate": 2.8652075714060296e-06, - "loss": 0.4249724745750427, - "mean_token_accuracy": 0.8675867915153503, - "num_tokens": 16974494.0, - "step": 1899 - }, - { - "epoch": 1.4437689969604863, - "grad_norm": 2.310673475265503, - "learning_rate": 2.863135516360932e-06, - "loss": 0.39368677139282227, - "mean_token_accuracy": 0.878392219543457, - "num_tokens": 16980612.0, - "step": 1900 - }, - { - "epoch": 1.4445288753799392, - "grad_norm": 1.9025533199310303, - "learning_rate": 2.8610632064533517e-06, - "loss": 0.4786127805709839, - "mean_token_accuracy": 0.8720556497573853, - "num_tokens": 16992262.0, - "step": 1901 - }, - { - "epoch": 1.4452887537993921, - "grad_norm": 2.528564453125, - "learning_rate": 2.8589906431377133e-06, - "loss": 0.4223094582557678, - "mean_token_accuracy": 0.8513246178627014, - "num_tokens": 16997717.0, - "step": 1902 - }, - { - "epoch": 1.446048632218845, - "grad_norm": 1.010425329208374, - "learning_rate": 2.8569178278686222e-06, - "loss": 0.3908255696296692, - "mean_token_accuracy": 0.8620463609695435, - "num_tokens": 17020903.0, - "step": 1903 - }, - { - "epoch": 1.4468085106382977, - "grad_norm": 1.5760232210159302, - "learning_rate": 2.8548447621008614e-06, - "loss": 0.4134044051170349, - "mean_token_accuracy": 0.8472093343734741, - "num_tokens": 17035250.0, - "step": 1904 - }, - { - "epoch": 1.4475683890577509, - "grad_norm": 2.0668535232543945, - "learning_rate": 2.8527714472893866e-06, - "loss": 0.44095730781555176, - "mean_token_accuracy": 0.881983757019043, - "num_tokens": 17042170.0, - "step": 1905 - }, - { - "epoch": 1.4483282674772036, - "grad_norm": 1.1620599031448364, - "learning_rate": 2.85069788488933e-06, - "loss": 0.3607163429260254, - "mean_token_accuracy": 0.8684282898902893, - "num_tokens": 17061937.0, - "step": 1906 - }, - { - "epoch": 1.4490881458966565, - "grad_norm": 2.1316568851470947, - "learning_rate": 2.8486240763559984e-06, - "loss": 0.3478124141693115, - "mean_token_accuracy": 0.8772403001785278, - "num_tokens": 17068628.0, - "step": 1907 - }, - { - "epoch": 1.4498480243161094, - "grad_norm": 2.4756391048431396, - "learning_rate": 2.8465500231448707e-06, - "loss": 0.46441152691841125, - "mean_token_accuracy": 0.8436450958251953, - "num_tokens": 17075495.0, - "step": 1908 - }, - { - "epoch": 1.4506079027355623, - "grad_norm": 2.249720573425293, - "learning_rate": 2.844475726711595e-06, - "loss": 0.41565513610839844, - "mean_token_accuracy": 0.8525094985961914, - "num_tokens": 17080940.0, - "step": 1909 - }, - { - "epoch": 1.4513677811550152, - "grad_norm": 2.3081841468811035, - "learning_rate": 2.8424011885119956e-06, - "loss": 0.49903199076652527, - "mean_token_accuracy": 0.8212426900863647, - "num_tokens": 17092024.0, - "step": 1910 - }, - { - "epoch": 1.452127659574468, - "grad_norm": 1.2929959297180176, - "learning_rate": 2.8403264100020613e-06, - "loss": 0.47038257122039795, - "mean_token_accuracy": 0.8319816589355469, - "num_tokens": 17108840.0, - "step": 1911 - }, - { - "epoch": 1.452887537993921, - "grad_norm": 1.6476463079452515, - "learning_rate": 2.8382513926379508e-06, - "loss": 0.42287829518318176, - "mean_token_accuracy": 0.8555682897567749, - "num_tokens": 17119704.0, - "step": 1912 - }, - { - "epoch": 1.453647416413374, - "grad_norm": 1.759998083114624, - "learning_rate": 2.836176137875993e-06, - "loss": 0.40904951095581055, - "mean_token_accuracy": 0.8698266744613647, - "num_tokens": 17130676.0, - "step": 1913 - }, - { - "epoch": 1.4544072948328268, - "grad_norm": 1.510909914970398, - "learning_rate": 2.8341006471726817e-06, - "loss": 0.47834792733192444, - "mean_token_accuracy": 0.8335825204849243, - "num_tokens": 17146304.0, - "step": 1914 - }, - { - "epoch": 1.4551671732522795, - "grad_norm": 3.538071632385254, - "learning_rate": 2.832024921984674e-06, - "loss": 0.34059035778045654, - "mean_token_accuracy": 0.8769031763076782, - "num_tokens": 17150458.0, - "step": 1915 - }, - { - "epoch": 1.4559270516717326, - "grad_norm": 2.3368659019470215, - "learning_rate": 2.8299489637687955e-06, - "loss": 0.43068382143974304, - "mean_token_accuracy": 0.845360517501831, - "num_tokens": 17157368.0, - "step": 1916 - }, - { - "epoch": 1.4566869300911853, - "grad_norm": 1.8720396757125854, - "learning_rate": 2.8278727739820334e-06, - "loss": 0.37013399600982666, - "mean_token_accuracy": 0.854241132736206, - "num_tokens": 17166325.0, - "step": 1917 - }, - { - "epoch": 1.4574468085106382, - "grad_norm": 1.6706892251968384, - "learning_rate": 2.825796354081537e-06, - "loss": 0.5397020578384399, - "mean_token_accuracy": 0.8309713006019592, - "num_tokens": 17178920.0, - "step": 1918 - }, - { - "epoch": 1.4582066869300911, - "grad_norm": 2.729210376739502, - "learning_rate": 2.8237197055246175e-06, - "loss": 0.25137859582901, - "mean_token_accuracy": 0.9148792028427124, - "num_tokens": 17183107.0, - "step": 1919 - }, - { - "epoch": 1.458966565349544, - "grad_norm": 3.023500680923462, - "learning_rate": 2.821642829768748e-06, - "loss": 0.43312495946884155, - "mean_token_accuracy": 0.8481811285018921, - "num_tokens": 17187853.0, - "step": 1920 - }, - { - "epoch": 1.459726443768997, - "grad_norm": 1.8108519315719604, - "learning_rate": 2.8195657282715595e-06, - "loss": 0.5101792216300964, - "mean_token_accuracy": 0.8315553069114685, - "num_tokens": 17199247.0, - "step": 1921 - }, - { - "epoch": 1.4604863221884499, - "grad_norm": 2.0262672901153564, - "learning_rate": 2.817488402490841e-06, - "loss": 0.4449934959411621, - "mean_token_accuracy": 0.8634527325630188, - "num_tokens": 17206348.0, - "step": 1922 - }, - { - "epoch": 1.4612462006079028, - "grad_norm": 2.6163926124572754, - "learning_rate": 2.8154108538845405e-06, - "loss": 0.43052345514297485, - "mean_token_accuracy": 0.8375401496887207, - "num_tokens": 17211702.0, - "step": 1923 - }, - { - "epoch": 1.4620060790273557, - "grad_norm": 2.0854408740997314, - "learning_rate": 2.813333083910761e-06, - "loss": 0.5011380910873413, - "mean_token_accuracy": 0.8359915018081665, - "num_tokens": 17219096.0, - "step": 1924 - }, - { - "epoch": 1.4627659574468086, - "grad_norm": 2.2081687450408936, - "learning_rate": 2.8112550940277615e-06, - "loss": 0.5239193439483643, - "mean_token_accuracy": 0.8499593734741211, - "num_tokens": 17229266.0, - "step": 1925 - }, - { - "epoch": 1.4635258358662613, - "grad_norm": 1.798343539237976, - "learning_rate": 2.809176885693956e-06, - "loss": 0.4515029191970825, - "mean_token_accuracy": 0.8400485515594482, - "num_tokens": 17239280.0, - "step": 1926 - }, - { - "epoch": 1.4642857142857144, - "grad_norm": 1.897887945175171, - "learning_rate": 2.807098460367911e-06, - "loss": 0.35935714840888977, - "mean_token_accuracy": 0.8776072263717651, - "num_tokens": 17247132.0, - "step": 1927 - }, - { - "epoch": 1.465045592705167, - "grad_norm": 2.705836296081543, - "learning_rate": 2.8050198195083445e-06, - "loss": 0.3728443682193756, - "mean_token_accuracy": 0.8649885654449463, - "num_tokens": 17251865.0, - "step": 1928 - }, - { - "epoch": 1.46580547112462, - "grad_norm": 1.841178059577942, - "learning_rate": 2.802940964574127e-06, - "loss": 0.40604841709136963, - "mean_token_accuracy": 0.8537783622741699, - "num_tokens": 17260163.0, - "step": 1929 - }, - { - "epoch": 1.466565349544073, - "grad_norm": 2.7393605709075928, - "learning_rate": 2.800861897024279e-06, - "loss": 0.39346879720687866, - "mean_token_accuracy": 0.8628787994384766, - "num_tokens": 17264876.0, - "step": 1930 - }, - { - "epoch": 1.4673252279635258, - "grad_norm": 1.84367835521698, - "learning_rate": 2.798782618317971e-06, - "loss": 0.37411895394325256, - "mean_token_accuracy": 0.8605265617370605, - "num_tokens": 17273049.0, - "step": 1931 - }, - { - "epoch": 1.4680851063829787, - "grad_norm": 1.6546733379364014, - "learning_rate": 2.796703129914519e-06, - "loss": 0.4997844099998474, - "mean_token_accuracy": 0.8267433643341064, - "num_tokens": 17285074.0, - "step": 1932 - }, - { - "epoch": 1.4688449848024316, - "grad_norm": 2.2749221324920654, - "learning_rate": 2.79462343327339e-06, - "loss": 0.35453367233276367, - "mean_token_accuracy": 0.8746850490570068, - "num_tokens": 17290273.0, - "step": 1933 - }, - { - "epoch": 1.4696048632218845, - "grad_norm": 1.7142518758773804, - "learning_rate": 2.7925435298541944e-06, - "loss": 0.345878541469574, - "mean_token_accuracy": 0.8600981831550598, - "num_tokens": 17301045.0, - "step": 1934 - }, - { - "epoch": 1.4703647416413375, - "grad_norm": 3.163342237472534, - "learning_rate": 2.7904634211166877e-06, - "loss": 0.4356975853443146, - "mean_token_accuracy": 0.8460350036621094, - "num_tokens": 17305108.0, - "step": 1935 - }, - { - "epoch": 1.4711246200607904, - "grad_norm": 1.6377612352371216, - "learning_rate": 2.7883831085207707e-06, - "loss": 0.4459729790687561, - "mean_token_accuracy": 0.8463394641876221, - "num_tokens": 17315479.0, - "step": 1936 - }, - { - "epoch": 1.471884498480243, - "grad_norm": 1.865268588066101, - "learning_rate": 2.7863025935264876e-06, - "loss": 0.394723117351532, - "mean_token_accuracy": 0.864177942276001, - "num_tokens": 17324795.0, - "step": 1937 - }, - { - "epoch": 1.4726443768996962, - "grad_norm": 1.241937518119812, - "learning_rate": 2.784221877594024e-06, - "loss": 0.2752220630645752, - "mean_token_accuracy": 0.8998259902000427, - "num_tokens": 17338000.0, - "step": 1938 - }, - { - "epoch": 1.4734042553191489, - "grad_norm": 1.8013651371002197, - "learning_rate": 2.7821409621837042e-06, - "loss": 0.4251005947589874, - "mean_token_accuracy": 0.8518919348716736, - "num_tokens": 17347351.0, - "step": 1939 - }, - { - "epoch": 1.4741641337386018, - "grad_norm": 1.2902207374572754, - "learning_rate": 2.7800598487559976e-06, - "loss": 0.3640727400779724, - "mean_token_accuracy": 0.8592870235443115, - "num_tokens": 17362335.0, - "step": 1940 - }, - { - "epoch": 1.4749240121580547, - "grad_norm": 2.5427513122558594, - "learning_rate": 2.777978538771508e-06, - "loss": 0.38166797161102295, - "mean_token_accuracy": 0.8653234839439392, - "num_tokens": 17367733.0, - "step": 1941 - }, - { - "epoch": 1.4756838905775076, - "grad_norm": 1.7793641090393066, - "learning_rate": 2.7758970336909795e-06, - "loss": 0.3113783895969391, - "mean_token_accuracy": 0.8812868595123291, - "num_tokens": 17375267.0, - "step": 1942 - }, - { - "epoch": 1.4764437689969605, - "grad_norm": 3.4031741619110107, - "learning_rate": 2.7738153349752923e-06, - "loss": 0.4800986647605896, - "mean_token_accuracy": 0.8336698412895203, - "num_tokens": 17379549.0, - "step": 1943 - }, - { - "epoch": 1.4772036474164134, - "grad_norm": 1.3451651334762573, - "learning_rate": 2.7717334440854634e-06, - "loss": 0.3115345239639282, - "mean_token_accuracy": 0.908623218536377, - "num_tokens": 17394455.0, - "step": 1944 - }, - { - "epoch": 1.4779635258358663, - "grad_norm": 1.980919599533081, - "learning_rate": 2.7696513624826422e-06, - "loss": 0.391154944896698, - "mean_token_accuracy": 0.8650267720222473, - "num_tokens": 17401931.0, - "step": 1945 - }, - { - "epoch": 1.4787234042553192, - "grad_norm": 1.0118765830993652, - "learning_rate": 2.7675690916281158e-06, - "loss": 0.3157956600189209, - "mean_token_accuracy": 0.8827471733093262, - "num_tokens": 17424144.0, - "step": 1946 - }, - { - "epoch": 1.4794832826747721, - "grad_norm": 1.579654335975647, - "learning_rate": 2.7654866329833e-06, - "loss": 0.4578486382961273, - "mean_token_accuracy": 0.8361750245094299, - "num_tokens": 17435769.0, - "step": 1947 - }, - { - "epoch": 1.4802431610942248, - "grad_norm": 1.7706717252731323, - "learning_rate": 2.763403988009746e-06, - "loss": 0.3564416170120239, - "mean_token_accuracy": 0.8689201474189758, - "num_tokens": 17444088.0, - "step": 1948 - }, - { - "epoch": 1.4810030395136777, - "grad_norm": 1.2264244556427002, - "learning_rate": 2.761321158169134e-06, - "loss": 0.30763837695121765, - "mean_token_accuracy": 0.8960219621658325, - "num_tokens": 17458096.0, - "step": 1949 - }, - { - "epoch": 1.4817629179331306, - "grad_norm": 1.214431881904602, - "learning_rate": 2.759238144923274e-06, - "loss": 0.49099457263946533, - "mean_token_accuracy": 0.8279136419296265, - "num_tokens": 17481062.0, - "step": 1950 - }, - { - "epoch": 1.4825227963525835, - "grad_norm": 1.593892216682434, - "learning_rate": 2.7571549497341044e-06, - "loss": 0.3745320737361908, - "mean_token_accuracy": 0.8690779209136963, - "num_tokens": 17490874.0, - "step": 1951 - }, - { - "epoch": 1.4832826747720365, - "grad_norm": 2.409924268722534, - "learning_rate": 2.755071574063692e-06, - "loss": 0.4310247600078583, - "mean_token_accuracy": 0.8521159291267395, - "num_tokens": 17496942.0, - "step": 1952 - }, - { - "epoch": 1.4840425531914894, - "grad_norm": 1.2557463645935059, - "learning_rate": 2.7529880193742297e-06, - "loss": 0.34304720163345337, - "mean_token_accuracy": 0.8748183250427246, - "num_tokens": 17514391.0, - "step": 1953 - }, - { - "epoch": 1.4848024316109423, - "grad_norm": 1.17310631275177, - "learning_rate": 2.7509042871280373e-06, - "loss": 0.3835817277431488, - "mean_token_accuracy": 0.8853274583816528, - "num_tokens": 17533289.0, - "step": 1954 - }, - { - "epoch": 1.4855623100303952, - "grad_norm": 1.5261479616165161, - "learning_rate": 2.748820378787558e-06, - "loss": 0.4799988865852356, - "mean_token_accuracy": 0.8252149820327759, - "num_tokens": 17544118.0, - "step": 1955 - }, - { - "epoch": 1.486322188449848, - "grad_norm": 2.030930757522583, - "learning_rate": 2.7467362958153585e-06, - "loss": 0.35690805315971375, - "mean_token_accuracy": 0.8959587216377258, - "num_tokens": 17550431.0, - "step": 1956 - }, - { - "epoch": 1.4870820668693008, - "grad_norm": 2.376520872116089, - "learning_rate": 2.7446520396741293e-06, - "loss": 0.262234091758728, - "mean_token_accuracy": 0.9054547548294067, - "num_tokens": 17554853.0, - "step": 1957 - }, - { - "epoch": 1.487841945288754, - "grad_norm": 1.6944479942321777, - "learning_rate": 2.742567611826681e-06, - "loss": 0.529259979724884, - "mean_token_accuracy": 0.8195339441299438, - "num_tokens": 17568016.0, - "step": 1958 - }, - { - "epoch": 1.4886018237082066, - "grad_norm": 2.833029270172119, - "learning_rate": 2.7404830137359445e-06, - "loss": 0.30229634046554565, - "mean_token_accuracy": 0.8933001756668091, - "num_tokens": 17572587.0, - "step": 1959 - }, - { - "epoch": 1.4893617021276595, - "grad_norm": 1.7040144205093384, - "learning_rate": 2.7383982468649715e-06, - "loss": 0.3166356682777405, - "mean_token_accuracy": 0.8871906399726868, - "num_tokens": 17580966.0, - "step": 1960 - }, - { - "epoch": 1.4901215805471124, - "grad_norm": 1.7539052963256836, - "learning_rate": 2.7363133126769326e-06, - "loss": 0.4231064021587372, - "mean_token_accuracy": 0.8708304166793823, - "num_tokens": 17590907.0, - "step": 1961 - }, - { - "epoch": 1.4908814589665653, - "grad_norm": 1.6198650598526, - "learning_rate": 2.7342282126351145e-06, - "loss": 0.4198967218399048, - "mean_token_accuracy": 0.8723280429840088, - "num_tokens": 17604291.0, - "step": 1962 - }, - { - "epoch": 1.4916413373860182, - "grad_norm": 1.8437711000442505, - "learning_rate": 2.73214294820292e-06, - "loss": 0.38923323154449463, - "mean_token_accuracy": 0.8697006106376648, - "num_tokens": 17612291.0, - "step": 1963 - }, - { - "epoch": 1.4924012158054711, - "grad_norm": 1.1129369735717773, - "learning_rate": 2.7300575208438684e-06, - "loss": 0.3107512593269348, - "mean_token_accuracy": 0.878618597984314, - "num_tokens": 17630073.0, - "step": 1964 - }, - { - "epoch": 1.493161094224924, - "grad_norm": 3.0210442543029785, - "learning_rate": 2.7279719320215924e-06, - "loss": 0.4630751609802246, - "mean_token_accuracy": 0.8567075729370117, - "num_tokens": 17634758.0, - "step": 1965 - }, - { - "epoch": 1.493920972644377, - "grad_norm": 2.8825972080230713, - "learning_rate": 2.725886183199839e-06, - "loss": 0.35351765155792236, - "mean_token_accuracy": 0.8711981773376465, - "num_tokens": 17639613.0, - "step": 1966 - }, - { - "epoch": 1.4946808510638299, - "grad_norm": 2.111238718032837, - "learning_rate": 2.723800275842468e-06, - "loss": 0.3529569208621979, - "mean_token_accuracy": 0.8679244518280029, - "num_tokens": 17645308.0, - "step": 1967 - }, - { - "epoch": 1.4954407294832825, - "grad_norm": 2.080509901046753, - "learning_rate": 2.7217142114134466e-06, - "loss": 0.43321219086647034, - "mean_token_accuracy": 0.8848220109939575, - "num_tokens": 17652292.0, - "step": 1968 - }, - { - "epoch": 1.4962006079027357, - "grad_norm": 2.8686363697052, - "learning_rate": 2.7196279913768587e-06, - "loss": 0.417035311460495, - "mean_token_accuracy": 0.8724601864814758, - "num_tokens": 17656908.0, - "step": 1969 - }, - { - "epoch": 1.4969604863221884, - "grad_norm": 3.294193744659424, - "learning_rate": 2.717541617196891e-06, - "loss": 0.3551934063434601, - "mean_token_accuracy": 0.8838565349578857, - "num_tokens": 17660590.0, - "step": 1970 - }, - { - "epoch": 1.4977203647416413, - "grad_norm": 1.766292929649353, - "learning_rate": 2.7154550903378425e-06, - "loss": 0.36521971225738525, - "mean_token_accuracy": 0.8810199499130249, - "num_tokens": 17668214.0, - "step": 1971 - }, - { - "epoch": 1.4984802431610942, - "grad_norm": 1.2127676010131836, - "learning_rate": 2.713368412264118e-06, - "loss": 0.35184425115585327, - "mean_token_accuracy": 0.8672580718994141, - "num_tokens": 17684736.0, - "step": 1972 - }, - { - "epoch": 1.499240121580547, - "grad_norm": 2.268256664276123, - "learning_rate": 2.711281584440228e-06, - "loss": 0.40115267038345337, - "mean_token_accuracy": 0.8517841100692749, - "num_tokens": 17691510.0, - "step": 1973 - }, - { - "epoch": 1.5, - "grad_norm": 2.7196054458618164, - "learning_rate": 2.70919460833079e-06, - "loss": 0.3819037675857544, - "mean_token_accuracy": 0.8765411376953125, - "num_tokens": 17696179.0, - "step": 1974 - }, - { - "epoch": 1.500759878419453, - "grad_norm": 2.969406843185425, - "learning_rate": 2.7071074854005206e-06, - "loss": 0.3922455608844757, - "mean_token_accuracy": 0.8796037435531616, - "num_tokens": 17700597.0, - "step": 1975 - }, - { - "epoch": 1.5015197568389058, - "grad_norm": 2.2965853214263916, - "learning_rate": 2.705020217114248e-06, - "loss": 0.5433666110038757, - "mean_token_accuracy": 0.809639036655426, - "num_tokens": 17708895.0, - "step": 1976 - }, - { - "epoch": 1.5022796352583585, - "grad_norm": 1.5584394931793213, - "learning_rate": 2.7029328049368942e-06, - "loss": 0.4736343324184418, - "mean_token_accuracy": 0.8197190761566162, - "num_tokens": 17725202.0, - "step": 1977 - }, - { - "epoch": 1.5030395136778116, - "grad_norm": 1.3903142213821411, - "learning_rate": 2.700845250333486e-06, - "loss": 0.4471571445465088, - "mean_token_accuracy": 0.839043140411377, - "num_tokens": 17742835.0, - "step": 1978 - }, - { - "epoch": 1.5037993920972643, - "grad_norm": 3.080716609954834, - "learning_rate": 2.69875755476915e-06, - "loss": 0.45760005712509155, - "mean_token_accuracy": 0.8366328477859497, - "num_tokens": 17747324.0, - "step": 1979 - }, - { - "epoch": 1.5045592705167175, - "grad_norm": 1.0150405168533325, - "learning_rate": 2.696669719709111e-06, - "loss": 0.33638954162597656, - "mean_token_accuracy": 0.8591676354408264, - "num_tokens": 17765565.0, - "step": 1980 - }, - { - "epoch": 1.5053191489361701, - "grad_norm": 2.402927875518799, - "learning_rate": 2.694581746618691e-06, - "loss": 0.4086601436138153, - "mean_token_accuracy": 0.8769911527633667, - "num_tokens": 17771275.0, - "step": 1981 - }, - { - "epoch": 1.506079027355623, - "grad_norm": 2.030583381652832, - "learning_rate": 2.6924936369633126e-06, - "loss": 0.5115457773208618, - "mean_token_accuracy": 0.8054746389389038, - "num_tokens": 17779999.0, - "step": 1982 - }, - { - "epoch": 1.506838905775076, - "grad_norm": 2.575199604034424, - "learning_rate": 2.6904053922084893e-06, - "loss": 0.363183856010437, - "mean_token_accuracy": 0.8716042637825012, - "num_tokens": 17785473.0, - "step": 1983 - }, - { - "epoch": 1.5075987841945289, - "grad_norm": 1.8497480154037476, - "learning_rate": 2.688317013819832e-06, - "loss": 0.4254384934902191, - "mean_token_accuracy": 0.8549597263336182, - "num_tokens": 17793812.0, - "step": 1984 - }, - { - "epoch": 1.5083586626139818, - "grad_norm": 1.7786511182785034, - "learning_rate": 2.686228503263045e-06, - "loss": 0.33400774002075195, - "mean_token_accuracy": 0.9027615189552307, - "num_tokens": 17801783.0, - "step": 1985 - }, - { - "epoch": 1.5091185410334347, - "grad_norm": 1.8365367650985718, - "learning_rate": 2.684139862003927e-06, - "loss": 0.35765063762664795, - "mean_token_accuracy": 0.8663736581802368, - "num_tokens": 17809562.0, - "step": 1986 - }, - { - "epoch": 1.5098784194528876, - "grad_norm": 1.8817477226257324, - "learning_rate": 2.682051091508365e-06, - "loss": 0.4627506732940674, - "mean_token_accuracy": 0.8358862400054932, - "num_tokens": 17819094.0, - "step": 1987 - }, - { - "epoch": 1.5106382978723403, - "grad_norm": 2.221547842025757, - "learning_rate": 2.679962193242338e-06, - "loss": 0.577020525932312, - "mean_token_accuracy": 0.80013108253479, - "num_tokens": 17826666.0, - "step": 1988 - }, - { - "epoch": 1.5113981762917934, - "grad_norm": 2.6618270874023438, - "learning_rate": 2.6778731686719177e-06, - "loss": 0.44632256031036377, - "mean_token_accuracy": 0.8611289262771606, - "num_tokens": 17833172.0, - "step": 1989 - }, - { - "epoch": 1.512158054711246, - "grad_norm": 2.9495689868927, - "learning_rate": 2.67578401926326e-06, - "loss": 0.3482511043548584, - "mean_token_accuracy": 0.8703314661979675, - "num_tokens": 17837220.0, - "step": 1990 - }, - { - "epoch": 1.5129179331306992, - "grad_norm": 2.0943644046783447, - "learning_rate": 2.6736947464826107e-06, - "loss": 0.2354314625263214, - "mean_token_accuracy": 0.9137634038925171, - "num_tokens": 17842712.0, - "step": 1991 - }, - { - "epoch": 1.513677811550152, - "grad_norm": 1.1303033828735352, - "learning_rate": 2.671605351796302e-06, - "loss": 0.3624761700630188, - "mean_token_accuracy": 0.8769594430923462, - "num_tokens": 17860902.0, - "step": 1992 - }, - { - "epoch": 1.5144376899696048, - "grad_norm": 2.8921146392822266, - "learning_rate": 2.6695158366707526e-06, - "loss": 0.2517220973968506, - "mean_token_accuracy": 0.8974182605743408, - "num_tokens": 17865160.0, - "step": 1993 - }, - { - "epoch": 1.5151975683890577, - "grad_norm": 2.320587158203125, - "learning_rate": 2.667426202572463e-06, - "loss": 0.4589889943599701, - "mean_token_accuracy": 0.8379613161087036, - "num_tokens": 17871994.0, - "step": 1994 - }, - { - "epoch": 1.5159574468085106, - "grad_norm": 1.1407674551010132, - "learning_rate": 2.665336450968019e-06, - "loss": 0.34412115812301636, - "mean_token_accuracy": 0.8776306509971619, - "num_tokens": 17889941.0, - "step": 1995 - }, - { - "epoch": 1.5167173252279635, - "grad_norm": 2.069814920425415, - "learning_rate": 2.6632465833240895e-06, - "loss": 0.47524404525756836, - "mean_token_accuracy": 0.830310046672821, - "num_tokens": 17898447.0, - "step": 1996 - }, - { - "epoch": 1.5174772036474165, - "grad_norm": 1.822415828704834, - "learning_rate": 2.661156601107424e-06, - "loss": 0.4541318416595459, - "mean_token_accuracy": 0.8856616020202637, - "num_tokens": 17908729.0, - "step": 1997 - }, - { - "epoch": 1.5182370820668694, - "grad_norm": 2.851428985595703, - "learning_rate": 2.659066505784852e-06, - "loss": 0.41761666536331177, - "mean_token_accuracy": 0.8710572719573975, - "num_tokens": 17913860.0, - "step": 1998 - }, - { - "epoch": 1.518996960486322, - "grad_norm": 1.8483710289001465, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.45517268776893616, - "mean_token_accuracy": 0.8411115407943726, - "num_tokens": 17923497.0, - "step": 1999 - }, - { - "epoch": 1.5197568389057752, - "grad_norm": 1.9044219255447388, - "learning_rate": 2.654885981689706e-06, - "loss": 0.42533189058303833, - "mean_token_accuracy": 0.8597894906997681, - "num_tokens": 17932670.0, - "step": 2000 - }, - { - "epoch": 1.5205167173252279, - "grad_norm": 1.8170348405838013, - "learning_rate": 2.652795555851184e-06, - "loss": 0.4009692072868347, - "mean_token_accuracy": 0.8553036451339722, - "num_tokens": 17941616.0, - "step": 2001 - }, - { - "epoch": 1.521276595744681, - "grad_norm": 1.4704090356826782, - "learning_rate": 2.6507050227748595e-06, - "loss": 0.3732764720916748, - "mean_token_accuracy": 0.8788566589355469, - "num_tokens": 17957187.0, - "step": 2002 - }, - { - "epoch": 1.5220364741641337, - "grad_norm": 1.6681534051895142, - "learning_rate": 2.648614383927949e-06, - "loss": 0.341326504945755, - "mean_token_accuracy": 0.874875545501709, - "num_tokens": 17966668.0, - "step": 2003 - }, - { - "epoch": 1.5227963525835866, - "grad_norm": 1.8578619956970215, - "learning_rate": 2.646523640777741e-06, - "loss": 0.3937399983406067, - "mean_token_accuracy": 0.8656851053237915, - "num_tokens": 17976194.0, - "step": 2004 - }, - { - "epoch": 1.5235562310030395, - "grad_norm": 1.7520431280136108, - "learning_rate": 2.6444327947916037e-06, - "loss": 0.3392767906188965, - "mean_token_accuracy": 0.8799679279327393, - "num_tokens": 17984492.0, - "step": 2005 - }, - { - "epoch": 1.5243161094224924, - "grad_norm": 3.4649906158447266, - "learning_rate": 2.6423418474369707e-06, - "loss": 0.3451516032218933, - "mean_token_accuracy": 0.8753262758255005, - "num_tokens": 17988240.0, - "step": 2006 - }, - { - "epoch": 1.5250759878419453, - "grad_norm": 1.8037052154541016, - "learning_rate": 2.64025080018135e-06, - "loss": 0.34428173303604126, - "mean_token_accuracy": 0.8719067573547363, - "num_tokens": 17996644.0, - "step": 2007 - }, - { - "epoch": 1.5258358662613982, - "grad_norm": 1.743722677230835, - "learning_rate": 2.6381596544923184e-06, - "loss": 0.4446655213832855, - "mean_token_accuracy": 0.8612518906593323, - "num_tokens": 18005109.0, - "step": 2008 - }, - { - "epoch": 1.5265957446808511, - "grad_norm": 1.3357981443405151, - "learning_rate": 2.636068411837523e-06, - "loss": 0.38647788763046265, - "mean_token_accuracy": 0.858294665813446, - "num_tokens": 18018193.0, - "step": 2009 - }, - { - "epoch": 1.5273556231003038, - "grad_norm": 1.4848440885543823, - "learning_rate": 2.6339770736846794e-06, - "loss": 0.3597261607646942, - "mean_token_accuracy": 0.8760983943939209, - "num_tokens": 18028959.0, - "step": 2010 - }, - { - "epoch": 1.528115501519757, - "grad_norm": 2.356933832168579, - "learning_rate": 2.6318856415015664e-06, - "loss": 0.2697138488292694, - "mean_token_accuracy": 0.9078473448753357, - "num_tokens": 18033946.0, - "step": 2011 - }, - { - "epoch": 1.5288753799392096, - "grad_norm": 1.964368224143982, - "learning_rate": 2.629794116756035e-06, - "loss": 0.41349685192108154, - "mean_token_accuracy": 0.8567900657653809, - "num_tokens": 18042724.0, - "step": 2012 - }, - { - "epoch": 1.5296352583586628, - "grad_norm": 1.5630402565002441, - "learning_rate": 2.627702500915995e-06, - "loss": 0.49310681223869324, - "mean_token_accuracy": 0.8229681253433228, - "num_tokens": 18054396.0, - "step": 2013 - }, - { - "epoch": 1.5303951367781155, - "grad_norm": 1.6657718420028687, - "learning_rate": 2.625610795449424e-06, - "loss": 0.4263935387134552, - "mean_token_accuracy": 0.8634918332099915, - "num_tokens": 18064347.0, - "step": 2014 - }, - { - "epoch": 1.5311550151975684, - "grad_norm": 1.3684180974960327, - "learning_rate": 2.6235190018243623e-06, - "loss": 0.2903984487056732, - "mean_token_accuracy": 0.8930408358573914, - "num_tokens": 18076826.0, - "step": 2015 - }, - { - "epoch": 1.5319148936170213, - "grad_norm": 1.635044813156128, - "learning_rate": 2.6214271215089106e-06, - "loss": 0.3066539168357849, - "mean_token_accuracy": 0.8912158012390137, - "num_tokens": 18085761.0, - "step": 2016 - }, - { - "epoch": 1.5326747720364742, - "grad_norm": 2.431518316268921, - "learning_rate": 2.6193351559712294e-06, - "loss": 0.31123271584510803, - "mean_token_accuracy": 0.8865828514099121, - "num_tokens": 18091715.0, - "step": 2017 - }, - { - "epoch": 1.533434650455927, - "grad_norm": 1.8317419290542603, - "learning_rate": 2.6172431066795428e-06, - "loss": 0.5042020082473755, - "mean_token_accuracy": 0.8245081901550293, - "num_tokens": 18102095.0, - "step": 2018 - }, - { - "epoch": 1.53419452887538, - "grad_norm": 3.4221980571746826, - "learning_rate": 2.6151509751021307e-06, - "loss": 0.2885819971561432, - "mean_token_accuracy": 0.8997149467468262, - "num_tokens": 18105456.0, - "step": 2019 - }, - { - "epoch": 1.534954407294833, - "grad_norm": 1.4435855150222778, - "learning_rate": 2.6130587627073315e-06, - "loss": 0.45573529601097107, - "mean_token_accuracy": 0.837191104888916, - "num_tokens": 18119039.0, - "step": 2020 - }, - { - "epoch": 1.5357142857142856, - "grad_norm": 1.5748237371444702, - "learning_rate": 2.6109664709635413e-06, - "loss": 0.4561889171600342, - "mean_token_accuracy": 0.8334558010101318, - "num_tokens": 18132150.0, - "step": 2021 - }, - { - "epoch": 1.5364741641337387, - "grad_norm": 2.8278751373291016, - "learning_rate": 2.60887410133921e-06, - "loss": 0.3495104908943176, - "mean_token_accuracy": 0.8926796913146973, - "num_tokens": 18136528.0, - "step": 2022 - }, - { - "epoch": 1.5372340425531914, - "grad_norm": 2.5045573711395264, - "learning_rate": 2.606781655302843e-06, - "loss": 0.45362481474876404, - "mean_token_accuracy": 0.8379551768302917, - "num_tokens": 18142581.0, - "step": 2023 - }, - { - "epoch": 1.5379939209726445, - "grad_norm": 2.5984106063842773, - "learning_rate": 2.604689134322999e-06, - "loss": 0.4210243821144104, - "mean_token_accuracy": 0.8571645021438599, - "num_tokens": 18148152.0, - "step": 2024 - }, - { - "epoch": 1.5387537993920972, - "grad_norm": 1.7180702686309814, - "learning_rate": 2.602596539868292e-06, - "loss": 0.2478562295436859, - "mean_token_accuracy": 0.9227135181427002, - "num_tokens": 18155435.0, - "step": 2025 - }, - { - "epoch": 1.5395136778115501, - "grad_norm": 2.3721933364868164, - "learning_rate": 2.6005038734073833e-06, - "loss": 0.3820664584636688, - "mean_token_accuracy": 0.8788443803787231, - "num_tokens": 18161403.0, - "step": 2026 - }, - { - "epoch": 1.540273556231003, - "grad_norm": 1.4967509508132935, - "learning_rate": 2.5984111364089875e-06, - "loss": 0.34247124195098877, - "mean_token_accuracy": 0.8809049129486084, - "num_tokens": 18173724.0, - "step": 2027 - }, - { - "epoch": 1.541033434650456, - "grad_norm": 2.5226845741271973, - "learning_rate": 2.5963183303418682e-06, - "loss": 0.2647642493247986, - "mean_token_accuracy": 0.8988642692565918, - "num_tokens": 18178927.0, - "step": 2028 - }, - { - "epoch": 1.5417933130699089, - "grad_norm": 2.217228412628174, - "learning_rate": 2.594225456674837e-06, - "loss": 0.37754058837890625, - "mean_token_accuracy": 0.8660204410552979, - "num_tokens": 18185268.0, - "step": 2029 - }, - { - "epoch": 1.5425531914893615, - "grad_norm": 2.336409091949463, - "learning_rate": 2.592132516876753e-06, - "loss": 0.45098528265953064, - "mean_token_accuracy": 0.842115044593811, - "num_tokens": 18192372.0, - "step": 2030 - }, - { - "epoch": 1.5433130699088147, - "grad_norm": 3.5437142848968506, - "learning_rate": 2.5900395124165216e-06, - "loss": 0.5326460003852844, - "mean_token_accuracy": 0.8125103712081909, - "num_tokens": 18199182.0, - "step": 2031 - }, - { - "epoch": 1.5440729483282674, - "grad_norm": 1.5785651206970215, - "learning_rate": 2.5879464447630947e-06, - "loss": 0.3714991509914398, - "mean_token_accuracy": 0.8711390495300293, - "num_tokens": 18209045.0, - "step": 2032 - }, - { - "epoch": 1.5448328267477205, - "grad_norm": 2.3616182804107666, - "learning_rate": 2.5858533153854676e-06, - "loss": 0.4548399746417999, - "mean_token_accuracy": 0.8411449193954468, - "num_tokens": 18215487.0, - "step": 2033 - }, - { - "epoch": 1.5455927051671732, - "grad_norm": 2.0750479698181152, - "learning_rate": 2.583760125752679e-06, - "loss": 0.3980535566806793, - "mean_token_accuracy": 0.8603327870368958, - "num_tokens": 18222606.0, - "step": 2034 - }, - { - "epoch": 1.5463525835866263, - "grad_norm": 2.609295129776001, - "learning_rate": 2.58166687733381e-06, - "loss": 0.40177756547927856, - "mean_token_accuracy": 0.8652099370956421, - "num_tokens": 18227341.0, - "step": 2035 - }, - { - "epoch": 1.547112462006079, - "grad_norm": 2.1621339321136475, - "learning_rate": 2.5795735715979826e-06, - "loss": 0.45104342699050903, - "mean_token_accuracy": 0.8481369018554688, - "num_tokens": 18235820.0, - "step": 2036 - }, - { - "epoch": 1.547872340425532, - "grad_norm": 1.0381370782852173, - "learning_rate": 2.577480210014359e-06, - "loss": 0.32621103525161743, - "mean_token_accuracy": 0.8867391347885132, - "num_tokens": 18258307.0, - "step": 2037 - }, - { - "epoch": 1.5486322188449848, - "grad_norm": 1.7634375095367432, - "learning_rate": 2.575386794052142e-06, - "loss": 0.5115169882774353, - "mean_token_accuracy": 0.818779468536377, - "num_tokens": 18272782.0, - "step": 2038 - }, - { - "epoch": 1.5493920972644377, - "grad_norm": 1.874875545501709, - "learning_rate": 2.5732933251805716e-06, - "loss": 0.4381459951400757, - "mean_token_accuracy": 0.8594684600830078, - "num_tokens": 18282618.0, - "step": 2039 - }, - { - "epoch": 1.5501519756838906, - "grad_norm": 2.1316351890563965, - "learning_rate": 2.571199804868923e-06, - "loss": 0.5410124063491821, - "mean_token_accuracy": 0.8247587084770203, - "num_tokens": 18289750.0, - "step": 2040 - }, - { - "epoch": 1.5509118541033433, - "grad_norm": 1.7574573755264282, - "learning_rate": 2.569106234586511e-06, - "loss": 0.29967373609542847, - "mean_token_accuracy": 0.8913218975067139, - "num_tokens": 18298110.0, - "step": 2041 - }, - { - "epoch": 1.5516717325227964, - "grad_norm": 1.929626703262329, - "learning_rate": 2.5670126158026843e-06, - "loss": 0.3287760019302368, - "mean_token_accuracy": 0.8870488405227661, - "num_tokens": 18305702.0, - "step": 2042 - }, - { - "epoch": 1.5524316109422491, - "grad_norm": 3.020153284072876, - "learning_rate": 2.5649189499868233e-06, - "loss": 0.38523542881011963, - "mean_token_accuracy": 0.854824960231781, - "num_tokens": 18309830.0, - "step": 2043 - }, - { - "epoch": 1.5531914893617023, - "grad_norm": 1.6378421783447266, - "learning_rate": 2.5628252386083443e-06, - "loss": 0.47371378540992737, - "mean_token_accuracy": 0.8627713918685913, - "num_tokens": 18322820.0, - "step": 2044 - }, - { - "epoch": 1.553951367781155, - "grad_norm": 1.3711130619049072, - "learning_rate": 2.560731483136694e-06, - "loss": 0.3319293260574341, - "mean_token_accuracy": 0.8704103231430054, - "num_tokens": 18335074.0, - "step": 2045 - }, - { - "epoch": 1.5547112462006079, - "grad_norm": 1.7589185237884521, - "learning_rate": 2.558637685041352e-06, - "loss": 0.4446021020412445, - "mean_token_accuracy": 0.8446722626686096, - "num_tokens": 18344115.0, - "step": 2046 - }, - { - "epoch": 1.5554711246200608, - "grad_norm": 2.5249195098876953, - "learning_rate": 2.5565438457918247e-06, - "loss": 0.4625541865825653, - "mean_token_accuracy": 0.8451195359230042, - "num_tokens": 18349235.0, - "step": 2047 - }, - { - "epoch": 1.5562310030395137, - "grad_norm": 1.0562543869018555, - "learning_rate": 2.5544499668576508e-06, - "loss": 0.33747735619544983, - "mean_token_accuracy": 0.8503615856170654, - "num_tokens": 18368253.0, - "step": 2048 - }, - { - "epoch": 1.5569908814589666, - "grad_norm": 2.9451215267181396, - "learning_rate": 2.5523560497083927e-06, - "loss": 0.3958815932273865, - "mean_token_accuracy": 0.8393744826316833, - "num_tokens": 18372887.0, - "step": 2049 - }, - { - "epoch": 1.5577507598784195, - "grad_norm": 1.3597660064697266, - "learning_rate": 2.5502620958136444e-06, - "loss": 0.46281275153160095, - "mean_token_accuracy": 0.8269470930099487, - "num_tokens": 18388074.0, - "step": 2050 - }, - { - "epoch": 1.5585106382978724, - "grad_norm": 3.269068717956543, - "learning_rate": 2.548168106643022e-06, - "loss": 0.2309008538722992, - "mean_token_accuracy": 0.9178205728530884, - "num_tokens": 18391406.0, - "step": 2051 - }, - { - "epoch": 1.559270516717325, - "grad_norm": 2.1459391117095947, - "learning_rate": 2.546074083666169e-06, - "loss": 0.4006733298301697, - "mean_token_accuracy": 0.8631902933120728, - "num_tokens": 18397497.0, - "step": 2052 - }, - { - "epoch": 1.5600303951367782, - "grad_norm": 1.4614566564559937, - "learning_rate": 2.5439800283527495e-06, - "loss": 0.40810418128967285, - "mean_token_accuracy": 0.8473483920097351, - "num_tokens": 18409474.0, - "step": 2053 - }, - { - "epoch": 1.560790273556231, - "grad_norm": 2.084808826446533, - "learning_rate": 2.541885942172454e-06, - "loss": 0.34967708587646484, - "mean_token_accuracy": 0.8707003593444824, - "num_tokens": 18416400.0, - "step": 2054 - }, - { - "epoch": 1.561550151975684, - "grad_norm": 1.90664541721344, - "learning_rate": 2.539791826594991e-06, - "loss": 0.37694251537323, - "mean_token_accuracy": 0.8704941272735596, - "num_tokens": 18424206.0, - "step": 2055 - }, - { - "epoch": 1.5623100303951367, - "grad_norm": 1.880176305770874, - "learning_rate": 2.537697683090093e-06, - "loss": 0.32510411739349365, - "mean_token_accuracy": 0.8848961591720581, - "num_tokens": 18431676.0, - "step": 2056 - }, - { - "epoch": 1.5630699088145896, - "grad_norm": 2.133375406265259, - "learning_rate": 2.5356035131275096e-06, - "loss": 0.30538493394851685, - "mean_token_accuracy": 0.8890067338943481, - "num_tokens": 18438014.0, - "step": 2057 - }, - { - "epoch": 1.5638297872340425, - "grad_norm": 2.3495655059814453, - "learning_rate": 2.5335093181770105e-06, - "loss": 0.3126775324344635, - "mean_token_accuracy": 0.8865689039230347, - "num_tokens": 18443604.0, - "step": 2058 - }, - { - "epoch": 1.5645896656534954, - "grad_norm": 2.37949538230896, - "learning_rate": 2.531415099708382e-06, - "loss": 0.3257793188095093, - "mean_token_accuracy": 0.8809669017791748, - "num_tokens": 18448654.0, - "step": 2059 - }, - { - "epoch": 1.5653495440729484, - "grad_norm": 1.8285472393035889, - "learning_rate": 2.5293208591914265e-06, - "loss": 0.32376936078071594, - "mean_token_accuracy": 0.8816431760787964, - "num_tokens": 18456619.0, - "step": 2060 - }, - { - "epoch": 1.5661094224924013, - "grad_norm": 2.3238534927368164, - "learning_rate": 2.5272265980959644e-06, - "loss": 0.40366506576538086, - "mean_token_accuracy": 0.8496750593185425, - "num_tokens": 18462788.0, - "step": 2061 - }, - { - "epoch": 1.5668693009118542, - "grad_norm": 1.8954942226409912, - "learning_rate": 2.525132317891827e-06, - "loss": 0.3405473828315735, - "mean_token_accuracy": 0.8849360942840576, - "num_tokens": 18470719.0, - "step": 2062 - }, - { - "epoch": 1.5676291793313069, - "grad_norm": 1.6268190145492554, - "learning_rate": 2.523038020048861e-06, - "loss": 0.3662685751914978, - "mean_token_accuracy": 0.8865662813186646, - "num_tokens": 18482095.0, - "step": 2063 - }, - { - "epoch": 1.56838905775076, - "grad_norm": 2.5198733806610107, - "learning_rate": 2.5209437060369266e-06, - "loss": 0.3968311548233032, - "mean_token_accuracy": 0.8643308281898499, - "num_tokens": 18488069.0, - "step": 2064 - }, - { - "epoch": 1.5691489361702127, - "grad_norm": 2.9197335243225098, - "learning_rate": 2.518849377325893e-06, - "loss": 0.24738386273384094, - "mean_token_accuracy": 0.91959547996521, - "num_tokens": 18491762.0, - "step": 2065 - }, - { - "epoch": 1.5699088145896658, - "grad_norm": 1.5914254188537598, - "learning_rate": 2.51675503538564e-06, - "loss": 0.33473581075668335, - "mean_token_accuracy": 0.8794662952423096, - "num_tokens": 18501316.0, - "step": 2066 - }, - { - "epoch": 1.5706686930091185, - "grad_norm": 2.5130460262298584, - "learning_rate": 2.5146606816860597e-06, - "loss": 0.4067240357398987, - "mean_token_accuracy": 0.8564209342002869, - "num_tokens": 18507169.0, - "step": 2067 - }, - { - "epoch": 1.5714285714285714, - "grad_norm": 2.093353509902954, - "learning_rate": 2.5125663176970475e-06, - "loss": 0.4312136769294739, - "mean_token_accuracy": 0.8540225028991699, - "num_tokens": 18514536.0, - "step": 2068 - }, - { - "epoch": 1.5721884498480243, - "grad_norm": 1.284495234489441, - "learning_rate": 2.5104719448885103e-06, - "loss": 0.3813856542110443, - "mean_token_accuracy": 0.8435653448104858, - "num_tokens": 18529947.0, - "step": 2069 - }, - { - "epoch": 1.5729483282674772, - "grad_norm": 2.0383973121643066, - "learning_rate": 2.5083775647303583e-06, - "loss": 0.4428079426288605, - "mean_token_accuracy": 0.8841741681098938, - "num_tokens": 18537109.0, - "step": 2070 - }, - { - "epoch": 1.5737082066869301, - "grad_norm": 1.7991697788238525, - "learning_rate": 2.5062831786925102e-06, - "loss": 0.460052490234375, - "mean_token_accuracy": 0.8459943532943726, - "num_tokens": 18547108.0, - "step": 2071 - }, - { - "epoch": 1.574468085106383, - "grad_norm": 2.2168822288513184, - "learning_rate": 2.5041887882448845e-06, - "loss": 0.2863885462284088, - "mean_token_accuracy": 0.906816840171814, - "num_tokens": 18552357.0, - "step": 2072 - }, - { - "epoch": 1.575227963525836, - "grad_norm": 3.918499708175659, - "learning_rate": 2.5020943948574056e-06, - "loss": 0.3439999222755432, - "mean_token_accuracy": 0.8742123246192932, - "num_tokens": 18555272.0, - "step": 2073 - }, - { - "epoch": 1.5759878419452886, - "grad_norm": 1.773869514465332, - "learning_rate": 2.5e-06, - "loss": 0.2815646827220917, - "mean_token_accuracy": 0.8939872980117798, - "num_tokens": 18562989.0, - "step": 2074 - }, - { - "epoch": 1.5767477203647418, - "grad_norm": 1.8675572872161865, - "learning_rate": 2.497905605142595e-06, - "loss": 0.5005829930305481, - "mean_token_accuracy": 0.8242729902267456, - "num_tokens": 18575587.0, - "step": 2075 - }, - { - "epoch": 1.5775075987841944, - "grad_norm": 2.3143508434295654, - "learning_rate": 2.4958112117551163e-06, - "loss": 0.42472895979881287, - "mean_token_accuracy": 0.8540043830871582, - "num_tokens": 18581666.0, - "step": 2076 - }, - { - "epoch": 1.5782674772036476, - "grad_norm": 2.529740333557129, - "learning_rate": 2.4937168213074906e-06, - "loss": 0.24539905786514282, - "mean_token_accuracy": 0.9041235446929932, - "num_tokens": 18585773.0, - "step": 2077 - }, - { - "epoch": 1.5790273556231003, - "grad_norm": 2.5188395977020264, - "learning_rate": 2.491622435269642e-06, - "loss": 0.23059265315532684, - "mean_token_accuracy": 0.9204603433609009, - "num_tokens": 18589915.0, - "step": 2078 - }, - { - "epoch": 1.5797872340425532, - "grad_norm": 2.7752444744110107, - "learning_rate": 2.489528055111491e-06, - "loss": 0.452225923538208, - "mean_token_accuracy": 0.8444918990135193, - "num_tokens": 18595488.0, - "step": 2079 - }, - { - "epoch": 1.580547112462006, - "grad_norm": 1.174774408340454, - "learning_rate": 2.487433682302953e-06, - "loss": 0.3399246633052826, - "mean_token_accuracy": 0.8608446717262268, - "num_tokens": 18613756.0, - "step": 2080 - }, - { - "epoch": 1.581306990881459, - "grad_norm": 1.515575647354126, - "learning_rate": 2.485339318313941e-06, - "loss": 0.45886170864105225, - "mean_token_accuracy": 0.8479131460189819, - "num_tokens": 18629610.0, - "step": 2081 - }, - { - "epoch": 1.582066869300912, - "grad_norm": 1.7039403915405273, - "learning_rate": 2.4832449646143605e-06, - "loss": 0.349803626537323, - "mean_token_accuracy": 0.8721815347671509, - "num_tokens": 18637523.0, - "step": 2082 - }, - { - "epoch": 1.5828267477203646, - "grad_norm": 3.2289421558380127, - "learning_rate": 2.4811506226741077e-06, - "loss": 0.4967171549797058, - "mean_token_accuracy": 0.8303675651550293, - "num_tokens": 18641826.0, - "step": 2083 - }, - { - "epoch": 1.5835866261398177, - "grad_norm": 1.71235990524292, - "learning_rate": 2.4790562939630738e-06, - "loss": 0.4202485680580139, - "mean_token_accuracy": 0.8581224679946899, - "num_tokens": 18653146.0, - "step": 2084 - }, - { - "epoch": 1.5843465045592704, - "grad_norm": 1.710036277770996, - "learning_rate": 2.4769619799511392e-06, - "loss": 0.3942421078681946, - "mean_token_accuracy": 0.8553562164306641, - "num_tokens": 18663826.0, - "step": 2085 - }, - { - "epoch": 1.5851063829787235, - "grad_norm": 1.464859127998352, - "learning_rate": 2.474867682108174e-06, - "loss": 0.4093329906463623, - "mean_token_accuracy": 0.8598780632019043, - "num_tokens": 18675325.0, - "step": 2086 - }, - { - "epoch": 1.5858662613981762, - "grad_norm": 2.083707809448242, - "learning_rate": 2.472773401904037e-06, - "loss": 0.4252093434333801, - "mean_token_accuracy": 0.8433356881141663, - "num_tokens": 18682416.0, - "step": 2087 - }, - { - "epoch": 1.5866261398176293, - "grad_norm": 1.5577973127365112, - "learning_rate": 2.470679140808574e-06, - "loss": 0.3680085241794586, - "mean_token_accuracy": 0.8609116077423096, - "num_tokens": 18694445.0, - "step": 2088 - }, - { - "epoch": 1.587386018237082, - "grad_norm": 2.1617276668548584, - "learning_rate": 2.4685849002916184e-06, - "loss": 0.40488749742507935, - "mean_token_accuracy": 0.8429721593856812, - "num_tokens": 18701204.0, - "step": 2089 - }, - { - "epoch": 1.588145896656535, - "grad_norm": 2.046678304672241, - "learning_rate": 2.4664906818229903e-06, - "loss": 0.329141229391098, - "mean_token_accuracy": 0.8830771446228027, - "num_tokens": 18708354.0, - "step": 2090 - }, - { - "epoch": 1.5889057750759878, - "grad_norm": 2.7741200923919678, - "learning_rate": 2.4643964868724916e-06, - "loss": 0.42294493317604065, - "mean_token_accuracy": 0.8612706065177917, - "num_tokens": 18713017.0, - "step": 2091 - }, - { - "epoch": 1.5896656534954408, - "grad_norm": 2.085151433944702, - "learning_rate": 2.4623023169099074e-06, - "loss": 0.39038220047950745, - "mean_token_accuracy": 0.861169695854187, - "num_tokens": 18721423.0, - "step": 2092 - }, - { - "epoch": 1.5904255319148937, - "grad_norm": 2.8721165657043457, - "learning_rate": 2.4602081734050093e-06, - "loss": 0.27753859758377075, - "mean_token_accuracy": 0.8959167003631592, - "num_tokens": 18725044.0, - "step": 2093 - }, - { - "epoch": 1.5911854103343464, - "grad_norm": 1.7388207912445068, - "learning_rate": 2.4581140578275473e-06, - "loss": 0.3570033311843872, - "mean_token_accuracy": 0.8715590238571167, - "num_tokens": 18733891.0, - "step": 2094 - }, - { - "epoch": 1.5919452887537995, - "grad_norm": 2.3645241260528564, - "learning_rate": 2.456019971647251e-06, - "loss": 0.38982006907463074, - "mean_token_accuracy": 0.8734139800071716, - "num_tokens": 18740464.0, - "step": 2095 - }, - { - "epoch": 1.5927051671732522, - "grad_norm": 3.674072027206421, - "learning_rate": 2.4539259163338317e-06, - "loss": 0.4068281650543213, - "mean_token_accuracy": 0.8397839069366455, - "num_tokens": 18744857.0, - "step": 2096 - }, - { - "epoch": 1.5934650455927053, - "grad_norm": 1.8209186792373657, - "learning_rate": 2.4518318933569786e-06, - "loss": 0.3471015691757202, - "mean_token_accuracy": 0.8709044456481934, - "num_tokens": 18752414.0, - "step": 2097 - }, - { - "epoch": 1.594224924012158, - "grad_norm": 1.8138704299926758, - "learning_rate": 2.449737904186357e-06, - "loss": 0.3438487648963928, - "mean_token_accuracy": 0.8766711950302124, - "num_tokens": 18760587.0, - "step": 2098 - }, - { - "epoch": 1.594984802431611, - "grad_norm": 1.7893842458724976, - "learning_rate": 2.447643950291608e-06, - "loss": 0.43519508838653564, - "mean_token_accuracy": 0.8682907819747925, - "num_tokens": 18770293.0, - "step": 2099 - }, - { - "epoch": 1.5957446808510638, - "grad_norm": 1.4305094480514526, - "learning_rate": 2.4455500331423505e-06, - "loss": 0.37106508016586304, - "mean_token_accuracy": 0.8611354827880859, - "num_tokens": 18782456.0, - "step": 2100 - }, - { - "epoch": 1.5965045592705167, - "grad_norm": 2.0797057151794434, - "learning_rate": 2.4434561542081765e-06, - "loss": 0.43942689895629883, - "mean_token_accuracy": 0.8477288484573364, - "num_tokens": 18789547.0, - "step": 2101 - }, - { - "epoch": 1.5972644376899696, - "grad_norm": 1.2983288764953613, - "learning_rate": 2.441362314958649e-06, - "loss": 0.46385765075683594, - "mean_token_accuracy": 0.8340978622436523, - "num_tokens": 18809456.0, - "step": 2102 - }, - { - "epoch": 1.5980243161094225, - "grad_norm": 2.60866641998291, - "learning_rate": 2.439268516863306e-06, - "loss": 0.3106239140033722, - "mean_token_accuracy": 0.8859497308731079, - "num_tokens": 18813781.0, - "step": 2103 - }, - { - "epoch": 1.5987841945288754, - "grad_norm": 3.389376163482666, - "learning_rate": 2.4371747613916566e-06, - "loss": 0.44926169514656067, - "mean_token_accuracy": 0.8664819002151489, - "num_tokens": 18817666.0, - "step": 2104 - }, - { - "epoch": 1.5995440729483281, - "grad_norm": 3.3417351245880127, - "learning_rate": 2.4350810500131776e-06, - "loss": 0.4786076545715332, - "mean_token_accuracy": 0.8357523679733276, - "num_tokens": 18823717.0, - "step": 2105 - }, - { - "epoch": 1.6003039513677813, - "grad_norm": 1.5215197801589966, - "learning_rate": 2.4329873841973174e-06, - "loss": 0.4123923182487488, - "mean_token_accuracy": 0.853337287902832, - "num_tokens": 18835163.0, - "step": 2106 - }, - { - "epoch": 1.601063829787234, - "grad_norm": 1.8798415660858154, - "learning_rate": 2.4308937654134893e-06, - "loss": 0.45594000816345215, - "mean_token_accuracy": 0.8553717732429504, - "num_tokens": 18843923.0, - "step": 2107 - }, - { - "epoch": 1.601823708206687, - "grad_norm": 2.1012487411499023, - "learning_rate": 2.428800195131078e-06, - "loss": 0.4340161085128784, - "mean_token_accuracy": 0.8448120355606079, - "num_tokens": 18851852.0, - "step": 2108 - }, - { - "epoch": 1.6025835866261398, - "grad_norm": 2.827080726623535, - "learning_rate": 2.4267066748194297e-06, - "loss": 0.25922513008117676, - "mean_token_accuracy": 0.9024698734283447, - "num_tokens": 18856113.0, - "step": 2109 - }, - { - "epoch": 1.6033434650455927, - "grad_norm": 1.641032338142395, - "learning_rate": 2.4246132059478582e-06, - "loss": 0.591558575630188, - "mean_token_accuracy": 0.7960667610168457, - "num_tokens": 18870618.0, - "step": 2110 - }, - { - "epoch": 1.6041033434650456, - "grad_norm": 2.600771188735962, - "learning_rate": 2.4225197899856416e-06, - "loss": 0.382815957069397, - "mean_token_accuracy": 0.8654585480690002, - "num_tokens": 18875456.0, - "step": 2111 - }, - { - "epoch": 1.6048632218844985, - "grad_norm": 1.5125449895858765, - "learning_rate": 2.4204264284020182e-06, - "loss": 0.4643454849720001, - "mean_token_accuracy": 0.837038516998291, - "num_tokens": 18887979.0, - "step": 2112 - }, - { - "epoch": 1.6056231003039514, - "grad_norm": 1.7571941614151, - "learning_rate": 2.4183331226661913e-06, - "loss": 0.30713701248168945, - "mean_token_accuracy": 0.8856921195983887, - "num_tokens": 18896143.0, - "step": 2113 - }, - { - "epoch": 1.6063829787234043, - "grad_norm": 2.124593496322632, - "learning_rate": 2.4162398742473216e-06, - "loss": 0.2873607575893402, - "mean_token_accuracy": 0.8986717462539673, - "num_tokens": 18902364.0, - "step": 2114 - }, - { - "epoch": 1.6071428571428572, - "grad_norm": 2.3496272563934326, - "learning_rate": 2.4141466846145332e-06, - "loss": 0.33715200424194336, - "mean_token_accuracy": 0.8816461563110352, - "num_tokens": 18908038.0, - "step": 2115 - }, - { - "epoch": 1.60790273556231, - "grad_norm": 1.2783573865890503, - "learning_rate": 2.4120535552369057e-06, - "loss": 0.45153388381004333, - "mean_token_accuracy": 0.8345640897750854, - "num_tokens": 18926687.0, - "step": 2116 - }, - { - "epoch": 1.608662613981763, - "grad_norm": 2.1481080055236816, - "learning_rate": 2.4099604875834796e-06, - "loss": 0.43976694345474243, - "mean_token_accuracy": 0.847899317741394, - "num_tokens": 18932974.0, - "step": 2117 - }, - { - "epoch": 1.6094224924012157, - "grad_norm": 1.8669065237045288, - "learning_rate": 2.407867483123248e-06, - "loss": 0.4649358093738556, - "mean_token_accuracy": 0.8310785293579102, - "num_tokens": 18942551.0, - "step": 2118 - }, - { - "epoch": 1.6101823708206688, - "grad_norm": 2.7667746543884277, - "learning_rate": 2.4057745433251637e-06, - "loss": 0.4542210102081299, - "mean_token_accuracy": 0.8450086116790771, - "num_tokens": 18947525.0, - "step": 2119 - }, - { - "epoch": 1.6109422492401215, - "grad_norm": 2.2865076065063477, - "learning_rate": 2.4036816696581326e-06, - "loss": 0.34291431307792664, - "mean_token_accuracy": 0.8741394281387329, - "num_tokens": 18952967.0, - "step": 2120 - }, - { - "epoch": 1.6117021276595744, - "grad_norm": 3.055197238922119, - "learning_rate": 2.401588863591013e-06, - "loss": 0.4686807692050934, - "mean_token_accuracy": 0.8440030217170715, - "num_tokens": 18958257.0, - "step": 2121 - }, - { - "epoch": 1.6124620060790273, - "grad_norm": 2.268456220626831, - "learning_rate": 2.3994961265926166e-06, - "loss": 0.440069317817688, - "mean_token_accuracy": 0.8534891605377197, - "num_tokens": 18964745.0, - "step": 2122 - }, - { - "epoch": 1.6132218844984803, - "grad_norm": 2.061185359954834, - "learning_rate": 2.3974034601317085e-06, - "loss": 0.4383159279823303, - "mean_token_accuracy": 0.8484808802604675, - "num_tokens": 18972136.0, - "step": 2123 - }, - { - "epoch": 1.6139817629179332, - "grad_norm": 1.5121275186538696, - "learning_rate": 2.3953108656770018e-06, - "loss": 0.42403632402420044, - "mean_token_accuracy": 0.8467602133750916, - "num_tokens": 18985353.0, - "step": 2124 - }, - { - "epoch": 1.614741641337386, - "grad_norm": 1.9965397119522095, - "learning_rate": 2.3932183446971584e-06, - "loss": 0.3915751576423645, - "mean_token_accuracy": 0.8622956275939941, - "num_tokens": 18992017.0, - "step": 2125 - }, - { - "epoch": 1.615501519756839, - "grad_norm": 1.6688618659973145, - "learning_rate": 2.3911258986607907e-06, - "loss": 0.468288391828537, - "mean_token_accuracy": 0.8372251987457275, - "num_tokens": 19001930.0, - "step": 2126 - }, - { - "epoch": 1.6162613981762917, - "grad_norm": 1.8984699249267578, - "learning_rate": 2.3890335290364596e-06, - "loss": 0.3082895278930664, - "mean_token_accuracy": 0.8815990686416626, - "num_tokens": 19009712.0, - "step": 2127 - }, - { - "epoch": 1.6170212765957448, - "grad_norm": 2.6934773921966553, - "learning_rate": 2.386941237292669e-06, - "loss": 0.48406022787094116, - "mean_token_accuracy": 0.8300775289535522, - "num_tokens": 19015212.0, - "step": 2128 - }, - { - "epoch": 1.6177811550151975, - "grad_norm": 1.6615487337112427, - "learning_rate": 2.3848490248978693e-06, - "loss": 0.45227736234664917, - "mean_token_accuracy": 0.8421006798744202, - "num_tokens": 19027115.0, - "step": 2129 - }, - { - "epoch": 1.6185410334346506, - "grad_norm": 1.4625248908996582, - "learning_rate": 2.3827568933204576e-06, - "loss": 0.4141014814376831, - "mean_token_accuracy": 0.8479453325271606, - "num_tokens": 19041103.0, - "step": 2130 - }, - { - "epoch": 1.6193009118541033, - "grad_norm": 1.856701135635376, - "learning_rate": 2.3806648440287715e-06, - "loss": 0.3440483808517456, - "mean_token_accuracy": 0.8978210687637329, - "num_tokens": 19048124.0, - "step": 2131 - }, - { - "epoch": 1.6200607902735562, - "grad_norm": 1.7056550979614258, - "learning_rate": 2.378572878491091e-06, - "loss": 0.4136195182800293, - "mean_token_accuracy": 0.8579289317131042, - "num_tokens": 19057113.0, - "step": 2132 - }, - { - "epoch": 1.6208206686930091, - "grad_norm": 1.4673033952713013, - "learning_rate": 2.376480998175638e-06, - "loss": 0.40176504850387573, - "mean_token_accuracy": 0.8677150011062622, - "num_tokens": 19068258.0, - "step": 2133 - }, - { - "epoch": 1.621580547112462, - "grad_norm": 2.12859845161438, - "learning_rate": 2.3743892045505764e-06, - "loss": 0.39754825830459595, - "mean_token_accuracy": 0.8486959934234619, - "num_tokens": 19075469.0, - "step": 2134 - }, - { - "epoch": 1.622340425531915, - "grad_norm": 1.474247694015503, - "learning_rate": 2.372297499084006e-06, - "loss": 0.3546760678291321, - "mean_token_accuracy": 0.8767229318618774, - "num_tokens": 19086744.0, - "step": 2135 - }, - { - "epoch": 1.6231003039513676, - "grad_norm": 1.9945709705352783, - "learning_rate": 2.3702058832439667e-06, - "loss": 0.4200798273086548, - "mean_token_accuracy": 0.8435655832290649, - "num_tokens": 19095903.0, - "step": 2136 - }, - { - "epoch": 1.6238601823708207, - "grad_norm": 2.71991229057312, - "learning_rate": 2.368114358498434e-06, - "loss": 0.44925457239151, - "mean_token_accuracy": 0.8348450660705566, - "num_tokens": 19100864.0, - "step": 2137 - }, - { - "epoch": 1.6246200607902734, - "grad_norm": 2.817664623260498, - "learning_rate": 2.366022926315322e-06, - "loss": 0.44386279582977295, - "mean_token_accuracy": 0.8739628791809082, - "num_tokens": 19105355.0, - "step": 2138 - }, - { - "epoch": 1.6253799392097266, - "grad_norm": 1.3673229217529297, - "learning_rate": 2.3639315881624776e-06, - "loss": 0.3693230152130127, - "mean_token_accuracy": 0.8698620796203613, - "num_tokens": 19116748.0, - "step": 2139 - }, - { - "epoch": 1.6261398176291793, - "grad_norm": 2.712531805038452, - "learning_rate": 2.361840345507683e-06, - "loss": 0.4442938268184662, - "mean_token_accuracy": 0.8433241844177246, - "num_tokens": 19121437.0, - "step": 2140 - }, - { - "epoch": 1.6268996960486324, - "grad_norm": 2.2885231971740723, - "learning_rate": 2.359749199818651e-06, - "loss": 0.4021872878074646, - "mean_token_accuracy": 0.8605252504348755, - "num_tokens": 19127633.0, - "step": 2141 - }, - { - "epoch": 1.627659574468085, - "grad_norm": 1.9257299900054932, - "learning_rate": 2.3576581525630297e-06, - "loss": 0.3577788472175598, - "mean_token_accuracy": 0.8691596388816833, - "num_tokens": 19134450.0, - "step": 2142 - }, - { - "epoch": 1.628419452887538, - "grad_norm": 1.5035467147827148, - "learning_rate": 2.355567205208397e-06, - "loss": 0.3800235986709595, - "mean_token_accuracy": 0.867794394493103, - "num_tokens": 19146149.0, - "step": 2143 - }, - { - "epoch": 1.6291793313069909, - "grad_norm": 2.110445737838745, - "learning_rate": 2.353476359222259e-06, - "loss": 0.34394145011901855, - "mean_token_accuracy": 0.8777303695678711, - "num_tokens": 19152017.0, - "step": 2144 - }, - { - "epoch": 1.6299392097264438, - "grad_norm": 1.1713787317276, - "learning_rate": 2.351385616072052e-06, - "loss": 0.4060516357421875, - "mean_token_accuracy": 0.8411345481872559, - "num_tokens": 19172089.0, - "step": 2145 - }, - { - "epoch": 1.6306990881458967, - "grad_norm": 1.7600529193878174, - "learning_rate": 2.3492949772251418e-06, - "loss": 0.5299694538116455, - "mean_token_accuracy": 0.8218191862106323, - "num_tokens": 19184041.0, - "step": 2146 - }, - { - "epoch": 1.6314589665653494, - "grad_norm": 1.7126617431640625, - "learning_rate": 2.3472044441488175e-06, - "loss": 0.38628721237182617, - "mean_token_accuracy": 0.8526935577392578, - "num_tokens": 19193101.0, - "step": 2147 - }, - { - "epoch": 1.6322188449848025, - "grad_norm": 1.210344672203064, - "learning_rate": 2.345114018310295e-06, - "loss": 0.2732373774051666, - "mean_token_accuracy": 0.8903822898864746, - "num_tokens": 19206697.0, - "step": 2148 - }, - { - "epoch": 1.6329787234042552, - "grad_norm": 1.6693075895309448, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.3472709655761719, - "mean_token_accuracy": 0.8767187595367432, - "num_tokens": 19217008.0, - "step": 2149 - }, - { - "epoch": 1.6337386018237083, - "grad_norm": 1.5242515802383423, - "learning_rate": 2.3409334942151485e-06, - "loss": 0.4345507025718689, - "mean_token_accuracy": 0.8481311202049255, - "num_tokens": 19231573.0, - "step": 2150 - }, - { - "epoch": 1.634498480243161, - "grad_norm": 2.470122814178467, - "learning_rate": 2.3388433988925767e-06, - "loss": 0.4453052878379822, - "mean_token_accuracy": 0.8411355018615723, - "num_tokens": 19237076.0, - "step": 2151 - }, - { - "epoch": 1.635258358662614, - "grad_norm": 2.4177467823028564, - "learning_rate": 2.3367534166759105e-06, - "loss": 0.454534113407135, - "mean_token_accuracy": 0.8635509014129639, - "num_tokens": 19242890.0, - "step": 2152 - }, - { - "epoch": 1.6360182370820668, - "grad_norm": 2.8036744594573975, - "learning_rate": 2.3346635490319815e-06, - "loss": 0.4396413564682007, - "mean_token_accuracy": 0.8491836786270142, - "num_tokens": 19247492.0, - "step": 2153 - }, - { - "epoch": 1.6367781155015197, - "grad_norm": 1.9286335706710815, - "learning_rate": 2.3325737974275382e-06, - "loss": 0.34988659620285034, - "mean_token_accuracy": 0.8704243898391724, - "num_tokens": 19254966.0, - "step": 2154 - }, - { - "epoch": 1.6375379939209727, - "grad_norm": 1.8929904699325562, - "learning_rate": 2.3304841633292487e-06, - "loss": 0.4195491671562195, - "mean_token_accuracy": 0.857181966304779, - "num_tokens": 19263324.0, - "step": 2155 - }, - { - "epoch": 1.6382978723404256, - "grad_norm": 2.2598466873168945, - "learning_rate": 2.328394648203698e-06, - "loss": 0.37977826595306396, - "mean_token_accuracy": 0.8626722097396851, - "num_tokens": 19269363.0, - "step": 2156 - }, - { - "epoch": 1.6390577507598785, - "grad_norm": 1.8118126392364502, - "learning_rate": 2.32630525351739e-06, - "loss": 0.3532063364982605, - "mean_token_accuracy": 0.8677854537963867, - "num_tokens": 19277360.0, - "step": 2157 - }, - { - "epoch": 1.6398176291793312, - "grad_norm": 1.5216798782348633, - "learning_rate": 2.324215980736741e-06, - "loss": 0.38609349727630615, - "mean_token_accuracy": 0.8685325980186462, - "num_tokens": 19292159.0, - "step": 2158 - }, - { - "epoch": 1.6405775075987843, - "grad_norm": 3.0511462688446045, - "learning_rate": 2.3221268313280836e-06, - "loss": 0.21988365054130554, - "mean_token_accuracy": 0.9172534942626953, - "num_tokens": 19295735.0, - "step": 2159 - }, - { - "epoch": 1.641337386018237, - "grad_norm": 1.957828164100647, - "learning_rate": 2.320037806757662e-06, - "loss": 0.3868909478187561, - "mean_token_accuracy": 0.8605331182479858, - "num_tokens": 19303287.0, - "step": 2160 - }, - { - "epoch": 1.64209726443769, - "grad_norm": 2.590040922164917, - "learning_rate": 2.317948908491636e-06, - "loss": 0.3940129578113556, - "mean_token_accuracy": 0.8814224004745483, - "num_tokens": 19308101.0, - "step": 2161 - }, - { - "epoch": 1.6428571428571428, - "grad_norm": 2.859248161315918, - "learning_rate": 2.315860137996074e-06, - "loss": 0.3437344431877136, - "mean_token_accuracy": 0.8789017200469971, - "num_tokens": 19313026.0, - "step": 2162 - }, - { - "epoch": 1.6436170212765957, - "grad_norm": 1.1788666248321533, - "learning_rate": 2.3137714967369544e-06, - "loss": 0.3976179361343384, - "mean_token_accuracy": 0.8383771181106567, - "num_tokens": 19331103.0, - "step": 2163 - }, - { - "epoch": 1.6443768996960486, - "grad_norm": 1.8409802913665771, - "learning_rate": 2.3116829861801687e-06, - "loss": 0.41898879408836365, - "mean_token_accuracy": 0.8575010299682617, - "num_tokens": 19340866.0, - "step": 2164 - }, - { - "epoch": 1.6451367781155015, - "grad_norm": 1.4124691486358643, - "learning_rate": 2.3095946077915115e-06, - "loss": 0.333813339471817, - "mean_token_accuracy": 0.8766071796417236, - "num_tokens": 19353673.0, - "step": 2165 - }, - { - "epoch": 1.6458966565349544, - "grad_norm": 1.76325261592865, - "learning_rate": 2.307506363036688e-06, - "loss": 0.4158991575241089, - "mean_token_accuracy": 0.8522704839706421, - "num_tokens": 19363635.0, - "step": 2166 - }, - { - "epoch": 1.6466565349544073, - "grad_norm": 1.758833885192871, - "learning_rate": 2.305418253381309e-06, - "loss": 0.298480749130249, - "mean_token_accuracy": 0.888424277305603, - "num_tokens": 19372291.0, - "step": 2167 - }, - { - "epoch": 1.6474164133738602, - "grad_norm": 1.6387488842010498, - "learning_rate": 2.3033302802908895e-06, - "loss": 0.4309447109699249, - "mean_token_accuracy": 0.8672212362289429, - "num_tokens": 19383480.0, - "step": 2168 - }, - { - "epoch": 1.648176291793313, - "grad_norm": 1.5251084566116333, - "learning_rate": 2.301242445230851e-06, - "loss": 0.44890880584716797, - "mean_token_accuracy": 0.847392737865448, - "num_tokens": 19394810.0, - "step": 2169 - }, - { - "epoch": 1.648936170212766, - "grad_norm": 1.6106950044631958, - "learning_rate": 2.299154749666515e-06, - "loss": 0.4403916597366333, - "mean_token_accuracy": 0.8379756212234497, - "num_tokens": 19405551.0, - "step": 2170 - }, - { - "epoch": 1.6496960486322187, - "grad_norm": 1.4238437414169312, - "learning_rate": 2.2970671950631066e-06, - "loss": 0.4015567898750305, - "mean_token_accuracy": 0.851482629776001, - "num_tokens": 19418621.0, - "step": 2171 - }, - { - "epoch": 1.6504559270516719, - "grad_norm": 1.3026156425476074, - "learning_rate": 2.2949797828857527e-06, - "loss": 0.3680947422981262, - "mean_token_accuracy": 0.8641397953033447, - "num_tokens": 19432118.0, - "step": 2172 - }, - { - "epoch": 1.6512158054711246, - "grad_norm": 2.1265358924865723, - "learning_rate": 2.2928925145994798e-06, - "loss": 0.43980664014816284, - "mean_token_accuracy": 0.8358430862426758, - "num_tokens": 19439069.0, - "step": 2173 - }, - { - "epoch": 1.6519756838905775, - "grad_norm": 1.8399443626403809, - "learning_rate": 2.290805391669212e-06, - "loss": 0.29801061749458313, - "mean_token_accuracy": 0.8773187398910522, - "num_tokens": 19446745.0, - "step": 2174 - }, - { - "epoch": 1.6527355623100304, - "grad_norm": 1.8680047988891602, - "learning_rate": 2.2887184155597725e-06, - "loss": 0.3235543966293335, - "mean_token_accuracy": 0.8754611015319824, - "num_tokens": 19455266.0, - "step": 2175 - }, - { - "epoch": 1.6534954407294833, - "grad_norm": 2.3048481941223145, - "learning_rate": 2.286631587735883e-06, - "loss": 0.4011988043785095, - "mean_token_accuracy": 0.8531811237335205, - "num_tokens": 19461049.0, - "step": 2176 - }, - { - "epoch": 1.6542553191489362, - "grad_norm": 2.6067066192626953, - "learning_rate": 2.2845449096621583e-06, - "loss": 0.4957500696182251, - "mean_token_accuracy": 0.8255549073219299, - "num_tokens": 19466884.0, - "step": 2177 - }, - { - "epoch": 1.655015197568389, - "grad_norm": 1.5211488008499146, - "learning_rate": 2.282458382803109e-06, - "loss": 0.32245099544525146, - "mean_token_accuracy": 0.8865629434585571, - "num_tokens": 19477294.0, - "step": 2178 - }, - { - "epoch": 1.655775075987842, - "grad_norm": 2.245542526245117, - "learning_rate": 2.280372008623142e-06, - "loss": 0.3790864944458008, - "mean_token_accuracy": 0.8766552209854126, - "num_tokens": 19483385.0, - "step": 2179 - }, - { - "epoch": 1.6565349544072947, - "grad_norm": 2.1158151626586914, - "learning_rate": 2.2782857885865538e-06, - "loss": 0.4726812243461609, - "mean_token_accuracy": 0.8384029865264893, - "num_tokens": 19491367.0, - "step": 2180 - }, - { - "epoch": 1.6572948328267478, - "grad_norm": 3.301389694213867, - "learning_rate": 2.2761997241575335e-06, - "loss": 0.37664809823036194, - "mean_token_accuracy": 0.8913813829421997, - "num_tokens": 19494876.0, - "step": 2181 - }, - { - "epoch": 1.6580547112462005, - "grad_norm": 2.2964162826538086, - "learning_rate": 2.274113816800161e-06, - "loss": 0.4110721945762634, - "mean_token_accuracy": 0.8551756143569946, - "num_tokens": 19500546.0, - "step": 2182 - }, - { - "epoch": 1.6588145896656536, - "grad_norm": 3.368161916732788, - "learning_rate": 2.272028067978408e-06, - "loss": 0.39089250564575195, - "mean_token_accuracy": 0.8786845207214355, - "num_tokens": 19504142.0, - "step": 2183 - }, - { - "epoch": 1.6595744680851063, - "grad_norm": 1.7299834489822388, - "learning_rate": 2.2699424791561324e-06, - "loss": 0.5205090641975403, - "mean_token_accuracy": 0.8394201993942261, - "num_tokens": 19514523.0, - "step": 2184 - }, - { - "epoch": 1.6603343465045592, - "grad_norm": 2.045919418334961, - "learning_rate": 2.267857051797081e-06, - "loss": 0.49093255400657654, - "mean_token_accuracy": 0.8338311910629272, - "num_tokens": 19522439.0, - "step": 2185 - }, - { - "epoch": 1.6610942249240122, - "grad_norm": 1.2035714387893677, - "learning_rate": 2.265771787364886e-06, - "loss": 0.37247753143310547, - "mean_token_accuracy": 0.8873692750930786, - "num_tokens": 19536717.0, - "step": 2186 - }, - { - "epoch": 1.661854103343465, - "grad_norm": 2.6186633110046387, - "learning_rate": 2.263686687323068e-06, - "loss": 0.3318040370941162, - "mean_token_accuracy": 0.8720577955245972, - "num_tokens": 19541966.0, - "step": 2187 - }, - { - "epoch": 1.662613981762918, - "grad_norm": 2.6845929622650146, - "learning_rate": 2.261601753135029e-06, - "loss": 0.32441991567611694, - "mean_token_accuracy": 0.8700553178787231, - "num_tokens": 19546644.0, - "step": 2188 - }, - { - "epoch": 1.6633738601823707, - "grad_norm": 2.078998327255249, - "learning_rate": 2.259516986264057e-06, - "loss": 0.3424156904220581, - "mean_token_accuracy": 0.8707810044288635, - "num_tokens": 19553472.0, - "step": 2189 - }, - { - "epoch": 1.6641337386018238, - "grad_norm": 2.380747079849243, - "learning_rate": 2.2574323881733202e-06, - "loss": 0.4994799494743347, - "mean_token_accuracy": 0.817003607749939, - "num_tokens": 19560502.0, - "step": 2190 - }, - { - "epoch": 1.6648936170212765, - "grad_norm": 1.2984378337860107, - "learning_rate": 2.255347960325871e-06, - "loss": 0.33139657974243164, - "mean_token_accuracy": 0.8763977289199829, - "num_tokens": 19575624.0, - "step": 2191 - }, - { - "epoch": 1.6656534954407296, - "grad_norm": 1.3232799768447876, - "learning_rate": 2.2532637041846423e-06, - "loss": 0.32994017004966736, - "mean_token_accuracy": 0.8790634274482727, - "num_tokens": 19588636.0, - "step": 2192 - }, - { - "epoch": 1.6664133738601823, - "grad_norm": 2.11212158203125, - "learning_rate": 2.2511796212124424e-06, - "loss": 0.3140082359313965, - "mean_token_accuracy": 0.8946622014045715, - "num_tokens": 19594917.0, - "step": 2193 - }, - { - "epoch": 1.6671732522796354, - "grad_norm": 2.7206521034240723, - "learning_rate": 2.2490957128719627e-06, - "loss": 0.3723612427711487, - "mean_token_accuracy": 0.8781955242156982, - "num_tokens": 19599310.0, - "step": 2194 - }, - { - "epoch": 1.667933130699088, - "grad_norm": 2.6681952476501465, - "learning_rate": 2.247011980625771e-06, - "loss": 0.3740317225456238, - "mean_token_accuracy": 0.8780536651611328, - "num_tokens": 19604172.0, - "step": 2195 - }, - { - "epoch": 1.668693009118541, - "grad_norm": 1.8933384418487549, - "learning_rate": 2.2449284259363093e-06, - "loss": 0.3359421491622925, - "mean_token_accuracy": 0.8785334825515747, - "num_tokens": 19612030.0, - "step": 2196 - }, - { - "epoch": 1.669452887537994, - "grad_norm": 2.4779889583587646, - "learning_rate": 2.2428450502658964e-06, - "loss": 0.3724144399166107, - "mean_token_accuracy": 0.8739810585975647, - "num_tokens": 19617800.0, - "step": 2197 - }, - { - "epoch": 1.6702127659574468, - "grad_norm": 3.0661120414733887, - "learning_rate": 2.240761855076727e-06, - "loss": 0.3627531826496124, - "mean_token_accuracy": 0.865296483039856, - "num_tokens": 19621885.0, - "step": 2198 - }, - { - "epoch": 1.6709726443768997, - "grad_norm": 2.431708574295044, - "learning_rate": 2.238678841830867e-06, - "loss": 0.31396129727363586, - "mean_token_accuracy": 0.9026765823364258, - "num_tokens": 19627122.0, - "step": 2199 - }, - { - "epoch": 1.6717325227963524, - "grad_norm": 2.5498745441436768, - "learning_rate": 2.2365960119902543e-06, - "loss": 0.3193191885948181, - "mean_token_accuracy": 0.8750600218772888, - "num_tokens": 19631771.0, - "step": 2200 - }, - { - "epoch": 1.6724924012158056, - "grad_norm": 2.0419046878814697, - "learning_rate": 2.2345133670167e-06, - "loss": 0.32747960090637207, - "mean_token_accuracy": 0.8603148460388184, - "num_tokens": 19638972.0, - "step": 2201 - }, - { - "epoch": 1.6732522796352582, - "grad_norm": 2.0412306785583496, - "learning_rate": 2.232430908371885e-06, - "loss": 0.4701780676841736, - "mean_token_accuracy": 0.8318476676940918, - "num_tokens": 19647968.0, - "step": 2202 - }, - { - "epoch": 1.6740121580547114, - "grad_norm": 2.054070472717285, - "learning_rate": 2.2303486375173586e-06, - "loss": 0.33284813165664673, - "mean_token_accuracy": 0.8760920763015747, - "num_tokens": 19654032.0, - "step": 2203 - }, - { - "epoch": 1.674772036474164, - "grad_norm": 1.6053217649459839, - "learning_rate": 2.228266555914538e-06, - "loss": 0.34431374073028564, - "mean_token_accuracy": 0.8764770030975342, - "num_tokens": 19663785.0, - "step": 2204 - }, - { - "epoch": 1.675531914893617, - "grad_norm": 1.474494457244873, - "learning_rate": 2.2261846650247077e-06, - "loss": 0.3541037440299988, - "mean_token_accuracy": 0.8782497644424438, - "num_tokens": 19675498.0, - "step": 2205 - }, - { - "epoch": 1.6762917933130699, - "grad_norm": 1.9318026304244995, - "learning_rate": 2.224102966309021e-06, - "loss": 0.4291660189628601, - "mean_token_accuracy": 0.8424201607704163, - "num_tokens": 19684576.0, - "step": 2206 - }, - { - "epoch": 1.6770516717325228, - "grad_norm": 2.2150020599365234, - "learning_rate": 2.2220214612284925e-06, - "loss": 0.46187907457351685, - "mean_token_accuracy": 0.840459942817688, - "num_tokens": 19690412.0, - "step": 2207 - }, - { - "epoch": 1.6778115501519757, - "grad_norm": 1.667281150817871, - "learning_rate": 2.2199401512440037e-06, - "loss": 0.37440744042396545, - "mean_token_accuracy": 0.8694081902503967, - "num_tokens": 19699600.0, - "step": 2208 - }, - { - "epoch": 1.6785714285714286, - "grad_norm": 2.6446619033813477, - "learning_rate": 2.2178590378162957e-06, - "loss": 0.3301953077316284, - "mean_token_accuracy": 0.8992182016372681, - "num_tokens": 19704162.0, - "step": 2209 - }, - { - "epoch": 1.6793313069908815, - "grad_norm": 1.4266780614852905, - "learning_rate": 2.215778122405977e-06, - "loss": 0.3811204135417938, - "mean_token_accuracy": 0.861638069152832, - "num_tokens": 19716511.0, - "step": 2210 - }, - { - "epoch": 1.6800911854103342, - "grad_norm": 1.826087474822998, - "learning_rate": 2.2136974064735132e-06, - "loss": 0.4790012836456299, - "mean_token_accuracy": 0.8404909372329712, - "num_tokens": 19726645.0, - "step": 2211 - }, - { - "epoch": 1.6808510638297873, - "grad_norm": 1.8551808595657349, - "learning_rate": 2.2116168914792293e-06, - "loss": 0.40999075770378113, - "mean_token_accuracy": 0.8419463634490967, - "num_tokens": 19735601.0, - "step": 2212 - }, - { - "epoch": 1.68161094224924, - "grad_norm": 2.560124158859253, - "learning_rate": 2.209536578883313e-06, - "loss": 0.43428558111190796, - "mean_token_accuracy": 0.8689159750938416, - "num_tokens": 19741138.0, - "step": 2213 - }, - { - "epoch": 1.6823708206686931, - "grad_norm": 2.0154869556427, - "learning_rate": 2.207456470145807e-06, - "loss": 0.43633338809013367, - "mean_token_accuracy": 0.8646916151046753, - "num_tokens": 19751929.0, - "step": 2214 - }, - { - "epoch": 1.6831306990881458, - "grad_norm": 1.3583155870437622, - "learning_rate": 2.205376566726611e-06, - "loss": 0.3050280511379242, - "mean_token_accuracy": 0.8998798727989197, - "num_tokens": 19764012.0, - "step": 2215 - }, - { - "epoch": 1.6838905775075987, - "grad_norm": 1.266262173652649, - "learning_rate": 2.2032968700854813e-06, - "loss": 0.4039713144302368, - "mean_token_accuracy": 0.8571382164955139, - "num_tokens": 19780683.0, - "step": 2216 - }, - { - "epoch": 1.6846504559270516, - "grad_norm": 1.864356517791748, - "learning_rate": 2.2012173816820297e-06, - "loss": 0.361503541469574, - "mean_token_accuracy": 0.868161678314209, - "num_tokens": 19788907.0, - "step": 2217 - }, - { - "epoch": 1.6854103343465046, - "grad_norm": 1.320155382156372, - "learning_rate": 2.1991381029757216e-06, - "loss": 0.28228244185447693, - "mean_token_accuracy": 0.8945217132568359, - "num_tokens": 19800354.0, - "step": 2218 - }, - { - "epoch": 1.6861702127659575, - "grad_norm": 1.9706367254257202, - "learning_rate": 2.1970590354258745e-06, - "loss": 0.2849377989768982, - "mean_token_accuracy": 0.9065699577331543, - "num_tokens": 19806735.0, - "step": 2219 - }, - { - "epoch": 1.6869300911854104, - "grad_norm": 1.9150370359420776, - "learning_rate": 2.1949801804916563e-06, - "loss": 0.4125257730484009, - "mean_token_accuracy": 0.8642163872718811, - "num_tokens": 19814056.0, - "step": 2220 - }, - { - "epoch": 1.6876899696048633, - "grad_norm": 2.062589645385742, - "learning_rate": 2.19290153963209e-06, - "loss": 0.451707124710083, - "mean_token_accuracy": 0.8311163187026978, - "num_tokens": 19821263.0, - "step": 2221 - }, - { - "epoch": 1.688449848024316, - "grad_norm": 1.3959208726882935, - "learning_rate": 2.190823114306045e-06, - "loss": 0.3326707184314728, - "mean_token_accuracy": 0.9037837982177734, - "num_tokens": 19835163.0, - "step": 2222 - }, - { - "epoch": 1.689209726443769, - "grad_norm": 2.09995698928833, - "learning_rate": 2.188744905972239e-06, - "loss": 0.4144105315208435, - "mean_token_accuracy": 0.8512029051780701, - "num_tokens": 19843164.0, - "step": 2223 - }, - { - "epoch": 1.6899696048632218, - "grad_norm": 1.4759427309036255, - "learning_rate": 2.186666916089239e-06, - "loss": 0.4707002639770508, - "mean_token_accuracy": 0.8371601104736328, - "num_tokens": 19858551.0, - "step": 2224 - }, - { - "epoch": 1.690729483282675, - "grad_norm": 2.3398702144622803, - "learning_rate": 2.1845891461154604e-06, - "loss": 0.34672820568084717, - "mean_token_accuracy": 0.879936695098877, - "num_tokens": 19864348.0, - "step": 2225 - }, - { - "epoch": 1.6914893617021276, - "grad_norm": 1.6283963918685913, - "learning_rate": 2.1825115975091594e-06, - "loss": 0.31835079193115234, - "mean_token_accuracy": 0.8695961833000183, - "num_tokens": 19873560.0, - "step": 2226 - }, - { - "epoch": 1.6922492401215805, - "grad_norm": 2.035759687423706, - "learning_rate": 2.1804342717284414e-06, - "loss": 0.43110257387161255, - "mean_token_accuracy": 0.8593922853469849, - "num_tokens": 19880796.0, - "step": 2227 - }, - { - "epoch": 1.6930091185410334, - "grad_norm": 2.1340725421905518, - "learning_rate": 2.1783571702312523e-06, - "loss": 0.46967440843582153, - "mean_token_accuracy": 0.8839266300201416, - "num_tokens": 19887911.0, - "step": 2228 - }, - { - "epoch": 1.6937689969604863, - "grad_norm": 1.710340142250061, - "learning_rate": 2.176280294475383e-06, - "loss": 0.4167519807815552, - "mean_token_accuracy": 0.8526116609573364, - "num_tokens": 19896674.0, - "step": 2229 - }, - { - "epoch": 1.6945288753799392, - "grad_norm": 1.7793304920196533, - "learning_rate": 2.174203645918464e-06, - "loss": 0.3875434994697571, - "mean_token_accuracy": 0.8637192249298096, - "num_tokens": 19904825.0, - "step": 2230 - }, - { - "epoch": 1.6952887537993921, - "grad_norm": 1.7908778190612793, - "learning_rate": 2.172127226017967e-06, - "loss": 0.42065349221229553, - "mean_token_accuracy": 0.850834846496582, - "num_tokens": 19914377.0, - "step": 2231 - }, - { - "epoch": 1.696048632218845, - "grad_norm": 3.0943970680236816, - "learning_rate": 2.1700510362312053e-06, - "loss": 0.44845050573349, - "mean_token_accuracy": 0.8460367918014526, - "num_tokens": 19918929.0, - "step": 2232 - }, - { - "epoch": 1.6968085106382977, - "grad_norm": 1.5586018562316895, - "learning_rate": 2.1679750780153265e-06, - "loss": 0.4723482131958008, - "mean_token_accuracy": 0.871384859085083, - "num_tokens": 19932738.0, - "step": 2233 - }, - { - "epoch": 1.6975683890577509, - "grad_norm": 2.014230728149414, - "learning_rate": 2.1658993528273196e-06, - "loss": 0.43307146430015564, - "mean_token_accuracy": 0.8677935600280762, - "num_tokens": 19940246.0, - "step": 2234 - }, - { - "epoch": 1.6983282674772036, - "grad_norm": 1.528979778289795, - "learning_rate": 2.163823862124007e-06, - "loss": 0.3897377550601959, - "mean_token_accuracy": 0.8737689256668091, - "num_tokens": 19951187.0, - "step": 2235 - }, - { - "epoch": 1.6990881458966567, - "grad_norm": 1.9856207370758057, - "learning_rate": 2.1617486073620496e-06, - "loss": 0.4285745620727539, - "mean_token_accuracy": 0.8744081258773804, - "num_tokens": 19957768.0, - "step": 2236 - }, - { - "epoch": 1.6998480243161094, - "grad_norm": 2.130525827407837, - "learning_rate": 2.15967358999794e-06, - "loss": 0.405293732881546, - "mean_token_accuracy": 0.8588452935218811, - "num_tokens": 19965354.0, - "step": 2237 - }, - { - "epoch": 1.7006079027355623, - "grad_norm": 1.665329098701477, - "learning_rate": 2.1575988114880057e-06, - "loss": 0.42987754940986633, - "mean_token_accuracy": 0.846322238445282, - "num_tokens": 19975780.0, - "step": 2238 - }, - { - "epoch": 1.7013677811550152, - "grad_norm": 1.0725677013397217, - "learning_rate": 2.155524273288405e-06, - "loss": 0.31892159581184387, - "mean_token_accuracy": 0.8692483305931091, - "num_tokens": 19995875.0, - "step": 2239 - }, - { - "epoch": 1.702127659574468, - "grad_norm": 2.282604455947876, - "learning_rate": 2.15344997685513e-06, - "loss": 0.4460654556751251, - "mean_token_accuracy": 0.8623759746551514, - "num_tokens": 20001466.0, - "step": 2240 - }, - { - "epoch": 1.702887537993921, - "grad_norm": 1.1385949850082397, - "learning_rate": 2.1513759236440024e-06, - "loss": 0.37046104669570923, - "mean_token_accuracy": 0.8637164831161499, - "num_tokens": 20020998.0, - "step": 2241 - }, - { - "epoch": 1.7036474164133737, - "grad_norm": 1.5521315336227417, - "learning_rate": 2.1493021151106704e-06, - "loss": 0.4526556134223938, - "mean_token_accuracy": 0.8675785064697266, - "num_tokens": 20032750.0, - "step": 2242 - }, - { - "epoch": 1.7044072948328268, - "grad_norm": 1.7777446508407593, - "learning_rate": 2.147228552710614e-06, - "loss": 0.41294580698013306, - "mean_token_accuracy": 0.8597785234451294, - "num_tokens": 20041901.0, - "step": 2243 - }, - { - "epoch": 1.7051671732522795, - "grad_norm": 1.5157700777053833, - "learning_rate": 2.145155237899139e-06, - "loss": 0.4158926010131836, - "mean_token_accuracy": 0.8512611985206604, - "num_tokens": 20053705.0, - "step": 2244 - }, - { - "epoch": 1.7059270516717326, - "grad_norm": 1.5116809606552124, - "learning_rate": 2.143082172131378e-06, - "loss": 0.43943172693252563, - "mean_token_accuracy": 0.8429899215698242, - "num_tokens": 20069468.0, - "step": 2245 - }, - { - "epoch": 1.7066869300911853, - "grad_norm": 1.6095285415649414, - "learning_rate": 2.141009356862288e-06, - "loss": 0.41325604915618896, - "mean_token_accuracy": 0.8832963705062866, - "num_tokens": 20080596.0, - "step": 2246 - }, - { - "epoch": 1.7074468085106385, - "grad_norm": 1.39210844039917, - "learning_rate": 2.138936793546649e-06, - "loss": 0.3945302963256836, - "mean_token_accuracy": 0.8698325753211975, - "num_tokens": 20094158.0, - "step": 2247 - }, - { - "epoch": 1.7082066869300911, - "grad_norm": 2.9576594829559326, - "learning_rate": 2.1368644836390684e-06, - "loss": 0.16507276892662048, - "mean_token_accuracy": 0.9410445690155029, - "num_tokens": 20097002.0, - "step": 2248 - }, - { - "epoch": 1.708966565349544, - "grad_norm": 1.7631266117095947, - "learning_rate": 2.134792428593971e-06, - "loss": 0.519780695438385, - "mean_token_accuracy": 0.8276066780090332, - "num_tokens": 20107947.0, - "step": 2249 - }, - { - "epoch": 1.709726443768997, - "grad_norm": 2.144636869430542, - "learning_rate": 2.1327206298656055e-06, - "loss": 0.32923734188079834, - "mean_token_accuracy": 0.8766019344329834, - "num_tokens": 20113676.0, - "step": 2250 - }, - { - "epoch": 1.7104863221884499, - "grad_norm": 1.9511034488677979, - "learning_rate": 2.130649088908041e-06, - "loss": 0.4043842554092407, - "mean_token_accuracy": 0.8525843620300293, - "num_tokens": 20120787.0, - "step": 2251 - }, - { - "epoch": 1.7112462006079028, - "grad_norm": 1.5001336336135864, - "learning_rate": 2.1285778071751638e-06, - "loss": 0.4800187051296234, - "mean_token_accuracy": 0.8398486375808716, - "num_tokens": 20133534.0, - "step": 2252 - }, - { - "epoch": 1.7120060790273555, - "grad_norm": 1.435195803642273, - "learning_rate": 2.126506786120678e-06, - "loss": 0.44489604234695435, - "mean_token_accuracy": 0.8444881439208984, - "num_tokens": 20151787.0, - "step": 2253 - }, - { - "epoch": 1.7127659574468086, - "grad_norm": 1.3056137561798096, - "learning_rate": 2.1244360271981073e-06, - "loss": 0.300567090511322, - "mean_token_accuracy": 0.8903113007545471, - "num_tokens": 20163390.0, - "step": 2254 - }, - { - "epoch": 1.7135258358662613, - "grad_norm": 1.7347925901412964, - "learning_rate": 2.1223655318607907e-06, - "loss": 0.30601179599761963, - "mean_token_accuracy": 0.8845717906951904, - "num_tokens": 20171354.0, - "step": 2255 - }, - { - "epoch": 1.7142857142857144, - "grad_norm": 1.316306471824646, - "learning_rate": 2.1202953015618794e-06, - "loss": 0.3972984552383423, - "mean_token_accuracy": 0.845410943031311, - "num_tokens": 20184464.0, - "step": 2256 - }, - { - "epoch": 1.715045592705167, - "grad_norm": 2.1052892208099365, - "learning_rate": 2.1182253377543428e-06, - "loss": 0.3357020616531372, - "mean_token_accuracy": 0.8853542804718018, - "num_tokens": 20190539.0, - "step": 2257 - }, - { - "epoch": 1.71580547112462, - "grad_norm": 1.4192553758621216, - "learning_rate": 2.116155641890959e-06, - "loss": 0.3881692588329315, - "mean_token_accuracy": 0.8442144989967346, - "num_tokens": 20204570.0, - "step": 2258 - }, - { - "epoch": 1.716565349544073, - "grad_norm": 2.134113311767578, - "learning_rate": 2.1140862154243223e-06, - "loss": 0.37803274393081665, - "mean_token_accuracy": 0.8703107237815857, - "num_tokens": 20210535.0, - "step": 2259 - }, - { - "epoch": 1.7173252279635258, - "grad_norm": 2.9149155616760254, - "learning_rate": 2.1120170598068353e-06, - "loss": 0.34860676527023315, - "mean_token_accuracy": 0.8734345436096191, - "num_tokens": 20214375.0, - "step": 2260 - }, - { - "epoch": 1.7180851063829787, - "grad_norm": 1.6855589151382446, - "learning_rate": 2.109948176490711e-06, - "loss": 0.3676984906196594, - "mean_token_accuracy": 0.8531560301780701, - "num_tokens": 20223791.0, - "step": 2261 - }, - { - "epoch": 1.7188449848024316, - "grad_norm": 2.09671950340271, - "learning_rate": 2.10787956692797e-06, - "loss": 0.41744115948677063, - "mean_token_accuracy": 0.8570001125335693, - "num_tokens": 20231254.0, - "step": 2262 - }, - { - "epoch": 1.7196048632218845, - "grad_norm": 3.148813009262085, - "learning_rate": 2.1058112325704436e-06, - "loss": 0.20556189119815826, - "mean_token_accuracy": 0.926898717880249, - "num_tokens": 20234470.0, - "step": 2263 - }, - { - "epoch": 1.7203647416413372, - "grad_norm": 1.9707107543945312, - "learning_rate": 2.103743174869769e-06, - "loss": 0.40733110904693604, - "mean_token_accuracy": 0.8740406036376953, - "num_tokens": 20242286.0, - "step": 2264 - }, - { - "epoch": 1.7211246200607904, - "grad_norm": 1.2756069898605347, - "learning_rate": 2.1016753952773867e-06, - "loss": 0.3940718173980713, - "mean_token_accuracy": 0.860906720161438, - "num_tokens": 20260382.0, - "step": 2265 - }, - { - "epoch": 1.721884498480243, - "grad_norm": 1.5074653625488281, - "learning_rate": 2.0996078952445453e-06, - "loss": 0.3353617191314697, - "mean_token_accuracy": 0.8809853792190552, - "num_tokens": 20271665.0, - "step": 2266 - }, - { - "epoch": 1.7226443768996962, - "grad_norm": 1.4331210851669312, - "learning_rate": 2.0975406762222966e-06, - "loss": 0.32260069251060486, - "mean_token_accuracy": 0.901330828666687, - "num_tokens": 20283122.0, - "step": 2267 - }, - { - "epoch": 1.7234042553191489, - "grad_norm": 2.2378969192504883, - "learning_rate": 2.095473739661494e-06, - "loss": 0.39086243510246277, - "mean_token_accuracy": 0.8681687116622925, - "num_tokens": 20289243.0, - "step": 2268 - }, - { - "epoch": 1.7241641337386018, - "grad_norm": 2.754582405090332, - "learning_rate": 2.093407087012791e-06, - "loss": 0.42927244305610657, - "mean_token_accuracy": 0.8594136834144592, - "num_tokens": 20294537.0, - "step": 2269 - }, - { - "epoch": 1.7249240121580547, - "grad_norm": 2.2721824645996094, - "learning_rate": 2.091340719726647e-06, - "loss": 0.42479783296585083, - "mean_token_accuracy": 0.8411722183227539, - "num_tokens": 20301502.0, - "step": 2270 - }, - { - "epoch": 1.7256838905775076, - "grad_norm": 2.3230299949645996, - "learning_rate": 2.089274639253317e-06, - "loss": 0.4218963384628296, - "mean_token_accuracy": 0.8498032093048096, - "num_tokens": 20307710.0, - "step": 2271 - }, - { - "epoch": 1.7264437689969605, - "grad_norm": 2.3499748706817627, - "learning_rate": 2.0872088470428553e-06, - "loss": 0.4472277760505676, - "mean_token_accuracy": 0.8487255573272705, - "num_tokens": 20313945.0, - "step": 2272 - }, - { - "epoch": 1.7272036474164134, - "grad_norm": 1.3709690570831299, - "learning_rate": 2.0851433445451142e-06, - "loss": 0.38701117038726807, - "mean_token_accuracy": 0.8592075109481812, - "num_tokens": 20328023.0, - "step": 2273 - }, - { - "epoch": 1.7279635258358663, - "grad_norm": 1.1293425559997559, - "learning_rate": 2.0830781332097446e-06, - "loss": 0.34000539779663086, - "mean_token_accuracy": 0.8779317140579224, - "num_tokens": 20346767.0, - "step": 2274 - }, - { - "epoch": 1.728723404255319, - "grad_norm": 2.9770123958587646, - "learning_rate": 2.08101321448619e-06, - "loss": 0.4437636733055115, - "mean_token_accuracy": 0.8398602604866028, - "num_tokens": 20352306.0, - "step": 2275 - }, - { - "epoch": 1.7294832826747721, - "grad_norm": 3.510955572128296, - "learning_rate": 2.0789485898236897e-06, - "loss": 0.3359706401824951, - "mean_token_accuracy": 0.8872498273849487, - "num_tokens": 20355560.0, - "step": 2276 - }, - { - "epoch": 1.7302431610942248, - "grad_norm": 2.0873279571533203, - "learning_rate": 2.076884260671276e-06, - "loss": 0.38720619678497314, - "mean_token_accuracy": 0.865881621837616, - "num_tokens": 20362802.0, - "step": 2277 - }, - { - "epoch": 1.731003039513678, - "grad_norm": 2.4871230125427246, - "learning_rate": 2.0748202284777775e-06, - "loss": 0.3250775933265686, - "mean_token_accuracy": 0.8867610692977905, - "num_tokens": 20367080.0, - "step": 2278 - }, - { - "epoch": 1.7317629179331306, - "grad_norm": 3.5603582859039307, - "learning_rate": 2.072756494691809e-06, - "loss": 0.35600754618644714, - "mean_token_accuracy": 0.8781189918518066, - "num_tokens": 20370625.0, - "step": 2279 - }, - { - "epoch": 1.7325227963525835, - "grad_norm": 2.0948755741119385, - "learning_rate": 2.070693060761779e-06, - "loss": 0.3558604419231415, - "mean_token_accuracy": 0.902066707611084, - "num_tokens": 20376835.0, - "step": 2280 - }, - { - "epoch": 1.7332826747720365, - "grad_norm": 2.391188859939575, - "learning_rate": 2.0686299281358837e-06, - "loss": 0.36596938967704773, - "mean_token_accuracy": 0.8741272687911987, - "num_tokens": 20382282.0, - "step": 2281 - }, - { - "epoch": 1.7340425531914894, - "grad_norm": 1.6906369924545288, - "learning_rate": 2.0665670982621107e-06, - "loss": 0.5241266489028931, - "mean_token_accuracy": 0.8091107606887817, - "num_tokens": 20393736.0, - "step": 2282 - }, - { - "epoch": 1.7348024316109423, - "grad_norm": 1.7578394412994385, - "learning_rate": 2.0645045725882334e-06, - "loss": 0.37041786313056946, - "mean_token_accuracy": 0.8907113075256348, - "num_tokens": 20402715.0, - "step": 2283 - }, - { - "epoch": 1.7355623100303952, - "grad_norm": 2.191727638244629, - "learning_rate": 2.0624423525618097e-06, - "loss": 0.43301627039909363, - "mean_token_accuracy": 0.8706433773040771, - "num_tokens": 20409976.0, - "step": 2284 - }, - { - "epoch": 1.736322188449848, - "grad_norm": 1.958005666732788, - "learning_rate": 2.0603804396301875e-06, - "loss": 0.29002684354782104, - "mean_token_accuracy": 0.8914110660552979, - "num_tokens": 20417099.0, - "step": 2285 - }, - { - "epoch": 1.7370820668693008, - "grad_norm": 2.477837085723877, - "learning_rate": 2.058318835240495e-06, - "loss": 0.2953898310661316, - "mean_token_accuracy": 0.8975275754928589, - "num_tokens": 20422251.0, - "step": 2286 - }, - { - "epoch": 1.737841945288754, - "grad_norm": 2.156764268875122, - "learning_rate": 2.0562575408396475e-06, - "loss": 0.4063698649406433, - "mean_token_accuracy": 0.8497642278671265, - "num_tokens": 20429338.0, - "step": 2287 - }, - { - "epoch": 1.7386018237082066, - "grad_norm": 1.6748939752578735, - "learning_rate": 2.0541965578743373e-06, - "loss": 0.3272587060928345, - "mean_token_accuracy": 0.8646700382232666, - "num_tokens": 20439680.0, - "step": 2288 - }, - { - "epoch": 1.7393617021276597, - "grad_norm": 1.9948776960372925, - "learning_rate": 2.0521358877910446e-06, - "loss": 0.36843347549438477, - "mean_token_accuracy": 0.8613901138305664, - "num_tokens": 20448492.0, - "step": 2289 - }, - { - "epoch": 1.7401215805471124, - "grad_norm": 2.231428623199463, - "learning_rate": 2.0500755320360263e-06, - "loss": 0.3905152380466461, - "mean_token_accuracy": 0.8980990052223206, - "num_tokens": 20453945.0, - "step": 2290 - }, - { - "epoch": 1.7408814589665653, - "grad_norm": 2.2187650203704834, - "learning_rate": 2.048015492055319e-06, - "loss": 0.45920854806900024, - "mean_token_accuracy": 0.8282852172851562, - "num_tokens": 20462378.0, - "step": 2291 - }, - { - "epoch": 1.7416413373860182, - "grad_norm": 2.0668466091156006, - "learning_rate": 2.045955769294737e-06, - "loss": 0.3227751553058624, - "mean_token_accuracy": 0.8805934190750122, - "num_tokens": 20469822.0, - "step": 2292 - }, - { - "epoch": 1.7424012158054711, - "grad_norm": 1.9162774085998535, - "learning_rate": 2.0438963651998747e-06, - "loss": 0.4604800343513489, - "mean_token_accuracy": 0.8441175818443298, - "num_tokens": 20479099.0, - "step": 2293 - }, - { - "epoch": 1.743161094224924, - "grad_norm": 2.645329713821411, - "learning_rate": 2.0418372812161015e-06, - "loss": 0.3239654004573822, - "mean_token_accuracy": 0.8888648748397827, - "num_tokens": 20483926.0, - "step": 2294 - }, - { - "epoch": 1.743920972644377, - "grad_norm": 1.39468514919281, - "learning_rate": 2.03977851878856e-06, - "loss": 0.4003690183162689, - "mean_token_accuracy": 0.8769714832305908, - "num_tokens": 20496501.0, - "step": 2295 - }, - { - "epoch": 1.7446808510638299, - "grad_norm": 3.509174346923828, - "learning_rate": 2.0377200793621694e-06, - "loss": 0.2948213517665863, - "mean_token_accuracy": 0.8972329497337341, - "num_tokens": 20500000.0, - "step": 2296 - }, - { - "epoch": 1.7454407294832825, - "grad_norm": 1.5033894777297974, - "learning_rate": 2.0356619643816234e-06, - "loss": 0.40694737434387207, - "mean_token_accuracy": 0.8607243895530701, - "num_tokens": 20513473.0, - "step": 2297 - }, - { - "epoch": 1.7462006079027357, - "grad_norm": 1.4324895143508911, - "learning_rate": 2.0336041752913843e-06, - "loss": 0.3899157643318176, - "mean_token_accuracy": 0.858935534954071, - "num_tokens": 20524516.0, - "step": 2298 - }, - { - "epoch": 1.7469604863221884, - "grad_norm": 2.359544277191162, - "learning_rate": 2.031546713535688e-06, - "loss": 0.369213342666626, - "mean_token_accuracy": 0.8741403818130493, - "num_tokens": 20530421.0, - "step": 2299 - }, - { - "epoch": 1.7477203647416415, - "grad_norm": 2.282637357711792, - "learning_rate": 2.029489580558542e-06, - "loss": 0.3255441188812256, - "mean_token_accuracy": 0.9045462608337402, - "num_tokens": 20535954.0, - "step": 2300 - }, - { - "epoch": 1.7484802431610942, - "grad_norm": 1.7367198467254639, - "learning_rate": 2.0274327778037204e-06, - "loss": 0.43890488147735596, - "mean_token_accuracy": 0.8494667410850525, - "num_tokens": 20548638.0, - "step": 2301 - }, - { - "epoch": 1.749240121580547, - "grad_norm": 1.6236488819122314, - "learning_rate": 2.0253763067147657e-06, - "loss": 0.4440777897834778, - "mean_token_accuracy": 0.8414230942726135, - "num_tokens": 20559263.0, - "step": 2302 - }, - { - "epoch": 1.75, - "grad_norm": 1.3755455017089844, - "learning_rate": 2.0233201687349888e-06, - "loss": 0.3473797142505646, - "mean_token_accuracy": 0.8742472529411316, - "num_tokens": 20573109.0, - "step": 2303 - }, - { - "epoch": 1.750759878419453, - "grad_norm": 3.271153688430786, - "learning_rate": 2.0212643653074677e-06, - "loss": 0.4965784549713135, - "mean_token_accuracy": 0.8596988916397095, - "num_tokens": 20578525.0, - "step": 2304 - }, - { - "epoch": 1.7515197568389058, - "grad_norm": 2.6341168880462646, - "learning_rate": 2.019208897875043e-06, - "loss": 0.37775442004203796, - "mean_token_accuracy": 0.8721816539764404, - "num_tokens": 20583641.0, - "step": 2305 - }, - { - "epoch": 1.7522796352583585, - "grad_norm": 1.8308569192886353, - "learning_rate": 2.0171537678803222e-06, - "loss": 0.3243415355682373, - "mean_token_accuracy": 0.8837124109268188, - "num_tokens": 20591725.0, - "step": 2306 - }, - { - "epoch": 1.7530395136778116, - "grad_norm": 2.4362998008728027, - "learning_rate": 2.015098976765673e-06, - "loss": 0.3738787770271301, - "mean_token_accuracy": 0.8974303007125854, - "num_tokens": 20596587.0, - "step": 2307 - }, - { - "epoch": 1.7537993920972643, - "grad_norm": 3.2920920848846436, - "learning_rate": 2.0130445259732282e-06, - "loss": 0.33901530504226685, - "mean_token_accuracy": 0.9019063115119934, - "num_tokens": 20600379.0, - "step": 2308 - }, - { - "epoch": 1.7545592705167175, - "grad_norm": 1.290475606918335, - "learning_rate": 2.01099041694488e-06, - "loss": 0.37150678038597107, - "mean_token_accuracy": 0.8542044758796692, - "num_tokens": 20614340.0, - "step": 2309 - }, - { - "epoch": 1.7553191489361701, - "grad_norm": 2.7794933319091797, - "learning_rate": 2.0089366511222815e-06, - "loss": 0.3746095895767212, - "mean_token_accuracy": 0.8653185367584229, - "num_tokens": 20622056.0, - "step": 2310 - }, - { - "epoch": 1.756079027355623, - "grad_norm": 2.2112278938293457, - "learning_rate": 2.006883229946843e-06, - "loss": 0.35793858766555786, - "mean_token_accuracy": 0.875727653503418, - "num_tokens": 20628930.0, - "step": 2311 - }, - { - "epoch": 1.756838905775076, - "grad_norm": 1.5240603685379028, - "learning_rate": 2.0048301548597365e-06, - "loss": 0.512831449508667, - "mean_token_accuracy": 0.8139172792434692, - "num_tokens": 20643159.0, - "step": 2312 - }, - { - "epoch": 1.7575987841945289, - "grad_norm": 1.810485601425171, - "learning_rate": 2.0027774273018894e-06, - "loss": 0.43870818614959717, - "mean_token_accuracy": 0.8313089609146118, - "num_tokens": 20651914.0, - "step": 2313 - }, - { - "epoch": 1.7583586626139818, - "grad_norm": 1.748178243637085, - "learning_rate": 2.0007250487139827e-06, - "loss": 0.42277514934539795, - "mean_token_accuracy": 0.8463197946548462, - "num_tokens": 20660054.0, - "step": 2314 - }, - { - "epoch": 1.7591185410334347, - "grad_norm": 1.511717677116394, - "learning_rate": 1.998673020536456e-06, - "loss": 0.38304439187049866, - "mean_token_accuracy": 0.8508470058441162, - "num_tokens": 20673371.0, - "step": 2315 - }, - { - "epoch": 1.7598784194528876, - "grad_norm": 1.7790700197219849, - "learning_rate": 1.996621344209503e-06, - "loss": 0.3838311433792114, - "mean_token_accuracy": 0.8676829934120178, - "num_tokens": 20682072.0, - "step": 2316 - }, - { - "epoch": 1.7606382978723403, - "grad_norm": 1.9128468036651611, - "learning_rate": 1.994570021173067e-06, - "loss": 0.40384364128112793, - "mean_token_accuracy": 0.8747294545173645, - "num_tokens": 20689000.0, - "step": 2317 - }, - { - "epoch": 1.7613981762917934, - "grad_norm": 3.286569118499756, - "learning_rate": 1.9925190528668455e-06, - "loss": 0.38019680976867676, - "mean_token_accuracy": 0.8678069114685059, - "num_tokens": 20692763.0, - "step": 2318 - }, - { - "epoch": 1.762158054711246, - "grad_norm": 1.6108927726745605, - "learning_rate": 1.990468440730288e-06, - "loss": 0.3144170045852661, - "mean_token_accuracy": 0.8695170879364014, - "num_tokens": 20702620.0, - "step": 2319 - }, - { - "epoch": 1.7629179331306992, - "grad_norm": 3.185225009918213, - "learning_rate": 1.9884181862025938e-06, - "loss": 0.41619348526000977, - "mean_token_accuracy": 0.8543670177459717, - "num_tokens": 20706857.0, - "step": 2320 - }, - { - "epoch": 1.763677811550152, - "grad_norm": 2.3699469566345215, - "learning_rate": 1.986368290722709e-06, - "loss": 0.5115842819213867, - "mean_token_accuracy": 0.8141909837722778, - "num_tokens": 20713997.0, - "step": 2321 - }, - { - "epoch": 1.7644376899696048, - "grad_norm": 1.4449706077575684, - "learning_rate": 1.9843187557293286e-06, - "loss": 0.419655442237854, - "mean_token_accuracy": 0.8545533418655396, - "num_tokens": 20726548.0, - "step": 2322 - }, - { - "epoch": 1.7651975683890577, - "grad_norm": 2.127614974975586, - "learning_rate": 1.9822695826608975e-06, - "loss": 0.43722522258758545, - "mean_token_accuracy": 0.8542283773422241, - "num_tokens": 20733469.0, - "step": 2323 - }, - { - "epoch": 1.7659574468085106, - "grad_norm": 3.3081557750701904, - "learning_rate": 1.9802207729556023e-06, - "loss": 0.30904972553253174, - "mean_token_accuracy": 0.8896352648735046, - "num_tokens": 20737190.0, - "step": 2324 - }, - { - "epoch": 1.7667173252279635, - "grad_norm": 2.603506326675415, - "learning_rate": 1.978172328051377e-06, - "loss": 0.30952537059783936, - "mean_token_accuracy": 0.8868587017059326, - "num_tokens": 20741780.0, - "step": 2325 - }, - { - "epoch": 1.7674772036474165, - "grad_norm": 2.576824903488159, - "learning_rate": 1.9761242493858987e-06, - "loss": 0.29593953490257263, - "mean_token_accuracy": 0.888198733329773, - "num_tokens": 20746324.0, - "step": 2326 - }, - { - "epoch": 1.7682370820668694, - "grad_norm": 1.6168320178985596, - "learning_rate": 1.9740765383965894e-06, - "loss": 0.5093998908996582, - "mean_token_accuracy": 0.8301646709442139, - "num_tokens": 20760140.0, - "step": 2327 - }, - { - "epoch": 1.768996960486322, - "grad_norm": 2.1162400245666504, - "learning_rate": 1.9720291965206097e-06, - "loss": 0.36714404821395874, - "mean_token_accuracy": 0.8699671626091003, - "num_tokens": 20766961.0, - "step": 2328 - }, - { - "epoch": 1.7697568389057752, - "grad_norm": 1.046911597251892, - "learning_rate": 1.969982225194864e-06, - "loss": 0.40783989429473877, - "mean_token_accuracy": 0.8474892377853394, - "num_tokens": 20786737.0, - "step": 2329 - }, - { - "epoch": 1.7705167173252279, - "grad_norm": 1.7059568166732788, - "learning_rate": 1.9679356258559943e-06, - "loss": 0.44083845615386963, - "mean_token_accuracy": 0.841221034526825, - "num_tokens": 20798907.0, - "step": 2330 - }, - { - "epoch": 1.771276595744681, - "grad_norm": 1.5157767534255981, - "learning_rate": 1.9658893999403847e-06, - "loss": 0.4671107828617096, - "mean_token_accuracy": 0.8252813816070557, - "num_tokens": 20814304.0, - "step": 2331 - }, - { - "epoch": 1.7720364741641337, - "grad_norm": 2.1340525150299072, - "learning_rate": 1.9638435488841543e-06, - "loss": 0.4088709354400635, - "mean_token_accuracy": 0.8595127463340759, - "num_tokens": 20821827.0, - "step": 2332 - }, - { - "epoch": 1.7727963525835866, - "grad_norm": 1.948072910308838, - "learning_rate": 1.96179807412316e-06, - "loss": 0.3692860007286072, - "mean_token_accuracy": 0.8678920269012451, - "num_tokens": 20828612.0, - "step": 2333 - }, - { - "epoch": 1.7735562310030395, - "grad_norm": 1.5731977224349976, - "learning_rate": 1.959752977092995e-06, - "loss": 0.3743135929107666, - "mean_token_accuracy": 0.8723479509353638, - "num_tokens": 20838497.0, - "step": 2334 - }, - { - "epoch": 1.7743161094224924, - "grad_norm": 1.5506012439727783, - "learning_rate": 1.957708259228987e-06, - "loss": 0.4403391182422638, - "mean_token_accuracy": 0.854604959487915, - "num_tokens": 20851603.0, - "step": 2335 - }, - { - "epoch": 1.7750759878419453, - "grad_norm": 1.154336929321289, - "learning_rate": 1.9556639219661983e-06, - "loss": 0.5281188488006592, - "mean_token_accuracy": 0.8101300001144409, - "num_tokens": 20875661.0, - "step": 2336 - }, - { - "epoch": 1.7758358662613982, - "grad_norm": 4.720771312713623, - "learning_rate": 1.9536199667394217e-06, - "loss": 0.44419822096824646, - "mean_token_accuracy": 0.8740090131759644, - "num_tokens": 20886971.0, - "step": 2337 - }, - { - "epoch": 1.7765957446808511, - "grad_norm": 1.5492230653762817, - "learning_rate": 1.9515763949831852e-06, - "loss": 0.4538637697696686, - "mean_token_accuracy": 0.8362185955047607, - "num_tokens": 20899212.0, - "step": 2338 - }, - { - "epoch": 1.7773556231003038, - "grad_norm": 1.354101538658142, - "learning_rate": 1.9495332081317466e-06, - "loss": 0.4341534376144409, - "mean_token_accuracy": 0.8380170464515686, - "num_tokens": 20913065.0, - "step": 2339 - }, - { - "epoch": 1.778115501519757, - "grad_norm": 1.5805599689483643, - "learning_rate": 1.947490407619092e-06, - "loss": 0.40928739309310913, - "mean_token_accuracy": 0.8524469137191772, - "num_tokens": 20922919.0, - "step": 2340 - }, - { - "epoch": 1.7788753799392096, - "grad_norm": 2.097221851348877, - "learning_rate": 1.945447994878937e-06, - "loss": 0.4816104769706726, - "mean_token_accuracy": 0.888654351234436, - "num_tokens": 20931350.0, - "step": 2341 - }, - { - "epoch": 1.7796352583586628, - "grad_norm": 1.7193297147750854, - "learning_rate": 1.9434059713447264e-06, - "loss": 0.44925639033317566, - "mean_token_accuracy": 0.8500319123268127, - "num_tokens": 20940546.0, - "step": 2342 - }, - { - "epoch": 1.7803951367781155, - "grad_norm": 1.5971747636795044, - "learning_rate": 1.9413643384496315e-06, - "loss": 0.29559412598609924, - "mean_token_accuracy": 0.8871279954910278, - "num_tokens": 20950604.0, - "step": 2343 - }, - { - "epoch": 1.7811550151975684, - "grad_norm": 2.788029670715332, - "learning_rate": 1.9393230976265478e-06, - "loss": 0.31713539361953735, - "mean_token_accuracy": 0.8866176605224609, - "num_tokens": 20955296.0, - "step": 2344 - }, - { - "epoch": 1.7819148936170213, - "grad_norm": 1.5747952461242676, - "learning_rate": 1.937282250308096e-06, - "loss": 0.41813358664512634, - "mean_token_accuracy": 0.8418053984642029, - "num_tokens": 20967664.0, - "step": 2345 - }, - { - "epoch": 1.7826747720364742, - "grad_norm": 2.0813145637512207, - "learning_rate": 1.935241797926623e-06, - "loss": 0.39056286215782166, - "mean_token_accuracy": 0.8601781129837036, - "num_tokens": 20975895.0, - "step": 2346 - }, - { - "epoch": 1.783434650455927, - "grad_norm": 2.143022298812866, - "learning_rate": 1.933201741914196e-06, - "loss": 0.40797823667526245, - "mean_token_accuracy": 0.8846398591995239, - "num_tokens": 20983683.0, - "step": 2347 - }, - { - "epoch": 1.78419452887538, - "grad_norm": 1.8451775312423706, - "learning_rate": 1.931162083702606e-06, - "loss": 0.34083136916160583, - "mean_token_accuracy": 0.8643462657928467, - "num_tokens": 20992621.0, - "step": 2348 - }, - { - "epoch": 1.784954407294833, - "grad_norm": 1.8603935241699219, - "learning_rate": 1.9291228247233607e-06, - "loss": 0.4860231280326843, - "mean_token_accuracy": 0.8391251564025879, - "num_tokens": 21002427.0, - "step": 2349 - }, - { - "epoch": 1.7857142857142856, - "grad_norm": 2.751711845397949, - "learning_rate": 1.9270839664076937e-06, - "loss": 0.30588358640670776, - "mean_token_accuracy": 0.8836315274238586, - "num_tokens": 21006898.0, - "step": 2350 - }, - { - "epoch": 1.7864741641337387, - "grad_norm": 1.0335345268249512, - "learning_rate": 1.9250455101865526e-06, - "loss": 0.3119634985923767, - "mean_token_accuracy": 0.8912283182144165, - "num_tokens": 21024930.0, - "step": 2351 - }, - { - "epoch": 1.7872340425531914, - "grad_norm": 2.4693806171417236, - "learning_rate": 1.9230074574906043e-06, - "loss": 0.1976669877767563, - "mean_token_accuracy": 0.928974986076355, - "num_tokens": 21029027.0, - "step": 2352 - }, - { - "epoch": 1.7879939209726445, - "grad_norm": 1.2892690896987915, - "learning_rate": 1.920969809750234e-06, - "loss": 0.46008217334747314, - "mean_token_accuracy": 0.8299605846405029, - "num_tokens": 21047671.0, - "step": 2353 - }, - { - "epoch": 1.7887537993920972, - "grad_norm": 3.162534713745117, - "learning_rate": 1.91893256839554e-06, - "loss": 0.2916071116924286, - "mean_token_accuracy": 0.8932807445526123, - "num_tokens": 21051555.0, - "step": 2354 - }, - { - "epoch": 1.7895136778115501, - "grad_norm": 1.7627713680267334, - "learning_rate": 1.916895734856338e-06, - "loss": 0.3223535120487213, - "mean_token_accuracy": 0.8852578401565552, - "num_tokens": 21060056.0, - "step": 2355 - }, - { - "epoch": 1.790273556231003, - "grad_norm": 1.9448071718215942, - "learning_rate": 1.9148593105621542e-06, - "loss": 0.3650452196598053, - "mean_token_accuracy": 0.8709862232208252, - "num_tokens": 21067190.0, - "step": 2356 - }, - { - "epoch": 1.791033434650456, - "grad_norm": 2.026644229888916, - "learning_rate": 1.9128232969422318e-06, - "loss": 0.3620566427707672, - "mean_token_accuracy": 0.865707516670227, - "num_tokens": 21075197.0, - "step": 2357 - }, - { - "epoch": 1.7917933130699089, - "grad_norm": 2.2628564834594727, - "learning_rate": 1.9107876954255217e-06, - "loss": 0.353444367647171, - "mean_token_accuracy": 0.8590385913848877, - "num_tokens": 21080823.0, - "step": 2358 - }, - { - "epoch": 1.7925531914893615, - "grad_norm": 2.5959067344665527, - "learning_rate": 1.908752507440689e-06, - "loss": 0.43711763620376587, - "mean_token_accuracy": 0.8539710640907288, - "num_tokens": 21086016.0, - "step": 2359 - }, - { - "epoch": 1.7933130699088147, - "grad_norm": 1.6228864192962646, - "learning_rate": 1.906717734416105e-06, - "loss": 0.38630396127700806, - "mean_token_accuracy": 0.8611987829208374, - "num_tokens": 21096573.0, - "step": 2360 - }, - { - "epoch": 1.7940729483282674, - "grad_norm": 1.8471404314041138, - "learning_rate": 1.9046833777798534e-06, - "loss": 0.46608641743659973, - "mean_token_accuracy": 0.8782031536102295, - "num_tokens": 21105817.0, - "step": 2361 - }, - { - "epoch": 1.7948328267477205, - "grad_norm": 2.6532235145568848, - "learning_rate": 1.9026494389597239e-06, - "loss": 0.3310372829437256, - "mean_token_accuracy": 0.8781720399856567, - "num_tokens": 21111192.0, - "step": 2362 - }, - { - "epoch": 1.7955927051671732, - "grad_norm": 2.172534942626953, - "learning_rate": 1.9006159193832124e-06, - "loss": 0.49921661615371704, - "mean_token_accuracy": 0.8215196132659912, - "num_tokens": 21117878.0, - "step": 2363 - }, - { - "epoch": 1.7963525835866263, - "grad_norm": 1.6507720947265625, - "learning_rate": 1.8985828204775206e-06, - "loss": 0.4189162850379944, - "mean_token_accuracy": 0.8520572185516357, - "num_tokens": 21128287.0, - "step": 2364 - }, - { - "epoch": 1.797112462006079, - "grad_norm": 1.5932034254074097, - "learning_rate": 1.8965501436695578e-06, - "loss": 0.45531854033470154, - "mean_token_accuracy": 0.8391242027282715, - "num_tokens": 21140605.0, - "step": 2365 - }, - { - "epoch": 1.797872340425532, - "grad_norm": 2.4680638313293457, - "learning_rate": 1.894517890385933e-06, - "loss": 0.41174983978271484, - "mean_token_accuracy": 0.8616886138916016, - "num_tokens": 21147045.0, - "step": 2366 - }, - { - "epoch": 1.7986322188449848, - "grad_norm": 1.61875319480896, - "learning_rate": 1.8924860620529594e-06, - "loss": 0.47573935985565186, - "mean_token_accuracy": 0.8347671031951904, - "num_tokens": 21157253.0, - "step": 2367 - }, - { - "epoch": 1.7993920972644377, - "grad_norm": 3.4389333724975586, - "learning_rate": 1.8904546600966539e-06, - "loss": 0.34975939989089966, - "mean_token_accuracy": 0.8915865421295166, - "num_tokens": 21160486.0, - "step": 2368 - }, - { - "epoch": 1.8001519756838906, - "grad_norm": 2.0069527626037598, - "learning_rate": 1.888423685942732e-06, - "loss": 0.379585325717926, - "mean_token_accuracy": 0.8605983257293701, - "num_tokens": 21168016.0, - "step": 2369 - }, - { - "epoch": 1.8009118541033433, - "grad_norm": 3.0740530490875244, - "learning_rate": 1.886393141016609e-06, - "loss": 0.5244829058647156, - "mean_token_accuracy": 0.8282772302627563, - "num_tokens": 21172851.0, - "step": 2370 - }, - { - "epoch": 1.8016717325227964, - "grad_norm": 1.5724968910217285, - "learning_rate": 1.8843630267434e-06, - "loss": 0.2020694762468338, - "mean_token_accuracy": 0.8882503509521484, - "num_tokens": 21179866.0, - "step": 2371 - }, - { - "epoch": 1.8024316109422491, - "grad_norm": 2.1539509296417236, - "learning_rate": 1.8823333445479175e-06, - "loss": 0.37903186678886414, - "mean_token_accuracy": 0.8525497317314148, - "num_tokens": 21186941.0, - "step": 2372 - }, - { - "epoch": 1.8031914893617023, - "grad_norm": 2.0247764587402344, - "learning_rate": 1.8803040958546708e-06, - "loss": 0.293364018201828, - "mean_token_accuracy": 0.8954306244850159, - "num_tokens": 21193659.0, - "step": 2373 - }, - { - "epoch": 1.803951367781155, - "grad_norm": 1.7034926414489746, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.33828210830688477, - "mean_token_accuracy": 0.9032940864562988, - "num_tokens": 21201399.0, - "step": 2374 - }, - { - "epoch": 1.8047112462006079, - "grad_norm": 1.7864601612091064, - "learning_rate": 1.8762469046713954e-06, - "loss": 0.3165147006511688, - "mean_token_accuracy": 0.8997465372085571, - "num_tokens": 21209105.0, - "step": 2375 - }, - { - "epoch": 1.8054711246200608, - "grad_norm": 2.3371729850769043, - "learning_rate": 1.8742189650288617e-06, - "loss": 0.4036901593208313, - "mean_token_accuracy": 0.8549420833587646, - "num_tokens": 21215429.0, - "step": 2376 - }, - { - "epoch": 1.8062310030395137, - "grad_norm": 1.7922348976135254, - "learning_rate": 1.872191464583547e-06, - "loss": 0.4366671144962311, - "mean_token_accuracy": 0.8614166975021362, - "num_tokens": 21226823.0, - "step": 2377 - }, - { - "epoch": 1.8069908814589666, - "grad_norm": 2.1667943000793457, - "learning_rate": 1.8701644047584294e-06, - "loss": 0.3543647825717926, - "mean_token_accuracy": 0.9031318426132202, - "num_tokens": 21232823.0, - "step": 2378 - }, - { - "epoch": 1.8077507598784195, - "grad_norm": 1.7554421424865723, - "learning_rate": 1.868137786976177e-06, - "loss": 0.32704365253448486, - "mean_token_accuracy": 0.8990532755851746, - "num_tokens": 21242036.0, - "step": 2379 - }, - { - "epoch": 1.8085106382978724, - "grad_norm": 1.6723839044570923, - "learning_rate": 1.8661116126591492e-06, - "loss": 0.3665752410888672, - "mean_token_accuracy": 0.8828305006027222, - "num_tokens": 21251290.0, - "step": 2380 - }, - { - "epoch": 1.809270516717325, - "grad_norm": 1.5078409910202026, - "learning_rate": 1.8640858832293924e-06, - "loss": 0.368108332157135, - "mean_token_accuracy": 0.8720884323120117, - "num_tokens": 21263510.0, - "step": 2381 - }, - { - "epoch": 1.8100303951367782, - "grad_norm": 2.245553493499756, - "learning_rate": 1.8620606001086423e-06, - "loss": 0.3189915716648102, - "mean_token_accuracy": 0.9015103578567505, - "num_tokens": 21269690.0, - "step": 2382 - }, - { - "epoch": 1.810790273556231, - "grad_norm": 1.780027151107788, - "learning_rate": 1.8600357647183188e-06, - "loss": 0.40369710326194763, - "mean_token_accuracy": 0.8539618253707886, - "num_tokens": 21278523.0, - "step": 2383 - }, - { - "epoch": 1.811550151975684, - "grad_norm": 2.1727912425994873, - "learning_rate": 1.8580113784795306e-06, - "loss": 0.29285651445388794, - "mean_token_accuracy": 0.8954071998596191, - "num_tokens": 21284717.0, - "step": 2384 - }, - { - "epoch": 1.8123100303951367, - "grad_norm": 2.310225248336792, - "learning_rate": 1.8559874428130708e-06, - "loss": 0.3090948760509491, - "mean_token_accuracy": 0.8853784203529358, - "num_tokens": 21290484.0, - "step": 2385 - }, - { - "epoch": 1.8130699088145896, - "grad_norm": 1.6556873321533203, - "learning_rate": 1.8539639591394131e-06, - "loss": 0.4425269663333893, - "mean_token_accuracy": 0.8488757610321045, - "num_tokens": 21302588.0, - "step": 2386 - }, - { - "epoch": 1.8138297872340425, - "grad_norm": 1.9238256216049194, - "learning_rate": 1.8519409288787182e-06, - "loss": 0.4781329929828644, - "mean_token_accuracy": 0.8392970561981201, - "num_tokens": 21310598.0, - "step": 2387 - }, - { - "epoch": 1.8145896656534954, - "grad_norm": 1.4976142644882202, - "learning_rate": 1.8499183534508263e-06, - "loss": 0.36829859018325806, - "mean_token_accuracy": 0.8687542676925659, - "num_tokens": 21322668.0, - "step": 2388 - }, - { - "epoch": 1.8153495440729484, - "grad_norm": 2.0216941833496094, - "learning_rate": 1.8478962342752584e-06, - "loss": 0.385962575674057, - "mean_token_accuracy": 0.8908089399337769, - "num_tokens": 21330378.0, - "step": 2389 - }, - { - "epoch": 1.8161094224924013, - "grad_norm": 1.647863507270813, - "learning_rate": 1.8458745727712142e-06, - "loss": 0.30903705954551697, - "mean_token_accuracy": 0.8914397954940796, - "num_tokens": 21339932.0, - "step": 2390 - }, - { - "epoch": 1.8168693009118542, - "grad_norm": 1.5832399129867554, - "learning_rate": 1.8438533703575757e-06, - "loss": 0.3636384606361389, - "mean_token_accuracy": 0.8611595630645752, - "num_tokens": 21351557.0, - "step": 2391 - }, - { - "epoch": 1.8176291793313069, - "grad_norm": 3.0069241523742676, - "learning_rate": 1.8418326284528997e-06, - "loss": 0.37970617413520813, - "mean_token_accuracy": 0.8620643615722656, - "num_tokens": 21355704.0, - "step": 2392 - }, - { - "epoch": 1.81838905775076, - "grad_norm": 2.004526376724243, - "learning_rate": 1.8398123484754204e-06, - "loss": 0.5333225131034851, - "mean_token_accuracy": 0.8062554597854614, - "num_tokens": 21364640.0, - "step": 2393 - }, - { - "epoch": 1.8191489361702127, - "grad_norm": 1.449981689453125, - "learning_rate": 1.8377925318430478e-06, - "loss": 0.3736325800418854, - "mean_token_accuracy": 0.858788251876831, - "num_tokens": 21377025.0, - "step": 2394 - }, - { - "epoch": 1.8199088145896658, - "grad_norm": 1.1959524154663086, - "learning_rate": 1.8357731799733686e-06, - "loss": 0.3272058963775635, - "mean_token_accuracy": 0.8840590715408325, - "num_tokens": 21395378.0, - "step": 2395 - }, - { - "epoch": 1.8206686930091185, - "grad_norm": 2.134742498397827, - "learning_rate": 1.8337542942836406e-06, - "loss": 0.3737856149673462, - "mean_token_accuracy": 0.8674061298370361, - "num_tokens": 21402106.0, - "step": 2396 - }, - { - "epoch": 1.8214285714285714, - "grad_norm": 2.2179460525512695, - "learning_rate": 1.8317358761907945e-06, - "loss": 0.37301796674728394, - "mean_token_accuracy": 0.8605623245239258, - "num_tokens": 21408367.0, - "step": 2397 - }, - { - "epoch": 1.8221884498480243, - "grad_norm": 2.1718010902404785, - "learning_rate": 1.8297179271114345e-06, - "loss": 0.2772231101989746, - "mean_token_accuracy": 0.8997501730918884, - "num_tokens": 21414274.0, - "step": 2398 - }, - { - "epoch": 1.8229483282674772, - "grad_norm": 1.410933494567871, - "learning_rate": 1.827700448461836e-06, - "loss": 0.4834601581096649, - "mean_token_accuracy": 0.8382522463798523, - "num_tokens": 21429120.0, - "step": 2399 - }, - { - "epoch": 1.8237082066869301, - "grad_norm": 3.4779679775238037, - "learning_rate": 1.8256834416579423e-06, - "loss": 0.44643428921699524, - "mean_token_accuracy": 0.8308249711990356, - "num_tokens": 21432437.0, - "step": 2400 - }, - { - "epoch": 1.824468085106383, - "grad_norm": 1.374484658241272, - "learning_rate": 1.8236669081153657e-06, - "loss": 0.3947869837284088, - "mean_token_accuracy": 0.8605848550796509, - "num_tokens": 21445656.0, - "step": 2401 - }, - { - "epoch": 1.825227963525836, - "grad_norm": 1.9599316120147705, - "learning_rate": 1.8216508492493887e-06, - "loss": 0.49040719866752625, - "mean_token_accuracy": 0.839459240436554, - "num_tokens": 21452889.0, - "step": 2402 - }, - { - "epoch": 1.8259878419452886, - "grad_norm": 2.1267881393432617, - "learning_rate": 1.8196352664749578e-06, - "loss": 0.3233179450035095, - "mean_token_accuracy": 0.8841243386268616, - "num_tokens": 21458788.0, - "step": 2403 - }, - { - "epoch": 1.8267477203647418, - "grad_norm": 2.6356115341186523, - "learning_rate": 1.8176201612066874e-06, - "loss": 0.43436336517333984, - "mean_token_accuracy": 0.850265622138977, - "num_tokens": 21464305.0, - "step": 2404 - }, - { - "epoch": 1.8275075987841944, - "grad_norm": 2.0232386589050293, - "learning_rate": 1.8156055348588548e-06, - "loss": 0.37281763553619385, - "mean_token_accuracy": 0.8616300821304321, - "num_tokens": 21471722.0, - "step": 2405 - }, - { - "epoch": 1.8282674772036476, - "grad_norm": 3.2616260051727295, - "learning_rate": 1.8135913888454034e-06, - "loss": 0.2882898151874542, - "mean_token_accuracy": 0.9001147747039795, - "num_tokens": 21475400.0, - "step": 2406 - }, - { - "epoch": 1.8290273556231003, - "grad_norm": 2.1665611267089844, - "learning_rate": 1.8115777245799383e-06, - "loss": 0.45269185304641724, - "mean_token_accuracy": 0.8420798778533936, - "num_tokens": 21481827.0, - "step": 2407 - }, - { - "epoch": 1.8297872340425532, - "grad_norm": 1.4406569004058838, - "learning_rate": 1.8095645434757261e-06, - "loss": 0.43665701150894165, - "mean_token_accuracy": 0.8401381969451904, - "num_tokens": 21496441.0, - "step": 2408 - }, - { - "epoch": 1.830547112462006, - "grad_norm": 1.6756342649459839, - "learning_rate": 1.8075518469456944e-06, - "loss": 0.3521783947944641, - "mean_token_accuracy": 0.8737466335296631, - "num_tokens": 21505568.0, - "step": 2409 - }, - { - "epoch": 1.831306990881459, - "grad_norm": 1.6623140573501587, - "learning_rate": 1.8055396364024318e-06, - "loss": 0.344537615776062, - "mean_token_accuracy": 0.886972188949585, - "num_tokens": 21513252.0, - "step": 2410 - }, - { - "epoch": 1.832066869300912, - "grad_norm": 2.064835548400879, - "learning_rate": 1.803527913258186e-06, - "loss": 0.3252706229686737, - "mean_token_accuracy": 0.885245680809021, - "num_tokens": 21520242.0, - "step": 2411 - }, - { - "epoch": 1.8328267477203646, - "grad_norm": 1.9969112873077393, - "learning_rate": 1.8015166789248606e-06, - "loss": 0.34694376587867737, - "mean_token_accuracy": 0.8818766474723816, - "num_tokens": 21527524.0, - "step": 2412 - }, - { - "epoch": 1.8335866261398177, - "grad_norm": 2.086148977279663, - "learning_rate": 1.7995059348140165e-06, - "loss": 0.23109188675880432, - "mean_token_accuracy": 0.912773609161377, - "num_tokens": 21532829.0, - "step": 2413 - }, - { - "epoch": 1.8343465045592704, - "grad_norm": 1.80828058719635, - "learning_rate": 1.7974956823368728e-06, - "loss": 0.5422223210334778, - "mean_token_accuracy": 0.8058640956878662, - "num_tokens": 21544440.0, - "step": 2414 - }, - { - "epoch": 1.8351063829787235, - "grad_norm": 1.8121788501739502, - "learning_rate": 1.7954859229043017e-06, - "loss": 0.3674035668373108, - "mean_token_accuracy": 0.8628277778625488, - "num_tokens": 21553160.0, - "step": 2415 - }, - { - "epoch": 1.8358662613981762, - "grad_norm": 1.9307979345321655, - "learning_rate": 1.7934766579268292e-06, - "loss": 0.4528796672821045, - "mean_token_accuracy": 0.8328302502632141, - "num_tokens": 21563485.0, - "step": 2416 - }, - { - "epoch": 1.8366261398176293, - "grad_norm": 1.2312756776809692, - "learning_rate": 1.7914678888146347e-06, - "loss": 0.40424543619155884, - "mean_token_accuracy": 0.8571025133132935, - "num_tokens": 21582662.0, - "step": 2417 - }, - { - "epoch": 1.837386018237082, - "grad_norm": 1.6305770874023438, - "learning_rate": 1.7894596169775514e-06, - "loss": 0.36575305461883545, - "mean_token_accuracy": 0.8768579959869385, - "num_tokens": 21592930.0, - "step": 2418 - }, - { - "epoch": 1.838145896656535, - "grad_norm": 1.8107178211212158, - "learning_rate": 1.7874518438250598e-06, - "loss": 0.3260963261127472, - "mean_token_accuracy": 0.896018385887146, - "num_tokens": 21600509.0, - "step": 2419 - }, - { - "epoch": 1.8389057750759878, - "grad_norm": 2.7195847034454346, - "learning_rate": 1.785444570766293e-06, - "loss": 0.2728347182273865, - "mean_token_accuracy": 0.9178709983825684, - "num_tokens": 21604489.0, - "step": 2420 - }, - { - "epoch": 1.8396656534954408, - "grad_norm": 1.9783591032028198, - "learning_rate": 1.7834377992100332e-06, - "loss": 0.3136378526687622, - "mean_token_accuracy": 0.8844017386436462, - "num_tokens": 21612060.0, - "step": 2421 - }, - { - "epoch": 1.8404255319148937, - "grad_norm": 2.1911418437957764, - "learning_rate": 1.7814315305647095e-06, - "loss": 0.39013993740081787, - "mean_token_accuracy": 0.8688976764678955, - "num_tokens": 21618778.0, - "step": 2422 - }, - { - "epoch": 1.8411854103343464, - "grad_norm": 1.9143604040145874, - "learning_rate": 1.779425766238398e-06, - "loss": 0.5113036632537842, - "mean_token_accuracy": 0.8329141139984131, - "num_tokens": 21628976.0, - "step": 2423 - }, - { - "epoch": 1.8419452887537995, - "grad_norm": 1.4184197187423706, - "learning_rate": 1.7774205076388207e-06, - "loss": 0.3821067810058594, - "mean_token_accuracy": 0.8604007959365845, - "num_tokens": 21643145.0, - "step": 2424 - }, - { - "epoch": 1.8427051671732522, - "grad_norm": 2.45896577835083, - "learning_rate": 1.7754157561733476e-06, - "loss": 0.3004961311817169, - "mean_token_accuracy": 0.89884352684021, - "num_tokens": 21647441.0, - "step": 2425 - }, - { - "epoch": 1.8434650455927053, - "grad_norm": 1.7999277114868164, - "learning_rate": 1.7734115132489887e-06, - "loss": 0.42533132433891296, - "mean_token_accuracy": 0.8838746547698975, - "num_tokens": 21657445.0, - "step": 2426 - }, - { - "epoch": 1.844224924012158, - "grad_norm": 2.099728584289551, - "learning_rate": 1.7714077802723994e-06, - "loss": 0.36200380325317383, - "mean_token_accuracy": 0.86548912525177, - "num_tokens": 21663966.0, - "step": 2427 - }, - { - "epoch": 1.844984802431611, - "grad_norm": 2.1970369815826416, - "learning_rate": 1.7694045586498754e-06, - "loss": 0.34944331645965576, - "mean_token_accuracy": 0.8670865297317505, - "num_tokens": 21670051.0, - "step": 2428 - }, - { - "epoch": 1.8457446808510638, - "grad_norm": 2.2928519248962402, - "learning_rate": 1.7674018497873568e-06, - "loss": 0.39500880241394043, - "mean_token_accuracy": 0.8744652271270752, - "num_tokens": 21676054.0, - "step": 2429 - }, - { - "epoch": 1.8465045592705167, - "grad_norm": 1.7598960399627686, - "learning_rate": 1.7653996550904208e-06, - "loss": 0.40113672614097595, - "mean_token_accuracy": 0.8552819490432739, - "num_tokens": 21685514.0, - "step": 2430 - }, - { - "epoch": 1.8472644376899696, - "grad_norm": 2.0529749393463135, - "learning_rate": 1.7633979759642844e-06, - "loss": 0.47586584091186523, - "mean_token_accuracy": 0.8412872552871704, - "num_tokens": 21693282.0, - "step": 2431 - }, - { - "epoch": 1.8480243161094225, - "grad_norm": 2.2423181533813477, - "learning_rate": 1.7613968138138027e-06, - "loss": 0.2757381796836853, - "mean_token_accuracy": 0.8992017507553101, - "num_tokens": 21698439.0, - "step": 2432 - }, - { - "epoch": 1.8487841945288754, - "grad_norm": 1.3280467987060547, - "learning_rate": 1.7593961700434692e-06, - "loss": 0.29535043239593506, - "mean_token_accuracy": 0.8943840861320496, - "num_tokens": 21711823.0, - "step": 2433 - }, - { - "epoch": 1.8495440729483281, - "grad_norm": 2.589221715927124, - "learning_rate": 1.7573960460574133e-06, - "loss": 0.46775516867637634, - "mean_token_accuracy": 0.8654797673225403, - "num_tokens": 21717180.0, - "step": 2434 - }, - { - "epoch": 1.8503039513677813, - "grad_norm": 2.1137642860412598, - "learning_rate": 1.7553964432593976e-06, - "loss": 0.3808780610561371, - "mean_token_accuracy": 0.8759565353393555, - "num_tokens": 21723980.0, - "step": 2435 - }, - { - "epoch": 1.851063829787234, - "grad_norm": 2.386967182159424, - "learning_rate": 1.75339736305282e-06, - "loss": 0.42688336968421936, - "mean_token_accuracy": 0.8488960266113281, - "num_tokens": 21730411.0, - "step": 2436 - }, - { - "epoch": 1.851823708206687, - "grad_norm": 1.586552619934082, - "learning_rate": 1.7513988068407145e-06, - "loss": 0.33497530221939087, - "mean_token_accuracy": 0.8809621334075928, - "num_tokens": 21740228.0, - "step": 2437 - }, - { - "epoch": 1.8525835866261398, - "grad_norm": 2.107167959213257, - "learning_rate": 1.7494007760257428e-06, - "loss": 0.3801528513431549, - "mean_token_accuracy": 0.8666986227035522, - "num_tokens": 21746718.0, - "step": 2438 - }, - { - "epoch": 1.8533434650455927, - "grad_norm": 2.514514684677124, - "learning_rate": 1.7474032720101991e-06, - "loss": 0.285498708486557, - "mean_token_accuracy": 0.901540219783783, - "num_tokens": 21751009.0, - "step": 2439 - }, - { - "epoch": 1.8541033434650456, - "grad_norm": 1.8152034282684326, - "learning_rate": 1.7454062961960102e-06, - "loss": 0.3704795241355896, - "mean_token_accuracy": 0.8630262613296509, - "num_tokens": 21760164.0, - "step": 2440 - }, - { - "epoch": 1.8548632218844985, - "grad_norm": 2.714531183242798, - "learning_rate": 1.7434098499847308e-06, - "loss": 0.5070809125900269, - "mean_token_accuracy": 0.8408594131469727, - "num_tokens": 21765602.0, - "step": 2441 - }, - { - "epoch": 1.8556231003039514, - "grad_norm": 2.173832893371582, - "learning_rate": 1.7414139347775423e-06, - "loss": 0.3500945568084717, - "mean_token_accuracy": 0.8733699321746826, - "num_tokens": 21772029.0, - "step": 2442 - }, - { - "epoch": 1.8563829787234043, - "grad_norm": 1.580376148223877, - "learning_rate": 1.7394185519752546e-06, - "loss": 0.5137908458709717, - "mean_token_accuracy": 0.8141944408416748, - "num_tokens": 21784531.0, - "step": 2443 - }, - { - "epoch": 1.8571428571428572, - "grad_norm": 2.079318046569824, - "learning_rate": 1.7374237029783064e-06, - "loss": 0.41820770502090454, - "mean_token_accuracy": 0.8513275384902954, - "num_tokens": 21792047.0, - "step": 2444 - }, - { - "epoch": 1.85790273556231, - "grad_norm": 2.6890387535095215, - "learning_rate": 1.7354293891867582e-06, - "loss": 0.3810037672519684, - "mean_token_accuracy": 0.8790096044540405, - "num_tokens": 21796634.0, - "step": 2445 - }, - { - "epoch": 1.858662613981763, - "grad_norm": 2.161081552505493, - "learning_rate": 1.7334356120002956e-06, - "loss": 0.48064762353897095, - "mean_token_accuracy": 0.8329977989196777, - "num_tokens": 21803509.0, - "step": 2446 - }, - { - "epoch": 1.8594224924012157, - "grad_norm": 1.9201551675796509, - "learning_rate": 1.7314423728182283e-06, - "loss": 0.36369895935058594, - "mean_token_accuracy": 0.8713955879211426, - "num_tokens": 21810528.0, - "step": 2447 - }, - { - "epoch": 1.8601823708206688, - "grad_norm": 1.8095223903656006, - "learning_rate": 1.7294496730394897e-06, - "loss": 0.41493499279022217, - "mean_token_accuracy": 0.855312705039978, - "num_tokens": 21821176.0, - "step": 2448 - }, - { - "epoch": 1.8609422492401215, - "grad_norm": 2.172389507293701, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.3467463552951813, - "mean_token_accuracy": 0.8801594972610474, - "num_tokens": 21827486.0, - "step": 2449 - }, - { - "epoch": 1.8617021276595744, - "grad_norm": 2.8139185905456543, - "learning_rate": 1.7254658972858293e-06, - "loss": 0.35121995210647583, - "mean_token_accuracy": 0.8741901516914368, - "num_tokens": 21831915.0, - "step": 2450 - }, - { - "epoch": 1.8624620060790273, - "grad_norm": 1.2572762966156006, - "learning_rate": 1.7234748241068742e-06, - "loss": 0.3775328993797302, - "mean_token_accuracy": 0.8547425866127014, - "num_tokens": 21849623.0, - "step": 2451 - }, - { - "epoch": 1.8632218844984803, - "grad_norm": 1.2357900142669678, - "learning_rate": 1.7214842959231796e-06, - "loss": 0.28715917468070984, - "mean_token_accuracy": 0.9034290313720703, - "num_tokens": 21864507.0, - "step": 2452 - }, - { - "epoch": 1.8639817629179332, - "grad_norm": 1.2349165678024292, - "learning_rate": 1.719494314131775e-06, - "loss": 0.27918580174446106, - "mean_token_accuracy": 0.9073119759559631, - "num_tokens": 21878519.0, - "step": 2453 - }, - { - "epoch": 1.864741641337386, - "grad_norm": 1.960353136062622, - "learning_rate": 1.7175048801293042e-06, - "loss": 0.49304282665252686, - "mean_token_accuracy": 0.8193954229354858, - "num_tokens": 21886861.0, - "step": 2454 - }, - { - "epoch": 1.865501519756839, - "grad_norm": 1.480118751525879, - "learning_rate": 1.7155159953120315e-06, - "loss": 0.39433127641677856, - "mean_token_accuracy": 0.8674266338348389, - "num_tokens": 21899131.0, - "step": 2455 - }, - { - "epoch": 1.8662613981762917, - "grad_norm": 2.3136367797851562, - "learning_rate": 1.7135276610758309e-06, - "loss": 0.40943437814712524, - "mean_token_accuracy": 0.8511340022087097, - "num_tokens": 21905550.0, - "step": 2456 - }, - { - "epoch": 1.8670212765957448, - "grad_norm": 1.3622872829437256, - "learning_rate": 1.7115398788161923e-06, - "loss": 0.4255254566669464, - "mean_token_accuracy": 0.8457357883453369, - "num_tokens": 21919943.0, - "step": 2457 - }, - { - "epoch": 1.8677811550151975, - "grad_norm": 1.8197853565216064, - "learning_rate": 1.7095526499282172e-06, - "loss": 0.33384573459625244, - "mean_token_accuracy": 0.8757365942001343, - "num_tokens": 21928368.0, - "step": 2458 - }, - { - "epoch": 1.8685410334346506, - "grad_norm": 1.8771090507507324, - "learning_rate": 1.7075659758066207e-06, - "loss": 0.38854318857192993, - "mean_token_accuracy": 0.8565001487731934, - "num_tokens": 21936624.0, - "step": 2459 - }, - { - "epoch": 1.8693009118541033, - "grad_norm": 1.449811577796936, - "learning_rate": 1.7055798578457267e-06, - "loss": 0.45504286885261536, - "mean_token_accuracy": 0.8338158130645752, - "num_tokens": 21952192.0, - "step": 2460 - }, - { - "epoch": 1.8700607902735562, - "grad_norm": 2.253678321838379, - "learning_rate": 1.703594297439469e-06, - "loss": 0.44300752878189087, - "mean_token_accuracy": 0.8451106548309326, - "num_tokens": 21959107.0, - "step": 2461 - }, - { - "epoch": 1.8708206686930091, - "grad_norm": 2.5431747436523438, - "learning_rate": 1.7016092959813892e-06, - "loss": 0.34692925214767456, - "mean_token_accuracy": 0.8823766708374023, - "num_tokens": 21964543.0, - "step": 2462 - }, - { - "epoch": 1.871580547112462, - "grad_norm": 2.7001953125, - "learning_rate": 1.6996248548646393e-06, - "loss": 0.5270686745643616, - "mean_token_accuracy": 0.8366886377334595, - "num_tokens": 21970157.0, - "step": 2463 - }, - { - "epoch": 1.872340425531915, - "grad_norm": 2.3855581283569336, - "learning_rate": 1.6976409754819767e-06, - "loss": 0.40109893679618835, - "mean_token_accuracy": 0.8477234840393066, - "num_tokens": 21976046.0, - "step": 2464 - }, - { - "epoch": 1.8731003039513676, - "grad_norm": 1.6014364957809448, - "learning_rate": 1.6956576592257635e-06, - "loss": 0.4344262480735779, - "mean_token_accuracy": 0.8464433550834656, - "num_tokens": 21986299.0, - "step": 2465 - }, - { - "epoch": 1.8738601823708207, - "grad_norm": 2.221372127532959, - "learning_rate": 1.6936749074879663e-06, - "loss": 0.24239015579223633, - "mean_token_accuracy": 0.9185566306114197, - "num_tokens": 21991541.0, - "step": 2466 - }, - { - "epoch": 1.8746200607902734, - "grad_norm": 1.6672178506851196, - "learning_rate": 1.6916927216601593e-06, - "loss": 0.35219496488571167, - "mean_token_accuracy": 0.8668237328529358, - "num_tokens": 22000797.0, - "step": 2467 - }, - { - "epoch": 1.8753799392097266, - "grad_norm": 1.364131212234497, - "learning_rate": 1.6897111031335145e-06, - "loss": 0.4456409513950348, - "mean_token_accuracy": 0.8350487947463989, - "num_tokens": 22018297.0, - "step": 2468 - }, - { - "epoch": 1.8761398176291793, - "grad_norm": 1.4535794258117676, - "learning_rate": 1.6877300532988095e-06, - "loss": 0.395782470703125, - "mean_token_accuracy": 0.8482908010482788, - "num_tokens": 22030096.0, - "step": 2469 - }, - { - "epoch": 1.8768996960486324, - "grad_norm": 2.0192270278930664, - "learning_rate": 1.6857495735464196e-06, - "loss": 0.31406813859939575, - "mean_token_accuracy": 0.889453649520874, - "num_tokens": 22036082.0, - "step": 2470 - }, - { - "epoch": 1.877659574468085, - "grad_norm": 2.159257173538208, - "learning_rate": 1.6837696652663244e-06, - "loss": 0.43942126631736755, - "mean_token_accuracy": 0.8518660068511963, - "num_tokens": 22043413.0, - "step": 2471 - }, - { - "epoch": 1.878419452887538, - "grad_norm": 1.9774882793426514, - "learning_rate": 1.681790329848097e-06, - "loss": 0.42464935779571533, - "mean_token_accuracy": 0.8545591831207275, - "num_tokens": 22050290.0, - "step": 2472 - }, - { - "epoch": 1.8791793313069909, - "grad_norm": 1.0219167470932007, - "learning_rate": 1.6798115686809125e-06, - "loss": 0.36917346715927124, - "mean_token_accuracy": 0.8650286197662354, - "num_tokens": 22070408.0, - "step": 2473 - }, - { - "epoch": 1.8799392097264438, - "grad_norm": 1.2943378686904907, - "learning_rate": 1.677833383153542e-06, - "loss": 0.3434808850288391, - "mean_token_accuracy": 0.878541111946106, - "num_tokens": 22083567.0, - "step": 2474 - }, - { - "epoch": 1.8806990881458967, - "grad_norm": 3.582855224609375, - "learning_rate": 1.6758557746543518e-06, - "loss": 0.39738911390304565, - "mean_token_accuracy": 0.8951535224914551, - "num_tokens": 22086886.0, - "step": 2475 - }, - { - "epoch": 1.8814589665653494, - "grad_norm": 1.680220365524292, - "learning_rate": 1.673878744571304e-06, - "loss": 0.38146206736564636, - "mean_token_accuracy": 0.8596681356430054, - "num_tokens": 22095564.0, - "step": 2476 - }, - { - "epoch": 1.8822188449848025, - "grad_norm": 1.448194146156311, - "learning_rate": 1.6719022942919527e-06, - "loss": 0.43309977650642395, - "mean_token_accuracy": 0.8669528961181641, - "num_tokens": 22109333.0, - "step": 2477 - }, - { - "epoch": 1.8829787234042552, - "grad_norm": 1.5353537797927856, - "learning_rate": 1.6699264252034498e-06, - "loss": 0.4479079842567444, - "mean_token_accuracy": 0.8379873037338257, - "num_tokens": 22124735.0, - "step": 2478 - }, - { - "epoch": 1.8837386018237083, - "grad_norm": 1.1744320392608643, - "learning_rate": 1.6679511386925337e-06, - "loss": 0.31951260566711426, - "mean_token_accuracy": 0.8792685270309448, - "num_tokens": 22140882.0, - "step": 2479 - }, - { - "epoch": 1.884498480243161, - "grad_norm": 2.1996841430664062, - "learning_rate": 1.6659764361455383e-06, - "loss": 0.39045992493629456, - "mean_token_accuracy": 0.8587675094604492, - "num_tokens": 22146843.0, - "step": 2480 - }, - { - "epoch": 1.885258358662614, - "grad_norm": 3.494931697845459, - "learning_rate": 1.6640023189483836e-06, - "loss": 0.44756871461868286, - "mean_token_accuracy": 0.8643628358840942, - "num_tokens": 22150504.0, - "step": 2481 - }, - { - "epoch": 1.8860182370820668, - "grad_norm": 2.2455973625183105, - "learning_rate": 1.6620287884865831e-06, - "loss": 0.3308878540992737, - "mean_token_accuracy": 0.8748078942298889, - "num_tokens": 22156537.0, - "step": 2482 - }, - { - "epoch": 1.8867781155015197, - "grad_norm": 2.31868314743042, - "learning_rate": 1.6600558461452368e-06, - "loss": 0.46583569049835205, - "mean_token_accuracy": 0.8438903093338013, - "num_tokens": 22163501.0, - "step": 2483 - }, - { - "epoch": 1.8875379939209727, - "grad_norm": 1.5695412158966064, - "learning_rate": 1.65808349330903e-06, - "loss": 0.351986825466156, - "mean_token_accuracy": 0.8707568645477295, - "num_tokens": 22173880.0, - "step": 2484 - }, - { - "epoch": 1.8882978723404256, - "grad_norm": 1.4109563827514648, - "learning_rate": 1.656111731362236e-06, - "loss": 0.36058586835861206, - "mean_token_accuracy": 0.8606001138687134, - "num_tokens": 22189000.0, - "step": 2485 - }, - { - "epoch": 1.8890577507598785, - "grad_norm": 1.0398776531219482, - "learning_rate": 1.6541405616887138e-06, - "loss": 0.36524999141693115, - "mean_token_accuracy": 0.8690586090087891, - "num_tokens": 22209187.0, - "step": 2486 - }, - { - "epoch": 1.8898176291793312, - "grad_norm": 2.1050004959106445, - "learning_rate": 1.6521699856719065e-06, - "loss": 0.2988269329071045, - "mean_token_accuracy": 0.8887280225753784, - "num_tokens": 22215539.0, - "step": 2487 - }, - { - "epoch": 1.8905775075987843, - "grad_norm": 2.5606791973114014, - "learning_rate": 1.650200004694839e-06, - "loss": 0.41077330708503723, - "mean_token_accuracy": 0.8436049818992615, - "num_tokens": 22221133.0, - "step": 2488 - }, - { - "epoch": 1.891337386018237, - "grad_norm": 1.5786094665527344, - "learning_rate": 1.6482306201401211e-06, - "loss": 0.4217292368412018, - "mean_token_accuracy": 0.859939455986023, - "num_tokens": 22231578.0, - "step": 2489 - }, - { - "epoch": 1.89209726443769, - "grad_norm": 1.7131884098052979, - "learning_rate": 1.6462618333899422e-06, - "loss": 0.3945464789867401, - "mean_token_accuracy": 0.8679244518280029, - "num_tokens": 22241252.0, - "step": 2490 - }, - { - "epoch": 1.8928571428571428, - "grad_norm": 2.8350300788879395, - "learning_rate": 1.6442936458260723e-06, - "loss": 0.3992699384689331, - "mean_token_accuracy": 0.8717275857925415, - "num_tokens": 22246226.0, - "step": 2491 - }, - { - "epoch": 1.8936170212765957, - "grad_norm": 2.2180120944976807, - "learning_rate": 1.6423260588298608e-06, - "loss": 0.3381099998950958, - "mean_token_accuracy": 0.8968075513839722, - "num_tokens": 22252355.0, - "step": 2492 - }, - { - "epoch": 1.8943768996960486, - "grad_norm": 2.6498866081237793, - "learning_rate": 1.6403590737822378e-06, - "loss": 0.36339250206947327, - "mean_token_accuracy": 0.8633373379707336, - "num_tokens": 22257407.0, - "step": 2493 - }, - { - "epoch": 1.8951367781155015, - "grad_norm": 2.634241819381714, - "learning_rate": 1.6383926920637077e-06, - "loss": 0.2562698721885681, - "mean_token_accuracy": 0.8999600410461426, - "num_tokens": 22261858.0, - "step": 2494 - }, - { - "epoch": 1.8958966565349544, - "grad_norm": 2.0163333415985107, - "learning_rate": 1.6364269150543533e-06, - "loss": 0.3413389027118683, - "mean_token_accuracy": 0.8718398809432983, - "num_tokens": 22268517.0, - "step": 2495 - }, - { - "epoch": 1.8966565349544073, - "grad_norm": 2.8333005905151367, - "learning_rate": 1.6344617441338311e-06, - "loss": 0.4354540705680847, - "mean_token_accuracy": 0.8491238355636597, - "num_tokens": 22273648.0, - "step": 2496 - }, - { - "epoch": 1.8974164133738602, - "grad_norm": 1.6280957460403442, - "learning_rate": 1.6324971806813766e-06, - "loss": 0.3015792965888977, - "mean_token_accuracy": 0.8937206268310547, - "num_tokens": 22282521.0, - "step": 2497 - }, - { - "epoch": 1.898176291793313, - "grad_norm": 1.2246302366256714, - "learning_rate": 1.6305332260757937e-06, - "loss": 0.26619502902030945, - "mean_token_accuracy": 0.8886681199073792, - "num_tokens": 22295179.0, - "step": 2498 - }, - { - "epoch": 1.898936170212766, - "grad_norm": 2.4014432430267334, - "learning_rate": 1.6285698816954626e-06, - "loss": 0.3735058903694153, - "mean_token_accuracy": 0.8693109750747681, - "num_tokens": 22300681.0, - "step": 2499 - }, - { - "epoch": 1.8996960486322187, - "grad_norm": 1.4447300434112549, - "learning_rate": 1.6266071489183327e-06, - "loss": 0.40768876671791077, - "mean_token_accuracy": 0.8556059002876282, - "num_tokens": 22312442.0, - "step": 2500 - }, - { - "epoch": 1.9004559270516719, - "grad_norm": 2.1339821815490723, - "learning_rate": 1.6246450291219268e-06, - "loss": 0.33442017436027527, - "mean_token_accuracy": 0.8837105631828308, - "num_tokens": 22318779.0, - "step": 2501 - }, - { - "epoch": 1.9012158054711246, - "grad_norm": 2.8564913272857666, - "learning_rate": 1.6226835236833356e-06, - "loss": 0.36013197898864746, - "mean_token_accuracy": 0.8810569047927856, - "num_tokens": 22323390.0, - "step": 2502 - }, - { - "epoch": 1.9019756838905775, - "grad_norm": 2.1201915740966797, - "learning_rate": 1.620722633979219e-06, - "loss": 0.4587489664554596, - "mean_token_accuracy": 0.8517274856567383, - "num_tokens": 22330275.0, - "step": 2503 - }, - { - "epoch": 1.9027355623100304, - "grad_norm": 2.211402177810669, - "learning_rate": 1.6187623613858038e-06, - "loss": 0.3698349595069885, - "mean_token_accuracy": 0.8768182992935181, - "num_tokens": 22336041.0, - "step": 2504 - }, - { - "epoch": 1.9034954407294833, - "grad_norm": 1.421604871749878, - "learning_rate": 1.6168027072788868e-06, - "loss": 0.38086453080177307, - "mean_token_accuracy": 0.8622198104858398, - "num_tokens": 22349310.0, - "step": 2505 - }, - { - "epoch": 1.9042553191489362, - "grad_norm": 2.4304113388061523, - "learning_rate": 1.6148436730338279e-06, - "loss": 0.34694477915763855, - "mean_token_accuracy": 0.8833136558532715, - "num_tokens": 22355069.0, - "step": 2506 - }, - { - "epoch": 1.905015197568389, - "grad_norm": 2.1076772212982178, - "learning_rate": 1.6128852600255518e-06, - "loss": 0.4973800778388977, - "mean_token_accuracy": 0.851190984249115, - "num_tokens": 22362402.0, - "step": 2507 - }, - { - "epoch": 1.905775075987842, - "grad_norm": 3.0934200286865234, - "learning_rate": 1.6109274696285496e-06, - "loss": 0.46498024463653564, - "mean_token_accuracy": 0.8436626195907593, - "num_tokens": 22367390.0, - "step": 2508 - }, - { - "epoch": 1.9065349544072947, - "grad_norm": 2.0114359855651855, - "learning_rate": 1.6089703032168736e-06, - "loss": 0.45143815875053406, - "mean_token_accuracy": 0.852748692035675, - "num_tokens": 22377032.0, - "step": 2509 - }, - { - "epoch": 1.9072948328267478, - "grad_norm": 1.8780893087387085, - "learning_rate": 1.6070137621641382e-06, - "loss": 0.3977179527282715, - "mean_token_accuracy": 0.8556262850761414, - "num_tokens": 22386880.0, - "step": 2510 - }, - { - "epoch": 1.9080547112462005, - "grad_norm": 1.6748069524765015, - "learning_rate": 1.6050578478435184e-06, - "loss": 0.35590440034866333, - "mean_token_accuracy": 0.8702141046524048, - "num_tokens": 22396616.0, - "step": 2511 - }, - { - "epoch": 1.9088145896656536, - "grad_norm": 0.9799401760101318, - "learning_rate": 1.6031025616277512e-06, - "loss": 0.3325427770614624, - "mean_token_accuracy": 0.8771291971206665, - "num_tokens": 22419580.0, - "step": 2512 - }, - { - "epoch": 1.9095744680851063, - "grad_norm": 1.5084866285324097, - "learning_rate": 1.6011479048891323e-06, - "loss": 0.44336390495300293, - "mean_token_accuracy": 0.8786209225654602, - "num_tokens": 22434235.0, - "step": 2513 - }, - { - "epoch": 1.9103343465045592, - "grad_norm": 1.8544305562973022, - "learning_rate": 1.5991938789995138e-06, - "loss": 0.3055306375026703, - "mean_token_accuracy": 0.9043174982070923, - "num_tokens": 22442003.0, - "step": 2514 - }, - { - "epoch": 1.9110942249240122, - "grad_norm": 4.29932165145874, - "learning_rate": 1.5972404853303061e-06, - "loss": 0.386760413646698, - "mean_token_accuracy": 0.8914207220077515, - "num_tokens": 22444787.0, - "step": 2515 - }, - { - "epoch": 1.911854103343465, - "grad_norm": 1.7560505867004395, - "learning_rate": 1.595287725252478e-06, - "loss": 0.4141422510147095, - "mean_token_accuracy": 0.862310528755188, - "num_tokens": 22453625.0, - "step": 2516 - }, - { - "epoch": 1.912613981762918, - "grad_norm": 2.685443878173828, - "learning_rate": 1.5933356001365502e-06, - "loss": 0.36217260360717773, - "mean_token_accuracy": 0.868883490562439, - "num_tokens": 22458597.0, - "step": 2517 - }, - { - "epoch": 1.9133738601823707, - "grad_norm": 2.2587239742279053, - "learning_rate": 1.591384111352599e-06, - "loss": 0.5298880934715271, - "mean_token_accuracy": 0.821168839931488, - "num_tokens": 22466091.0, - "step": 2518 - }, - { - "epoch": 1.9141337386018238, - "grad_norm": 2.273380756378174, - "learning_rate": 1.5894332602702545e-06, - "loss": 0.3194117546081543, - "mean_token_accuracy": 0.8849239945411682, - "num_tokens": 22471785.0, - "step": 2519 - }, - { - "epoch": 1.9148936170212765, - "grad_norm": 2.314634084701538, - "learning_rate": 1.5874830482587003e-06, - "loss": 0.457550585269928, - "mean_token_accuracy": 0.8367670774459839, - "num_tokens": 22479091.0, - "step": 2520 - }, - { - "epoch": 1.9156534954407296, - "grad_norm": 2.16206693649292, - "learning_rate": 1.585533476686669e-06, - "loss": 0.43055859208106995, - "mean_token_accuracy": 0.8659856915473938, - "num_tokens": 22487379.0, - "step": 2521 - }, - { - "epoch": 1.9164133738601823, - "grad_norm": 2.2091798782348633, - "learning_rate": 1.5835845469224447e-06, - "loss": 0.45421302318573, - "mean_token_accuracy": 0.8418087959289551, - "num_tokens": 22493755.0, - "step": 2522 - }, - { - "epoch": 1.9171732522796354, - "grad_norm": 1.6166985034942627, - "learning_rate": 1.5816362603338632e-06, - "loss": 0.5211667418479919, - "mean_token_accuracy": 0.809440016746521, - "num_tokens": 22506648.0, - "step": 2523 - }, - { - "epoch": 1.917933130699088, - "grad_norm": 2.4998703002929688, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.45915648341178894, - "mean_token_accuracy": 0.833067774772644, - "num_tokens": 22513216.0, - "step": 2524 - }, - { - "epoch": 1.918693009118541, - "grad_norm": 1.492928147315979, - "learning_rate": 1.577741622152702e-06, - "loss": 0.45581498742103577, - "mean_token_accuracy": 0.8531479835510254, - "num_tokens": 22524908.0, - "step": 2525 - }, - { - "epoch": 1.919452887537994, - "grad_norm": 2.0502207279205322, - "learning_rate": 1.5757952732935288e-06, - "loss": 0.4156759977340698, - "mean_token_accuracy": 0.8677599430084229, - "num_tokens": 22532275.0, - "step": 2526 - }, - { - "epoch": 1.9202127659574468, - "grad_norm": 2.4572031497955322, - "learning_rate": 1.5738495730768104e-06, - "loss": 0.43373313546180725, - "mean_token_accuracy": 0.8435516357421875, - "num_tokens": 22538272.0, - "step": 2527 - }, - { - "epoch": 1.9209726443768997, - "grad_norm": 2.071903705596924, - "learning_rate": 1.5719045228681127e-06, - "loss": 0.3211413621902466, - "mean_token_accuracy": 0.87841796875, - "num_tokens": 22545487.0, - "step": 2528 - }, - { - "epoch": 1.9217325227963524, - "grad_norm": 1.6742064952850342, - "learning_rate": 1.5699601240325474e-06, - "loss": 0.3704240322113037, - "mean_token_accuracy": 0.8646563291549683, - "num_tokens": 22554840.0, - "step": 2529 - }, - { - "epoch": 1.9224924012158056, - "grad_norm": 1.0941399335861206, - "learning_rate": 1.5680163779347668e-06, - "loss": 0.3595704436302185, - "mean_token_accuracy": 0.8680597543716431, - "num_tokens": 22572627.0, - "step": 2530 - }, - { - "epoch": 1.9232522796352582, - "grad_norm": 2.9815237522125244, - "learning_rate": 1.5660732859389687e-06, - "loss": 0.2941335141658783, - "mean_token_accuracy": 0.8847303986549377, - "num_tokens": 22576851.0, - "step": 2531 - }, - { - "epoch": 1.9240121580547114, - "grad_norm": 2.898106813430786, - "learning_rate": 1.5641308494088903e-06, - "loss": 0.4066317081451416, - "mean_token_accuracy": 0.8469538688659668, - "num_tokens": 22581431.0, - "step": 2532 - }, - { - "epoch": 1.924772036474164, - "grad_norm": 1.6757515668869019, - "learning_rate": 1.5621890697078069e-06, - "loss": 0.33923569321632385, - "mean_token_accuracy": 0.8790708184242249, - "num_tokens": 22590648.0, - "step": 2533 - }, - { - "epoch": 1.925531914893617, - "grad_norm": 1.747314214706421, - "learning_rate": 1.5602479481985333e-06, - "loss": 0.4865703582763672, - "mean_token_accuracy": 0.8314566612243652, - "num_tokens": 22600153.0, - "step": 2534 - }, - { - "epoch": 1.9262917933130699, - "grad_norm": 2.7927849292755127, - "learning_rate": 1.5583074862434254e-06, - "loss": 0.335658460855484, - "mean_token_accuracy": 0.8769067525863647, - "num_tokens": 22604864.0, - "step": 2535 - }, - { - "epoch": 1.9270516717325228, - "grad_norm": 2.2553000450134277, - "learning_rate": 1.5563676852043738e-06, - "loss": 0.4442562460899353, - "mean_token_accuracy": 0.8381515145301819, - "num_tokens": 22611102.0, - "step": 2536 - }, - { - "epoch": 1.9278115501519757, - "grad_norm": 1.1937638521194458, - "learning_rate": 1.5544285464428044e-06, - "loss": 0.38608425855636597, - "mean_token_accuracy": 0.8589644432067871, - "num_tokens": 22627781.0, - "step": 2537 - }, - { - "epoch": 1.9285714285714286, - "grad_norm": 3.282639980316162, - "learning_rate": 1.55249007131968e-06, - "loss": 0.31231993436813354, - "mean_token_accuracy": 0.8917703032493591, - "num_tokens": 22632341.0, - "step": 2538 - }, - { - "epoch": 1.9293313069908815, - "grad_norm": 2.3212976455688477, - "learning_rate": 1.5505522611954977e-06, - "loss": 0.34952571988105774, - "mean_token_accuracy": 0.8752106428146362, - "num_tokens": 22638572.0, - "step": 2539 - }, - { - "epoch": 1.9300911854103342, - "grad_norm": 1.389098882675171, - "learning_rate": 1.548615117430286e-06, - "loss": 0.4298851788043976, - "mean_token_accuracy": 0.871698260307312, - "num_tokens": 22651875.0, - "step": 2540 - }, - { - "epoch": 1.9308510638297873, - "grad_norm": 1.5333977937698364, - "learning_rate": 1.5466786413836077e-06, - "loss": 0.45540744066238403, - "mean_token_accuracy": 0.8409075736999512, - "num_tokens": 22662903.0, - "step": 2541 - }, - { - "epoch": 1.93161094224924, - "grad_norm": 1.7833251953125, - "learning_rate": 1.5447428344145565e-06, - "loss": 0.333247572183609, - "mean_token_accuracy": 0.8796100616455078, - "num_tokens": 22671125.0, - "step": 2542 - }, - { - "epoch": 1.9323708206686931, - "grad_norm": 1.5165303945541382, - "learning_rate": 1.5428076978817564e-06, - "loss": 0.3085063099861145, - "mean_token_accuracy": 0.888705849647522, - "num_tokens": 22681482.0, - "step": 2543 - }, - { - "epoch": 1.9331306990881458, - "grad_norm": 2.3556196689605713, - "learning_rate": 1.5408732331433596e-06, - "loss": 0.44008776545524597, - "mean_token_accuracy": 0.8578170537948608, - "num_tokens": 22686952.0, - "step": 2544 - }, - { - "epoch": 1.9338905775075987, - "grad_norm": 2.9572882652282715, - "learning_rate": 1.538939441557048e-06, - "loss": 0.3779261112213135, - "mean_token_accuracy": 0.8657241463661194, - "num_tokens": 22691211.0, - "step": 2545 - }, - { - "epoch": 1.9346504559270516, - "grad_norm": 2.373473644256592, - "learning_rate": 1.5370063244800326e-06, - "loss": 0.4113072454929352, - "mean_token_accuracy": 0.872116208076477, - "num_tokens": 22697442.0, - "step": 2546 - }, - { - "epoch": 1.9354103343465046, - "grad_norm": 2.270207643508911, - "learning_rate": 1.5350738832690479e-06, - "loss": 0.4021070897579193, - "mean_token_accuracy": 0.8750372529029846, - "num_tokens": 22703693.0, - "step": 2547 - }, - { - "epoch": 1.9361702127659575, - "grad_norm": 2.429445266723633, - "learning_rate": 1.5331421192803565e-06, - "loss": 0.40210235118865967, - "mean_token_accuracy": 0.8593704104423523, - "num_tokens": 22709285.0, - "step": 2548 - }, - { - "epoch": 1.9369300911854104, - "grad_norm": 1.4576458930969238, - "learning_rate": 1.5312110338697427e-06, - "loss": 0.44822201132774353, - "mean_token_accuracy": 0.8737322688102722, - "num_tokens": 22723743.0, - "step": 2549 - }, - { - "epoch": 1.9376899696048633, - "grad_norm": 2.1008098125457764, - "learning_rate": 1.5292806283925192e-06, - "loss": 0.3514235019683838, - "mean_token_accuracy": 0.8689005374908447, - "num_tokens": 22730135.0, - "step": 2550 - }, - { - "epoch": 1.938449848024316, - "grad_norm": 1.9786806106567383, - "learning_rate": 1.5273509042035172e-06, - "loss": 0.4483771324157715, - "mean_token_accuracy": 0.8353633880615234, - "num_tokens": 22738717.0, - "step": 2551 - }, - { - "epoch": 1.939209726443769, - "grad_norm": 1.0649693012237549, - "learning_rate": 1.5254218626570927e-06, - "loss": 0.30712205171585083, - "mean_token_accuracy": 0.8802675008773804, - "num_tokens": 22757346.0, - "step": 2552 - }, - { - "epoch": 1.9399696048632218, - "grad_norm": 3.0401108264923096, - "learning_rate": 1.5234935051071193e-06, - "loss": 0.5213959217071533, - "mean_token_accuracy": 0.8249514102935791, - "num_tokens": 22762169.0, - "step": 2553 - }, - { - "epoch": 1.940729483282675, - "grad_norm": 2.892486572265625, - "learning_rate": 1.521565832906994e-06, - "loss": 0.5694394111633301, - "mean_token_accuracy": 0.8139263391494751, - "num_tokens": 22767824.0, - "step": 2554 - }, - { - "epoch": 1.9414893617021276, - "grad_norm": 1.6187207698822021, - "learning_rate": 1.519638847409632e-06, - "loss": 0.46748271584510803, - "mean_token_accuracy": 0.8541051149368286, - "num_tokens": 22778195.0, - "step": 2555 - }, - { - "epoch": 1.9422492401215805, - "grad_norm": 1.3857731819152832, - "learning_rate": 1.5177125499674639e-06, - "loss": 0.35661786794662476, - "mean_token_accuracy": 0.8711516857147217, - "num_tokens": 22792353.0, - "step": 2556 - }, - { - "epoch": 1.9430091185410334, - "grad_norm": 1.108441710472107, - "learning_rate": 1.515786941932441e-06, - "loss": 0.3537200391292572, - "mean_token_accuracy": 0.8739079833030701, - "num_tokens": 22813185.0, - "step": 2557 - }, - { - "epoch": 1.9437689969604863, - "grad_norm": 2.0528404712677, - "learning_rate": 1.5138620246560295e-06, - "loss": 0.4161028265953064, - "mean_token_accuracy": 0.8385938405990601, - "num_tokens": 22821227.0, - "step": 2558 - }, - { - "epoch": 1.9445288753799392, - "grad_norm": 1.5123628377914429, - "learning_rate": 1.5119377994892095e-06, - "loss": 0.4420986473560333, - "mean_token_accuracy": 0.8664361834526062, - "num_tokens": 22835064.0, - "step": 2559 - }, - { - "epoch": 1.9452887537993921, - "grad_norm": 2.5354838371276855, - "learning_rate": 1.5100142677824752e-06, - "loss": 0.3837323784828186, - "mean_token_accuracy": 0.8607655763626099, - "num_tokens": 22840455.0, - "step": 2560 - }, - { - "epoch": 1.946048632218845, - "grad_norm": 1.1354057788848877, - "learning_rate": 1.5080914308858375e-06, - "loss": 0.39776813983917236, - "mean_token_accuracy": 0.8586497902870178, - "num_tokens": 22858828.0, - "step": 2561 - }, - { - "epoch": 1.9468085106382977, - "grad_norm": 1.576740026473999, - "learning_rate": 1.5061692901488161e-06, - "loss": 0.3167848289012909, - "mean_token_accuracy": 0.8876185417175293, - "num_tokens": 22868674.0, - "step": 2562 - }, - { - "epoch": 1.9475683890577509, - "grad_norm": 1.4835401773452759, - "learning_rate": 1.5042478469204437e-06, - "loss": 0.44950318336486816, - "mean_token_accuracy": 0.8526639342308044, - "num_tokens": 22883019.0, - "step": 2563 - }, - { - "epoch": 1.9483282674772036, - "grad_norm": 1.617073655128479, - "learning_rate": 1.502327102549262e-06, - "loss": 0.45711010694503784, - "mean_token_accuracy": 0.834361732006073, - "num_tokens": 22896834.0, - "step": 2564 - }, - { - "epoch": 1.9490881458966567, - "grad_norm": 1.3348414897918701, - "learning_rate": 1.5004070583833252e-06, - "loss": 0.3691314458847046, - "mean_token_accuracy": 0.8779371380805969, - "num_tokens": 22912350.0, - "step": 2565 - }, - { - "epoch": 1.9498480243161094, - "grad_norm": 1.711234450340271, - "learning_rate": 1.4984877157701932e-06, - "loss": 0.38726937770843506, - "mean_token_accuracy": 0.8704015016555786, - "num_tokens": 22922575.0, - "step": 2566 - }, - { - "epoch": 1.9506079027355623, - "grad_norm": 2.4587950706481934, - "learning_rate": 1.4965690760569346e-06, - "loss": 0.4455464482307434, - "mean_token_accuracy": 0.8481032252311707, - "num_tokens": 22928717.0, - "step": 2567 - }, - { - "epoch": 1.9513677811550152, - "grad_norm": 2.4189560413360596, - "learning_rate": 1.4946511405901237e-06, - "loss": 0.4120418429374695, - "mean_token_accuracy": 0.8519487380981445, - "num_tokens": 22934977.0, - "step": 2568 - }, - { - "epoch": 1.952127659574468, - "grad_norm": 1.2503050565719604, - "learning_rate": 1.4927339107158437e-06, - "loss": 0.4434332251548767, - "mean_token_accuracy": 0.8448144793510437, - "num_tokens": 22950061.0, - "step": 2569 - }, - { - "epoch": 1.952887537993921, - "grad_norm": 1.788493275642395, - "learning_rate": 1.4908173877796784e-06, - "loss": 0.49203023314476013, - "mean_token_accuracy": 0.8601495623588562, - "num_tokens": 22961838.0, - "step": 2570 - }, - { - "epoch": 1.9536474164133737, - "grad_norm": 1.4260050058364868, - "learning_rate": 1.4889015731267186e-06, - "loss": 0.3286570906639099, - "mean_token_accuracy": 0.882429838180542, - "num_tokens": 22973192.0, - "step": 2571 - }, - { - "epoch": 1.9544072948328268, - "grad_norm": 1.6754822731018066, - "learning_rate": 1.486986468101555e-06, - "loss": 0.34655290842056274, - "mean_token_accuracy": 0.8807861804962158, - "num_tokens": 22983661.0, - "step": 2572 - }, - { - "epoch": 1.9551671732522795, - "grad_norm": 1.9064570665359497, - "learning_rate": 1.4850720740482842e-06, - "loss": 0.34020254015922546, - "mean_token_accuracy": 0.86677086353302, - "num_tokens": 22991231.0, - "step": 2573 - }, - { - "epoch": 1.9559270516717326, - "grad_norm": 1.977444052696228, - "learning_rate": 1.4831583923105e-06, - "loss": 0.21505260467529297, - "mean_token_accuracy": 0.921241819858551, - "num_tokens": 22996828.0, - "step": 2574 - }, - { - "epoch": 1.9566869300911853, - "grad_norm": 1.1019235849380493, - "learning_rate": 1.481245424231298e-06, - "loss": 0.3804295063018799, - "mean_token_accuracy": 0.8582668900489807, - "num_tokens": 23016018.0, - "step": 2575 - }, - { - "epoch": 1.9574468085106385, - "grad_norm": 1.7943179607391357, - "learning_rate": 1.4793331711532743e-06, - "loss": 0.38565245270729065, - "mean_token_accuracy": 0.8599048256874084, - "num_tokens": 23024461.0, - "step": 2576 - }, - { - "epoch": 1.9582066869300911, - "grad_norm": 2.273824453353882, - "learning_rate": 1.4774216344185204e-06, - "loss": 0.46297723054885864, - "mean_token_accuracy": 0.8294345140457153, - "num_tokens": 23031687.0, - "step": 2577 - }, - { - "epoch": 1.958966565349544, - "grad_norm": 2.308509111404419, - "learning_rate": 1.4755108153686275e-06, - "loss": 0.4366525411605835, - "mean_token_accuracy": 0.8515903949737549, - "num_tokens": 23037072.0, - "step": 2578 - }, - { - "epoch": 1.959726443768997, - "grad_norm": 2.069028377532959, - "learning_rate": 1.4736007153446803e-06, - "loss": 0.33900877833366394, - "mean_token_accuracy": 0.8937177658081055, - "num_tokens": 23043207.0, - "step": 2579 - }, - { - "epoch": 1.9604863221884499, - "grad_norm": 2.905163288116455, - "learning_rate": 1.4716913356872614e-06, - "loss": 0.3708382844924927, - "mean_token_accuracy": 0.8936747312545776, - "num_tokens": 23047020.0, - "step": 2580 - }, - { - "epoch": 1.9612462006079028, - "grad_norm": 2.4153175354003906, - "learning_rate": 1.4697826777364478e-06, - "loss": 0.473562091588974, - "mean_token_accuracy": 0.8350275158882141, - "num_tokens": 23053282.0, - "step": 2581 - }, - { - "epoch": 1.9620060790273555, - "grad_norm": 2.21589994430542, - "learning_rate": 1.467874742831808e-06, - "loss": 0.3812660276889801, - "mean_token_accuracy": 0.8623865842819214, - "num_tokens": 23059399.0, - "step": 2582 - }, - { - "epoch": 1.9627659574468086, - "grad_norm": 1.0847623348236084, - "learning_rate": 1.4659675323124037e-06, - "loss": 0.3846944570541382, - "mean_token_accuracy": 0.8633466958999634, - "num_tokens": 23081005.0, - "step": 2583 - }, - { - "epoch": 1.9635258358662613, - "grad_norm": 1.8754645586013794, - "learning_rate": 1.46406104751679e-06, - "loss": 0.3460300862789154, - "mean_token_accuracy": 0.8757443428039551, - "num_tokens": 23088710.0, - "step": 2584 - }, - { - "epoch": 1.9642857142857144, - "grad_norm": 2.13075852394104, - "learning_rate": 1.462155289783011e-06, - "loss": 0.3060935139656067, - "mean_token_accuracy": 0.9070644378662109, - "num_tokens": 23094862.0, - "step": 2585 - }, - { - "epoch": 1.965045592705167, - "grad_norm": 2.9674458503723145, - "learning_rate": 1.4602502604486e-06, - "loss": 0.4464406371116638, - "mean_token_accuracy": 0.8497441411018372, - "num_tokens": 23099821.0, - "step": 2586 - }, - { - "epoch": 1.96580547112462, - "grad_norm": 1.9171007871627808, - "learning_rate": 1.45834596085058e-06, - "loss": 0.3905114531517029, - "mean_token_accuracy": 0.8564352989196777, - "num_tokens": 23107804.0, - "step": 2587 - }, - { - "epoch": 1.966565349544073, - "grad_norm": 2.0817408561706543, - "learning_rate": 1.456442392325463e-06, - "loss": 0.3903818130493164, - "mean_token_accuracy": 0.8671162128448486, - "num_tokens": 23115224.0, - "step": 2588 - }, - { - "epoch": 1.9673252279635258, - "grad_norm": 2.6379549503326416, - "learning_rate": 1.4545395562092467e-06, - "loss": 0.22965987026691437, - "mean_token_accuracy": 0.9160916805267334, - "num_tokens": 23119184.0, - "step": 2589 - }, - { - "epoch": 1.9680851063829787, - "grad_norm": 2.525221824645996, - "learning_rate": 1.4526374538374133e-06, - "loss": 0.4132574498653412, - "mean_token_accuracy": 0.8486990332603455, - "num_tokens": 23124679.0, - "step": 2590 - }, - { - "epoch": 1.9688449848024316, - "grad_norm": 2.0362391471862793, - "learning_rate": 1.4507360865449318e-06, - "loss": 0.29624345898628235, - "mean_token_accuracy": 0.888127863407135, - "num_tokens": 23130756.0, - "step": 2591 - }, - { - "epoch": 1.9696048632218845, - "grad_norm": 1.5150481462478638, - "learning_rate": 1.4488354556662553e-06, - "loss": 0.3852264881134033, - "mean_token_accuracy": 0.8532775640487671, - "num_tokens": 23141597.0, - "step": 2592 - }, - { - "epoch": 1.9703647416413372, - "grad_norm": 1.5255193710327148, - "learning_rate": 1.4469355625353199e-06, - "loss": 0.37015780806541443, - "mean_token_accuracy": 0.8669752478599548, - "num_tokens": 23152487.0, - "step": 2593 - }, - { - "epoch": 1.9711246200607904, - "grad_norm": 1.1780041456222534, - "learning_rate": 1.4450364084855433e-06, - "loss": 0.34421291947364807, - "mean_token_accuracy": 0.8593694567680359, - "num_tokens": 23168769.0, - "step": 2594 - }, - { - "epoch": 1.971884498480243, - "grad_norm": 2.4549946784973145, - "learning_rate": 1.4431379948498254e-06, - "loss": 0.4000544548034668, - "mean_token_accuracy": 0.8551953434944153, - "num_tokens": 23175428.0, - "step": 2595 - }, - { - "epoch": 1.9726443768996962, - "grad_norm": 2.374192476272583, - "learning_rate": 1.4412403229605453e-06, - "loss": 0.31329840421676636, - "mean_token_accuracy": 0.8917277455329895, - "num_tokens": 23180678.0, - "step": 2596 - }, - { - "epoch": 1.9734042553191489, - "grad_norm": 1.268515706062317, - "learning_rate": 1.4393433941495638e-06, - "loss": 0.34808623790740967, - "mean_token_accuracy": 0.8726245164871216, - "num_tokens": 23194733.0, - "step": 2597 - }, - { - "epoch": 1.9741641337386018, - "grad_norm": 2.0898988246917725, - "learning_rate": 1.4374472097482156e-06, - "loss": 0.45849233865737915, - "mean_token_accuracy": 0.8414266109466553, - "num_tokens": 23202211.0, - "step": 2598 - }, - { - "epoch": 1.9749240121580547, - "grad_norm": 2.1497802734375, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.4304521977901459, - "mean_token_accuracy": 0.8502874374389648, - "num_tokens": 23209623.0, - "step": 2599 - }, - { - "epoch": 1.9756838905775076, - "grad_norm": 1.821786880493164, - "learning_rate": 1.4336570794971643e-06, - "loss": 0.3910462558269501, - "mean_token_accuracy": 0.8962477445602417, - "num_tokens": 23218904.0, - "step": 2600 - }, - { - "epoch": 1.9764437689969605, - "grad_norm": 2.2523093223571777, - "learning_rate": 1.4317631363075186e-06, - "loss": 0.3456020951271057, - "mean_token_accuracy": 0.8703117370605469, - "num_tokens": 23225602.0, - "step": 2601 - }, - { - "epoch": 1.9772036474164134, - "grad_norm": 1.6920030117034912, - "learning_rate": 1.4298699428476236e-06, - "loss": 0.4629668593406677, - "mean_token_accuracy": 0.841956615447998, - "num_tokens": 23236812.0, - "step": 2602 - }, - { - "epoch": 1.9779635258358663, - "grad_norm": 1.8796344995498657, - "learning_rate": 1.427977500446199e-06, - "loss": 0.3302173316478729, - "mean_token_accuracy": 0.8769404888153076, - "num_tokens": 23245851.0, - "step": 2603 - }, - { - "epoch": 1.978723404255319, - "grad_norm": 2.4003775119781494, - "learning_rate": 1.4260858104314299e-06, - "loss": 0.48402607440948486, - "mean_token_accuracy": 0.8477497100830078, - "num_tokens": 23252429.0, - "step": 2604 - }, - { - "epoch": 1.9794832826747721, - "grad_norm": 3.576800584793091, - "learning_rate": 1.4241948741309783e-06, - "loss": 0.2943669259548187, - "mean_token_accuracy": 0.8933546543121338, - "num_tokens": 23255431.0, - "step": 2605 - }, - { - "epoch": 1.9802431610942248, - "grad_norm": 2.7589938640594482, - "learning_rate": 1.4223046928719764e-06, - "loss": 0.5138746500015259, - "mean_token_accuracy": 0.817468523979187, - "num_tokens": 23261351.0, - "step": 2606 - }, - { - "epoch": 1.981003039513678, - "grad_norm": 1.6950130462646484, - "learning_rate": 1.420415267981026e-06, - "loss": 0.2744991183280945, - "mean_token_accuracy": 0.9005721211433411, - "num_tokens": 23269482.0, - "step": 2607 - }, - { - "epoch": 1.9817629179331306, - "grad_norm": 1.5962934494018555, - "learning_rate": 1.418526600784198e-06, - "loss": 0.4629114270210266, - "mean_token_accuracy": 0.8337699174880981, - "num_tokens": 23279796.0, - "step": 2608 - }, - { - "epoch": 1.9825227963525835, - "grad_norm": 1.4962197542190552, - "learning_rate": 1.4166386926070322e-06, - "loss": 0.4217689633369446, - "mean_token_accuracy": 0.8445580005645752, - "num_tokens": 23293050.0, - "step": 2609 - }, - { - "epoch": 1.9832826747720365, - "grad_norm": 1.4243721961975098, - "learning_rate": 1.414751544774535e-06, - "loss": 0.4888152480125427, - "mean_token_accuracy": 0.8298524022102356, - "num_tokens": 23308501.0, - "step": 2610 - }, - { - "epoch": 1.9840425531914894, - "grad_norm": 1.5776121616363525, - "learning_rate": 1.412865158611179e-06, - "loss": 0.3156965970993042, - "mean_token_accuracy": 0.8773540258407593, - "num_tokens": 23317401.0, - "step": 2611 - }, - { - "epoch": 1.9848024316109423, - "grad_norm": 1.4690552949905396, - "learning_rate": 1.4109795354409045e-06, - "loss": 0.35854774713516235, - "mean_token_accuracy": 0.869156002998352, - "num_tokens": 23328891.0, - "step": 2612 - }, - { - "epoch": 1.9855623100303952, - "grad_norm": 1.5036180019378662, - "learning_rate": 1.4090946765871105e-06, - "loss": 0.3579009771347046, - "mean_token_accuracy": 0.8698509931564331, - "num_tokens": 23340473.0, - "step": 2613 - }, - { - "epoch": 1.986322188449848, - "grad_norm": 2.0811538696289062, - "learning_rate": 1.4072105833726685e-06, - "loss": 0.2905905246734619, - "mean_token_accuracy": 0.9131759405136108, - "num_tokens": 23346480.0, - "step": 2614 - }, - { - "epoch": 1.9870820668693008, - "grad_norm": 1.2866275310516357, - "learning_rate": 1.4053272571199037e-06, - "loss": 0.4091147184371948, - "mean_token_accuracy": 0.8537255525588989, - "num_tokens": 23361957.0, - "step": 2615 - }, - { - "epoch": 1.987841945288754, - "grad_norm": 1.439497470855713, - "learning_rate": 1.4034446991506084e-06, - "loss": 0.4888972342014313, - "mean_token_accuracy": 0.8451695442199707, - "num_tokens": 23374936.0, - "step": 2616 - }, - { - "epoch": 1.9886018237082066, - "grad_norm": 1.758204698562622, - "learning_rate": 1.401562910786034e-06, - "loss": 0.4976118803024292, - "mean_token_accuracy": 0.8346713781356812, - "num_tokens": 23386102.0, - "step": 2617 - }, - { - "epoch": 1.9893617021276597, - "grad_norm": 1.436486840248108, - "learning_rate": 1.3996818933468926e-06, - "loss": 0.42407113313674927, - "mean_token_accuracy": 0.8529444932937622, - "num_tokens": 23398645.0, - "step": 2618 - }, - { - "epoch": 1.9901215805471124, - "grad_norm": 2.1466588973999023, - "learning_rate": 1.397801648153354e-06, - "loss": 0.45519331097602844, - "mean_token_accuracy": 0.8460411429405212, - "num_tokens": 23406162.0, - "step": 2619 - }, - { - "epoch": 1.9908814589665653, - "grad_norm": 2.0492005348205566, - "learning_rate": 1.395922176525047e-06, - "loss": 0.31093084812164307, - "mean_token_accuracy": 0.8927264213562012, - "num_tokens": 23412051.0, - "step": 2620 - }, - { - "epoch": 1.9916413373860182, - "grad_norm": 2.2639048099517822, - "learning_rate": 1.3940434797810567e-06, - "loss": 0.3804079592227936, - "mean_token_accuracy": 0.8720212578773499, - "num_tokens": 23418252.0, - "step": 2621 - }, - { - "epoch": 1.9924012158054711, - "grad_norm": 1.9541687965393066, - "learning_rate": 1.3921655592399256e-06, - "loss": 0.38776344060897827, - "mean_token_accuracy": 0.858753502368927, - "num_tokens": 23425901.0, - "step": 2622 - }, - { - "epoch": 1.993161094224924, - "grad_norm": 1.5119032859802246, - "learning_rate": 1.3902884162196509e-06, - "loss": 0.39581215381622314, - "mean_token_accuracy": 0.8539663553237915, - "num_tokens": 23439390.0, - "step": 2623 - }, - { - "epoch": 1.993920972644377, - "grad_norm": 2.1608591079711914, - "learning_rate": 1.388412052037682e-06, - "loss": 0.41801220178604126, - "mean_token_accuracy": 0.8703387975692749, - "num_tokens": 23445725.0, - "step": 2624 - }, - { - "epoch": 1.9946808510638299, - "grad_norm": 2.463165521621704, - "learning_rate": 1.3865364680109239e-06, - "loss": 0.3252835273742676, - "mean_token_accuracy": 0.9031686186790466, - "num_tokens": 23451122.0, - "step": 2625 - }, - { - "epoch": 1.9954407294832825, - "grad_norm": 1.1901201009750366, - "learning_rate": 1.384661665455736e-06, - "loss": 0.3358447253704071, - "mean_token_accuracy": 0.8767676949501038, - "num_tokens": 23467381.0, - "step": 2626 - }, - { - "epoch": 1.9962006079027357, - "grad_norm": 1.3035757541656494, - "learning_rate": 1.3827876456879247e-06, - "loss": 0.3736562430858612, - "mean_token_accuracy": 0.849855899810791, - "num_tokens": 23482192.0, - "step": 2627 - }, - { - "epoch": 1.9969604863221884, - "grad_norm": 1.8807034492492676, - "learning_rate": 1.3809144100227483e-06, - "loss": 0.45943766832351685, - "mean_token_accuracy": 0.8456380367279053, - "num_tokens": 23495167.0, - "step": 2628 - }, - { - "epoch": 1.9977203647416415, - "grad_norm": 2.3645784854888916, - "learning_rate": 1.3790419597749198e-06, - "loss": 0.4271511435508728, - "mean_token_accuracy": 0.846099853515625, - "num_tokens": 23500790.0, - "step": 2629 - }, - { - "epoch": 1.9984802431610942, - "grad_norm": 1.8451792001724243, - "learning_rate": 1.3771702962585928e-06, - "loss": 0.38092344999313354, - "mean_token_accuracy": 0.8641276359558105, - "num_tokens": 23508845.0, - "step": 2630 - }, - { - "epoch": 1.999240121580547, - "grad_norm": 1.1115045547485352, - "learning_rate": 1.3752994207873743e-06, - "loss": 0.35954269766807556, - "mean_token_accuracy": 0.8642125129699707, - "num_tokens": 23527929.0, - "step": 2631 - }, - { - "epoch": 2.0, - "grad_norm": 1.406253457069397, - "learning_rate": 1.373429334674317e-06, - "loss": 0.33467042446136475, - "mean_token_accuracy": 0.8713197708129883, - "num_tokens": 23539356.0, - "step": 2632 - }, - { - "epoch": 2.0007598784194527, - "grad_norm": 2.8150978088378906, - "learning_rate": 1.3715600392319186e-06, - "loss": 0.22929656505584717, - "mean_token_accuracy": 0.9197485446929932, - "num_tokens": 23543746.0, - "step": 2633 - }, - { - "epoch": 2.001519756838906, - "grad_norm": 2.6291964054107666, - "learning_rate": 1.369691535772123e-06, - "loss": 0.290000855922699, - "mean_token_accuracy": 0.8979663848876953, - "num_tokens": 23548633.0, - "step": 2634 - }, - { - "epoch": 2.0022796352583585, - "grad_norm": 1.724357008934021, - "learning_rate": 1.3678238256063193e-06, - "loss": 0.3717018663883209, - "mean_token_accuracy": 0.8743406534194946, - "num_tokens": 23557187.0, - "step": 2635 - }, - { - "epoch": 2.0030395136778116, - "grad_norm": 2.3801965713500977, - "learning_rate": 1.3659569100453346e-06, - "loss": 0.3452329635620117, - "mean_token_accuracy": 0.8799462914466858, - "num_tokens": 23563321.0, - "step": 2636 - }, - { - "epoch": 2.0037993920972643, - "grad_norm": 1.8925955295562744, - "learning_rate": 1.3640907903994455e-06, - "loss": 0.32880955934524536, - "mean_token_accuracy": 0.888347864151001, - "num_tokens": 23570571.0, - "step": 2637 - }, - { - "epoch": 2.0045592705167175, - "grad_norm": 1.0761849880218506, - "learning_rate": 1.3622254679783665e-06, - "loss": 0.395224004983902, - "mean_token_accuracy": 0.8637001514434814, - "num_tokens": 23589504.0, - "step": 2638 - }, - { - "epoch": 2.00531914893617, - "grad_norm": 2.1172127723693848, - "learning_rate": 1.3603609440912508e-06, - "loss": 0.32195356488227844, - "mean_token_accuracy": 0.8984324932098389, - "num_tokens": 23595586.0, - "step": 2639 - }, - { - "epoch": 2.0060790273556233, - "grad_norm": 2.127723217010498, - "learning_rate": 1.3584972200466936e-06, - "loss": 0.4710606634616852, - "mean_token_accuracy": 0.8563182950019836, - "num_tokens": 23602747.0, - "step": 2640 - }, - { - "epoch": 2.006838905775076, - "grad_norm": 1.9752192497253418, - "learning_rate": 1.356634297152729e-06, - "loss": 0.24204617738723755, - "mean_token_accuracy": 0.9082983136177063, - "num_tokens": 23609005.0, - "step": 2641 - }, - { - "epoch": 2.007598784194529, - "grad_norm": 2.5435397624969482, - "learning_rate": 1.3547721767168273e-06, - "loss": 0.16702288389205933, - "mean_token_accuracy": 0.9353867769241333, - "num_tokens": 23612852.0, - "step": 2642 - }, - { - "epoch": 2.0083586626139818, - "grad_norm": 1.8113304376602173, - "learning_rate": 1.3529108600458967e-06, - "loss": 0.4245433509349823, - "mean_token_accuracy": 0.8446527719497681, - "num_tokens": 23621462.0, - "step": 2643 - }, - { - "epoch": 2.0091185410334345, - "grad_norm": 1.0438088178634644, - "learning_rate": 1.3510503484462807e-06, - "loss": 0.3710743188858032, - "mean_token_accuracy": 0.8731123208999634, - "num_tokens": 23642029.0, - "step": 2644 - }, - { - "epoch": 2.0098784194528876, - "grad_norm": 1.9650516510009766, - "learning_rate": 1.349190643223758e-06, - "loss": 0.32384324073791504, - "mean_token_accuracy": 0.8859044313430786, - "num_tokens": 23648970.0, - "step": 2645 - }, - { - "epoch": 2.0106382978723403, - "grad_norm": 1.4213180541992188, - "learning_rate": 1.347331745683542e-06, - "loss": 0.42391857504844666, - "mean_token_accuracy": 0.8568997383117676, - "num_tokens": 23663012.0, - "step": 2646 - }, - { - "epoch": 2.0113981762917934, - "grad_norm": 1.852386236190796, - "learning_rate": 1.3454736571302761e-06, - "loss": 0.37283188104629517, - "mean_token_accuracy": 0.9096506834030151, - "num_tokens": 23671632.0, - "step": 2647 - }, - { - "epoch": 2.012158054711246, - "grad_norm": 1.8350872993469238, - "learning_rate": 1.3436163788680411e-06, - "loss": 0.21148793399333954, - "mean_token_accuracy": 0.9306647181510925, - "num_tokens": 23678554.0, - "step": 2648 - }, - { - "epoch": 2.012917933130699, - "grad_norm": 1.8285188674926758, - "learning_rate": 1.3417599122003464e-06, - "loss": 0.2638583183288574, - "mean_token_accuracy": 0.904695987701416, - "num_tokens": 23686905.0, - "step": 2649 - }, - { - "epoch": 2.013677811550152, - "grad_norm": 1.1955424547195435, - "learning_rate": 1.3399042584301298e-06, - "loss": 0.30598434805870056, - "mean_token_accuracy": 0.8953701257705688, - "num_tokens": 23702734.0, - "step": 2650 - }, - { - "epoch": 2.014437689969605, - "grad_norm": 1.5378512144088745, - "learning_rate": 1.3380494188597603e-06, - "loss": 0.33754611015319824, - "mean_token_accuracy": 0.9063926935195923, - "num_tokens": 23715891.0, - "step": 2651 - }, - { - "epoch": 2.0151975683890577, - "grad_norm": 1.6957111358642578, - "learning_rate": 1.3361953947910394e-06, - "loss": 0.26302939653396606, - "mean_token_accuracy": 0.90192711353302, - "num_tokens": 23724034.0, - "step": 2652 - }, - { - "epoch": 2.015957446808511, - "grad_norm": 1.1756837368011475, - "learning_rate": 1.334342187525189e-06, - "loss": 0.3312695622444153, - "mean_token_accuracy": 0.870500385761261, - "num_tokens": 23741241.0, - "step": 2653 - }, - { - "epoch": 2.0167173252279635, - "grad_norm": 1.027145266532898, - "learning_rate": 1.3324897983628621e-06, - "loss": 0.2534530758857727, - "mean_token_accuracy": 0.894199550151825, - "num_tokens": 23758399.0, - "step": 2654 - }, - { - "epoch": 2.0174772036474162, - "grad_norm": 2.2585113048553467, - "learning_rate": 1.330638228604137e-06, - "loss": 0.4558389186859131, - "mean_token_accuracy": 0.8372241258621216, - "num_tokens": 23766871.0, - "step": 2655 - }, - { - "epoch": 2.0182370820668694, - "grad_norm": 1.886893630027771, - "learning_rate": 1.3287874795485168e-06, - "loss": 0.29894912242889404, - "mean_token_accuracy": 0.9086098670959473, - "num_tokens": 23774935.0, - "step": 2656 - }, - { - "epoch": 2.018996960486322, - "grad_norm": 2.082537889480591, - "learning_rate": 1.3269375524949286e-06, - "loss": 0.39323803782463074, - "mean_token_accuracy": 0.8598287105560303, - "num_tokens": 23781303.0, - "step": 2657 - }, - { - "epoch": 2.019756838905775, - "grad_norm": 1.7059803009033203, - "learning_rate": 1.3250884487417227e-06, - "loss": 0.17909850180149078, - "mean_token_accuracy": 0.9276094436645508, - "num_tokens": 23789148.0, - "step": 2658 - }, - { - "epoch": 2.020516717325228, - "grad_norm": 2.150275945663452, - "learning_rate": 1.3232401695866686e-06, - "loss": 0.3707781434059143, - "mean_token_accuracy": 0.8587700128555298, - "num_tokens": 23795484.0, - "step": 2659 - }, - { - "epoch": 2.021276595744681, - "grad_norm": 2.0554518699645996, - "learning_rate": 1.321392716326963e-06, - "loss": 0.33217954635620117, - "mean_token_accuracy": 0.874828577041626, - "num_tokens": 23802968.0, - "step": 2660 - }, - { - "epoch": 2.0220364741641337, - "grad_norm": 2.4556071758270264, - "learning_rate": 1.3195460902592193e-06, - "loss": 0.2790899872779846, - "mean_token_accuracy": 0.9071618914604187, - "num_tokens": 23807788.0, - "step": 2661 - }, - { - "epoch": 2.022796352583587, - "grad_norm": 1.7501509189605713, - "learning_rate": 1.3177002926794685e-06, - "loss": 0.3080750107765198, - "mean_token_accuracy": 0.8942672610282898, - "num_tokens": 23816023.0, - "step": 2662 - }, - { - "epoch": 2.0235562310030395, - "grad_norm": 1.3934804201126099, - "learning_rate": 1.3158553248831658e-06, - "loss": 0.286912202835083, - "mean_token_accuracy": 0.9284837245941162, - "num_tokens": 23827186.0, - "step": 2663 - }, - { - "epoch": 2.024316109422492, - "grad_norm": 1.2530465126037598, - "learning_rate": 1.3140111881651773e-06, - "loss": 0.2630627155303955, - "mean_token_accuracy": 0.9029854536056519, - "num_tokens": 23841399.0, - "step": 2664 - }, - { - "epoch": 2.0250759878419453, - "grad_norm": 1.3417384624481201, - "learning_rate": 1.312167883819791e-06, - "loss": 0.37794870138168335, - "mean_token_accuracy": 0.8722256422042847, - "num_tokens": 23856061.0, - "step": 2665 - }, - { - "epoch": 2.025835866261398, - "grad_norm": 2.234257698059082, - "learning_rate": 1.3103254131407082e-06, - "loss": 0.2739933133125305, - "mean_token_accuracy": 0.9055665135383606, - "num_tokens": 23861865.0, - "step": 2666 - }, - { - "epoch": 2.026595744680851, - "grad_norm": 1.4187006950378418, - "learning_rate": 1.308483777421046e-06, - "loss": 0.24370817840099335, - "mean_token_accuracy": 0.9145886301994324, - "num_tokens": 23873632.0, - "step": 2667 - }, - { - "epoch": 2.027355623100304, - "grad_norm": 2.3645882606506348, - "learning_rate": 1.3066429779533352e-06, - "loss": 0.23659822344779968, - "mean_token_accuracy": 0.9209753274917603, - "num_tokens": 23878866.0, - "step": 2668 - }, - { - "epoch": 2.028115501519757, - "grad_norm": 1.4782226085662842, - "learning_rate": 1.3048030160295196e-06, - "loss": 0.3353138267993927, - "mean_token_accuracy": 0.8747807741165161, - "num_tokens": 23891089.0, - "step": 2669 - }, - { - "epoch": 2.0288753799392096, - "grad_norm": 2.051754951477051, - "learning_rate": 1.3029638929409555e-06, - "loss": 0.2905973196029663, - "mean_token_accuracy": 0.887441873550415, - "num_tokens": 23897653.0, - "step": 2670 - }, - { - "epoch": 2.0296352583586628, - "grad_norm": 1.322279453277588, - "learning_rate": 1.3011256099784103e-06, - "loss": 0.3938416540622711, - "mean_token_accuracy": 0.8911079168319702, - "num_tokens": 23912525.0, - "step": 2671 - }, - { - "epoch": 2.0303951367781155, - "grad_norm": 1.87980318069458, - "learning_rate": 1.2992881684320627e-06, - "loss": 0.16637520492076874, - "mean_token_accuracy": 0.9472321271896362, - "num_tokens": 23918752.0, - "step": 2672 - }, - { - "epoch": 2.0311550151975686, - "grad_norm": 2.0867233276367188, - "learning_rate": 1.297451569591498e-06, - "loss": 0.37282776832580566, - "mean_token_accuracy": 0.8688399195671082, - "num_tokens": 23925918.0, - "step": 2673 - }, - { - "epoch": 2.0319148936170213, - "grad_norm": 1.129468560218811, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.33072173595428467, - "mean_token_accuracy": 0.8788217306137085, - "num_tokens": 23944702.0, - "step": 2674 - }, - { - "epoch": 2.032674772036474, - "grad_norm": 3.6016290187835693, - "learning_rate": 1.2937809051831102e-06, - "loss": 0.28343498706817627, - "mean_token_accuracy": 0.911794900894165, - "num_tokens": 23948417.0, - "step": 2675 - }, - { - "epoch": 2.033434650455927, - "grad_norm": 1.4904811382293701, - "learning_rate": 1.2919468421915008e-06, - "loss": 0.4072638750076294, - "mean_token_accuracy": 0.8615934252738953, - "num_tokens": 23963654.0, - "step": 2676 - }, - { - "epoch": 2.0341945288753798, - "grad_norm": 2.90740704536438, - "learning_rate": 1.2901136270580994e-06, - "loss": 0.3685106635093689, - "mean_token_accuracy": 0.8923419713973999, - "num_tokens": 23968608.0, - "step": 2677 - }, - { - "epoch": 2.034954407294833, - "grad_norm": 1.8772104978561401, - "learning_rate": 1.2882812610695305e-06, - "loss": 0.2947828471660614, - "mean_token_accuracy": 0.9065762758255005, - "num_tokens": 23978298.0, - "step": 2678 - }, - { - "epoch": 2.0357142857142856, - "grad_norm": 1.2135536670684814, - "learning_rate": 1.2864497455118152e-06, - "loss": 0.36015012860298157, - "mean_token_accuracy": 0.8481813073158264, - "num_tokens": 23995784.0, - "step": 2679 - }, - { - "epoch": 2.0364741641337387, - "grad_norm": 1.941889762878418, - "learning_rate": 1.2846190816703836e-06, - "loss": 0.3004198670387268, - "mean_token_accuracy": 0.8843618631362915, - "num_tokens": 24002651.0, - "step": 2680 - }, - { - "epoch": 2.0372340425531914, - "grad_norm": 1.8905075788497925, - "learning_rate": 1.2827892708300648e-06, - "loss": 0.26640570163726807, - "mean_token_accuracy": 0.9079146385192871, - "num_tokens": 24010400.0, - "step": 2681 - }, - { - "epoch": 2.0379939209726445, - "grad_norm": 1.2975934743881226, - "learning_rate": 1.280960314275092e-06, - "loss": 0.19093887507915497, - "mean_token_accuracy": 0.9277223348617554, - "num_tokens": 24021528.0, - "step": 2682 - }, - { - "epoch": 2.038753799392097, - "grad_norm": 1.6483098268508911, - "learning_rate": 1.279132213289096e-06, - "loss": 0.29260069131851196, - "mean_token_accuracy": 0.892486572265625, - "num_tokens": 24030470.0, - "step": 2683 - }, - { - "epoch": 2.0395136778115504, - "grad_norm": 1.6875916719436646, - "learning_rate": 1.2773049691551103e-06, - "loss": 0.3784627914428711, - "mean_token_accuracy": 0.8682783842086792, - "num_tokens": 24041608.0, - "step": 2684 - }, - { - "epoch": 2.040273556231003, - "grad_norm": 2.1055848598480225, - "learning_rate": 1.2754785831555617e-06, - "loss": 0.14676237106323242, - "mean_token_accuracy": 0.9532995223999023, - "num_tokens": 24046687.0, - "step": 2685 - }, - { - "epoch": 2.0410334346504557, - "grad_norm": 1.3862961530685425, - "learning_rate": 1.273653056572282e-06, - "loss": 0.34408485889434814, - "mean_token_accuracy": 0.8748919367790222, - "num_tokens": 24059147.0, - "step": 2686 - }, - { - "epoch": 2.041793313069909, - "grad_norm": 2.936876058578491, - "learning_rate": 1.2718283906864939e-06, - "loss": 0.2471027672290802, - "mean_token_accuracy": 0.9177526235580444, - "num_tokens": 24062963.0, - "step": 2687 - }, - { - "epoch": 2.0425531914893615, - "grad_norm": 1.3992520570755005, - "learning_rate": 1.2700045867788184e-06, - "loss": 0.421109139919281, - "mean_token_accuracy": 0.8664785623550415, - "num_tokens": 24077912.0, - "step": 2688 - }, - { - "epoch": 2.0433130699088147, - "grad_norm": 3.0531985759735107, - "learning_rate": 1.2681816461292715e-06, - "loss": 0.292591392993927, - "mean_token_accuracy": 0.8992351293563843, - "num_tokens": 24082058.0, - "step": 2689 - }, - { - "epoch": 2.0440729483282674, - "grad_norm": 1.4562251567840576, - "learning_rate": 1.2663595700172631e-06, - "loss": 0.39367130398750305, - "mean_token_accuracy": 0.8894597887992859, - "num_tokens": 24093954.0, - "step": 2690 - }, - { - "epoch": 2.0448328267477205, - "grad_norm": 1.9354028701782227, - "learning_rate": 1.2645383597215965e-06, - "loss": 0.28203579783439636, - "mean_token_accuracy": 0.9011955261230469, - "num_tokens": 24100590.0, - "step": 2691 - }, - { - "epoch": 2.045592705167173, - "grad_norm": 1.5010690689086914, - "learning_rate": 1.2627180165204671e-06, - "loss": 0.3463609516620636, - "mean_token_accuracy": 0.8978298306465149, - "num_tokens": 24111104.0, - "step": 2692 - }, - { - "epoch": 2.0463525835866263, - "grad_norm": 2.585813045501709, - "learning_rate": 1.2608985416914616e-06, - "loss": 0.2142711877822876, - "mean_token_accuracy": 0.9260460138320923, - "num_tokens": 24115301.0, - "step": 2693 - }, - { - "epoch": 2.047112462006079, - "grad_norm": 2.317268133163452, - "learning_rate": 1.259079936511558e-06, - "loss": 0.14454546570777893, - "mean_token_accuracy": 0.9498077034950256, - "num_tokens": 24120295.0, - "step": 2694 - }, - { - "epoch": 2.047872340425532, - "grad_norm": 1.966550350189209, - "learning_rate": 1.257262202257124e-06, - "loss": 0.20745311677455902, - "mean_token_accuracy": 0.9157166481018066, - "num_tokens": 24127158.0, - "step": 2695 - }, - { - "epoch": 2.048632218844985, - "grad_norm": 1.6521401405334473, - "learning_rate": 1.2554453402039124e-06, - "loss": 0.2547406256198883, - "mean_token_accuracy": 0.9356101751327515, - "num_tokens": 24135620.0, - "step": 2696 - }, - { - "epoch": 2.0493920972644375, - "grad_norm": 2.341756582260132, - "learning_rate": 1.2536293516270704e-06, - "loss": 0.35540008544921875, - "mean_token_accuracy": 0.874363899230957, - "num_tokens": 24141766.0, - "step": 2697 - }, - { - "epoch": 2.0501519756838906, - "grad_norm": 1.7938716411590576, - "learning_rate": 1.251814237801128e-06, - "loss": 0.37250861525535583, - "mean_token_accuracy": 0.8644422292709351, - "num_tokens": 24149997.0, - "step": 2698 - }, - { - "epoch": 2.0509118541033433, - "grad_norm": 2.0868122577667236, - "learning_rate": 1.2500000000000007e-06, - "loss": 0.44527092576026917, - "mean_token_accuracy": 0.8510264158248901, - "num_tokens": 24158208.0, - "step": 2699 - }, - { - "epoch": 2.0516717325227964, - "grad_norm": 2.412604808807373, - "learning_rate": 1.24818663949699e-06, - "loss": 0.19276219606399536, - "mean_token_accuracy": 0.9317681789398193, - "num_tokens": 24162905.0, - "step": 2700 - }, - { - "epoch": 2.052431610942249, - "grad_norm": 1.4488455057144165, - "learning_rate": 1.246374157564785e-06, - "loss": 0.3493705093860626, - "mean_token_accuracy": 0.9016396999359131, - "num_tokens": 24175852.0, - "step": 2701 - }, - { - "epoch": 2.0531914893617023, - "grad_norm": 2.1629185676574707, - "learning_rate": 1.2445625554754526e-06, - "loss": 0.30588388442993164, - "mean_token_accuracy": 0.8871392011642456, - "num_tokens": 24181507.0, - "step": 2702 - }, - { - "epoch": 2.053951367781155, - "grad_norm": 2.0489449501037598, - "learning_rate": 1.2427518345004459e-06, - "loss": 0.4578161835670471, - "mean_token_accuracy": 0.8498104214668274, - "num_tokens": 24191918.0, - "step": 2703 - }, - { - "epoch": 2.054711246200608, - "grad_norm": 2.063019037246704, - "learning_rate": 1.2409419959105981e-06, - "loss": 0.31680572032928467, - "mean_token_accuracy": 0.8809083700180054, - "num_tokens": 24199336.0, - "step": 2704 - }, - { - "epoch": 2.0554711246200608, - "grad_norm": 2.4594223499298096, - "learning_rate": 1.239133040976124e-06, - "loss": 0.3048282265663147, - "mean_token_accuracy": 0.8897095322608948, - "num_tokens": 24205118.0, - "step": 2705 - }, - { - "epoch": 2.056231003039514, - "grad_norm": 1.6359999179840088, - "learning_rate": 1.237324970966618e-06, - "loss": 0.4312370717525482, - "mean_token_accuracy": 0.8526142835617065, - "num_tokens": 24215792.0, - "step": 2706 - }, - { - "epoch": 2.0569908814589666, - "grad_norm": 1.5534536838531494, - "learning_rate": 1.2355177871510538e-06, - "loss": 0.3647908568382263, - "mean_token_accuracy": 0.8680631518363953, - "num_tokens": 24235325.0, - "step": 2707 - }, - { - "epoch": 2.0577507598784193, - "grad_norm": 2.4902515411376953, - "learning_rate": 1.2337114907977798e-06, - "loss": 0.3605276942253113, - "mean_token_accuracy": 0.8776376843452454, - "num_tokens": 24241502.0, - "step": 2708 - }, - { - "epoch": 2.0585106382978724, - "grad_norm": 1.7282993793487549, - "learning_rate": 1.2319060831745273e-06, - "loss": 0.38326722383499146, - "mean_token_accuracy": 0.8531644344329834, - "num_tokens": 24252665.0, - "step": 2709 - }, - { - "epoch": 2.059270516717325, - "grad_norm": 1.4213361740112305, - "learning_rate": 1.2301015655484006e-06, - "loss": 0.32221150398254395, - "mean_token_accuracy": 0.8890664577484131, - "num_tokens": 24266409.0, - "step": 2710 - }, - { - "epoch": 2.060030395136778, - "grad_norm": 2.6412453651428223, - "learning_rate": 1.2282979391858767e-06, - "loss": 0.20225220918655396, - "mean_token_accuracy": 0.9287782311439514, - "num_tokens": 24271069.0, - "step": 2711 - }, - { - "epoch": 2.060790273556231, - "grad_norm": 3.2601654529571533, - "learning_rate": 1.2264952053528145e-06, - "loss": 0.23259003460407257, - "mean_token_accuracy": 0.9290606379508972, - "num_tokens": 24274992.0, - "step": 2712 - }, - { - "epoch": 2.061550151975684, - "grad_norm": 1.6633410453796387, - "learning_rate": 1.2246933653144386e-06, - "loss": 0.355314165353775, - "mean_token_accuracy": 0.870380163192749, - "num_tokens": 24284917.0, - "step": 2713 - }, - { - "epoch": 2.0623100303951367, - "grad_norm": 2.9081318378448486, - "learning_rate": 1.2228924203353507e-06, - "loss": 0.38050833344459534, - "mean_token_accuracy": 0.8879997730255127, - "num_tokens": 24289694.0, - "step": 2714 - }, - { - "epoch": 2.06306990881459, - "grad_norm": 3.2404227256774902, - "learning_rate": 1.2210923716795233e-06, - "loss": 0.2502570152282715, - "mean_token_accuracy": 0.9150978922843933, - "num_tokens": 24293254.0, - "step": 2715 - }, - { - "epoch": 2.0638297872340425, - "grad_norm": 1.9262174367904663, - "learning_rate": 1.2192932206103e-06, - "loss": 0.26763200759887695, - "mean_token_accuracy": 0.9203122854232788, - "num_tokens": 24300881.0, - "step": 2716 - }, - { - "epoch": 2.0645896656534957, - "grad_norm": 1.6790109872817993, - "learning_rate": 1.2174949683903943e-06, - "loss": 0.22275440394878387, - "mean_token_accuracy": 0.9212621450424194, - "num_tokens": 24309288.0, - "step": 2717 - }, - { - "epoch": 2.0653495440729484, - "grad_norm": 1.8272414207458496, - "learning_rate": 1.2156976162818895e-06, - "loss": 0.3183424472808838, - "mean_token_accuracy": 0.8813169002532959, - "num_tokens": 24316980.0, - "step": 2718 - }, - { - "epoch": 2.066109422492401, - "grad_norm": 2.7388651371002197, - "learning_rate": 1.2139011655462338e-06, - "loss": 0.24794816970825195, - "mean_token_accuracy": 0.9109550714492798, - "num_tokens": 24321867.0, - "step": 2719 - }, - { - "epoch": 2.066869300911854, - "grad_norm": 1.4866925477981567, - "learning_rate": 1.2121056174442484e-06, - "loss": 0.24177205562591553, - "mean_token_accuracy": 0.9102780818939209, - "num_tokens": 24332874.0, - "step": 2720 - }, - { - "epoch": 2.067629179331307, - "grad_norm": 1.6006059646606445, - "learning_rate": 1.2103109732361178e-06, - "loss": 0.29220807552337646, - "mean_token_accuracy": 0.8947570323944092, - "num_tokens": 24342790.0, - "step": 2721 - }, - { - "epoch": 2.06838905775076, - "grad_norm": 2.2688677310943604, - "learning_rate": 1.208517234181391e-06, - "loss": 0.39247143268585205, - "mean_token_accuracy": 0.8514304161071777, - "num_tokens": 24349329.0, - "step": 2722 - }, - { - "epoch": 2.0691489361702127, - "grad_norm": 2.404534339904785, - "learning_rate": 1.2067244015389829e-06, - "loss": 0.4461793303489685, - "mean_token_accuracy": 0.8531662821769714, - "num_tokens": 24356287.0, - "step": 2723 - }, - { - "epoch": 2.069908814589666, - "grad_norm": 1.813341498374939, - "learning_rate": 1.204932476567175e-06, - "loss": 0.38300177454948425, - "mean_token_accuracy": 0.8597674369812012, - "num_tokens": 24366181.0, - "step": 2724 - }, - { - "epoch": 2.0706686930091185, - "grad_norm": 3.49125337600708, - "learning_rate": 1.2031414605236066e-06, - "loss": 0.33281540870666504, - "mean_token_accuracy": 0.8774969577789307, - "num_tokens": 24370362.0, - "step": 2725 - }, - { - "epoch": 2.0714285714285716, - "grad_norm": 1.7682114839553833, - "learning_rate": 1.2013513546652827e-06, - "loss": 0.3001813590526581, - "mean_token_accuracy": 0.8840254545211792, - "num_tokens": 24380469.0, - "step": 2726 - }, - { - "epoch": 2.0721884498480243, - "grad_norm": 2.3688952922821045, - "learning_rate": 1.1995621602485685e-06, - "loss": 0.20055249333381653, - "mean_token_accuracy": 0.9246129989624023, - "num_tokens": 24385474.0, - "step": 2727 - }, - { - "epoch": 2.072948328267477, - "grad_norm": 2.3368382453918457, - "learning_rate": 1.1977738785291894e-06, - "loss": 0.18379954993724823, - "mean_token_accuracy": 0.9385529160499573, - "num_tokens": 24390002.0, - "step": 2728 - }, - { - "epoch": 2.07370820668693, - "grad_norm": 1.857473373413086, - "learning_rate": 1.1959865107622306e-06, - "loss": 0.4606894552707672, - "mean_token_accuracy": 0.8437427282333374, - "num_tokens": 24400880.0, - "step": 2729 - }, - { - "epoch": 2.074468085106383, - "grad_norm": 1.2714136838912964, - "learning_rate": 1.1942000582021355e-06, - "loss": 0.21171459555625916, - "mean_token_accuracy": 0.9216019511222839, - "num_tokens": 24413113.0, - "step": 2730 - }, - { - "epoch": 2.075227963525836, - "grad_norm": 2.2025210857391357, - "learning_rate": 1.1924145221027048e-06, - "loss": 0.44211941957473755, - "mean_token_accuracy": 0.8538386821746826, - "num_tokens": 24420504.0, - "step": 2731 - }, - { - "epoch": 2.0759878419452886, - "grad_norm": 1.6706589460372925, - "learning_rate": 1.190629903717097e-06, - "loss": 0.35163265466690063, - "mean_token_accuracy": 0.8716240525245667, - "num_tokens": 24430203.0, - "step": 2732 - }, - { - "epoch": 2.0767477203647418, - "grad_norm": 2.299182176589966, - "learning_rate": 1.1888462042978268e-06, - "loss": 0.30983975529670715, - "mean_token_accuracy": 0.8859797716140747, - "num_tokens": 24437387.0, - "step": 2733 - }, - { - "epoch": 2.0775075987841944, - "grad_norm": 2.975123167037964, - "learning_rate": 1.1870634250967606e-06, - "loss": 0.23585952818393707, - "mean_token_accuracy": 0.9167368412017822, - "num_tokens": 24441176.0, - "step": 2734 - }, - { - "epoch": 2.0782674772036476, - "grad_norm": 1.1052464246749878, - "learning_rate": 1.1852815673651246e-06, - "loss": 0.24136316776275635, - "mean_token_accuracy": 0.8897353410720825, - "num_tokens": 24457092.0, - "step": 2735 - }, - { - "epoch": 2.0790273556231003, - "grad_norm": 1.5531870126724243, - "learning_rate": 1.1835006323534926e-06, - "loss": 0.302223265171051, - "mean_token_accuracy": 0.8940514326095581, - "num_tokens": 24467643.0, - "step": 2736 - }, - { - "epoch": 2.0797872340425534, - "grad_norm": 1.706140398979187, - "learning_rate": 1.1817206213117943e-06, - "loss": 0.39235255122184753, - "mean_token_accuracy": 0.8615218997001648, - "num_tokens": 24477715.0, - "step": 2737 - }, - { - "epoch": 2.080547112462006, - "grad_norm": 2.1109750270843506, - "learning_rate": 1.1799415354893103e-06, - "loss": 0.2526751756668091, - "mean_token_accuracy": 0.9108465909957886, - "num_tokens": 24484248.0, - "step": 2738 - }, - { - "epoch": 2.0813069908814588, - "grad_norm": 1.9943277835845947, - "learning_rate": 1.178163376134671e-06, - "loss": 0.3540172874927521, - "mean_token_accuracy": 0.9131139516830444, - "num_tokens": 24492207.0, - "step": 2739 - }, - { - "epoch": 2.082066869300912, - "grad_norm": 1.9536099433898926, - "learning_rate": 1.1763861444958573e-06, - "loss": 0.3902950584888458, - "mean_token_accuracy": 0.8611530065536499, - "num_tokens": 24501567.0, - "step": 2740 - }, - { - "epoch": 2.0828267477203646, - "grad_norm": 3.146925926208496, - "learning_rate": 1.1746098418201987e-06, - "loss": 0.43440669775009155, - "mean_token_accuracy": 0.8709320425987244, - "num_tokens": 24506684.0, - "step": 2741 - }, - { - "epoch": 2.0835866261398177, - "grad_norm": 2.763427495956421, - "learning_rate": 1.172834469354373e-06, - "loss": 0.3513452410697937, - "mean_token_accuracy": 0.8774256110191345, - "num_tokens": 24511509.0, - "step": 2742 - }, - { - "epoch": 2.0843465045592704, - "grad_norm": 2.773829221725464, - "learning_rate": 1.1710600283444048e-06, - "loss": 0.24668049812316895, - "mean_token_accuracy": 0.9146889448165894, - "num_tokens": 24516030.0, - "step": 2743 - }, - { - "epoch": 2.0851063829787235, - "grad_norm": 1.666471242904663, - "learning_rate": 1.169286520035666e-06, - "loss": 0.36206915974617004, - "mean_token_accuracy": 0.8711973428726196, - "num_tokens": 24526656.0, - "step": 2744 - }, - { - "epoch": 2.085866261398176, - "grad_norm": 2.818890333175659, - "learning_rate": 1.1675139456728702e-06, - "loss": 0.32967281341552734, - "mean_token_accuracy": 0.880983829498291, - "num_tokens": 24531625.0, - "step": 2745 - }, - { - "epoch": 2.0866261398176293, - "grad_norm": 1.09058678150177, - "learning_rate": 1.1657423065000811e-06, - "loss": 0.36224377155303955, - "mean_token_accuracy": 0.8708326816558838, - "num_tokens": 24557123.0, - "step": 2746 - }, - { - "epoch": 2.087386018237082, - "grad_norm": 1.1434987783432007, - "learning_rate": 1.1639716037607036e-06, - "loss": 0.26490458846092224, - "mean_token_accuracy": 0.9131897687911987, - "num_tokens": 24573223.0, - "step": 2747 - }, - { - "epoch": 2.088145896656535, - "grad_norm": 2.437505006790161, - "learning_rate": 1.1622018386974829e-06, - "loss": 0.18964408338069916, - "mean_token_accuracy": 0.9271818399429321, - "num_tokens": 24578306.0, - "step": 2748 - }, - { - "epoch": 2.088905775075988, - "grad_norm": 1.797308325767517, - "learning_rate": 1.160433012552508e-06, - "loss": 0.3090781569480896, - "mean_token_accuracy": 0.8960750102996826, - "num_tokens": 24587562.0, - "step": 2749 - }, - { - "epoch": 2.0896656534954405, - "grad_norm": 2.4050841331481934, - "learning_rate": 1.1586651265672122e-06, - "loss": 0.4001041054725647, - "mean_token_accuracy": 0.8588370084762573, - "num_tokens": 24594223.0, - "step": 2750 - }, - { - "epoch": 2.0904255319148937, - "grad_norm": 1.8757156133651733, - "learning_rate": 1.1568981819823636e-06, - "loss": 0.37845075130462646, - "mean_token_accuracy": 0.866146445274353, - "num_tokens": 24602556.0, - "step": 2751 - }, - { - "epoch": 2.0911854103343464, - "grad_norm": 1.8205114603042603, - "learning_rate": 1.1551321800380722e-06, - "loss": 0.24738016724586487, - "mean_token_accuracy": 0.923284113407135, - "num_tokens": 24611627.0, - "step": 2752 - }, - { - "epoch": 2.0919452887537995, - "grad_norm": 2.107512950897217, - "learning_rate": 1.153367121973786e-06, - "loss": 0.3062688410282135, - "mean_token_accuracy": 0.8909003734588623, - "num_tokens": 24619569.0, - "step": 2753 - }, - { - "epoch": 2.092705167173252, - "grad_norm": 1.93110191822052, - "learning_rate": 1.1516030090282915e-06, - "loss": 0.38658422231674194, - "mean_token_accuracy": 0.869437038898468, - "num_tokens": 24628869.0, - "step": 2754 - }, - { - "epoch": 2.0934650455927053, - "grad_norm": 2.3618004322052, - "learning_rate": 1.1498398424397106e-06, - "loss": 0.19193072617053986, - "mean_token_accuracy": 0.9329519271850586, - "num_tokens": 24633724.0, - "step": 2755 - }, - { - "epoch": 2.094224924012158, - "grad_norm": 2.274510622024536, - "learning_rate": 1.1480776234455024e-06, - "loss": 0.24939998984336853, - "mean_token_accuracy": 0.9104958772659302, - "num_tokens": 24642762.0, - "step": 2756 - }, - { - "epoch": 2.094984802431611, - "grad_norm": 1.7468934059143066, - "learning_rate": 1.1463163532824572e-06, - "loss": 0.3876607418060303, - "mean_token_accuracy": 0.8540539145469666, - "num_tokens": 24652138.0, - "step": 2757 - }, - { - "epoch": 2.095744680851064, - "grad_norm": 2.905381441116333, - "learning_rate": 1.1445560331867054e-06, - "loss": 0.33666878938674927, - "mean_token_accuracy": 0.8805598616600037, - "num_tokens": 24656612.0, - "step": 2758 - }, - { - "epoch": 2.096504559270517, - "grad_norm": 1.5513007640838623, - "learning_rate": 1.142796664393707e-06, - "loss": 0.25168463587760925, - "mean_token_accuracy": 0.925534725189209, - "num_tokens": 24667132.0, - "step": 2759 - }, - { - "epoch": 2.0972644376899696, - "grad_norm": 1.6804249286651611, - "learning_rate": 1.141038248138253e-06, - "loss": 0.3862859010696411, - "mean_token_accuracy": 0.8686253428459167, - "num_tokens": 24679274.0, - "step": 2760 - }, - { - "epoch": 2.0980243161094223, - "grad_norm": 1.7432880401611328, - "learning_rate": 1.1392807856544682e-06, - "loss": 0.3200700879096985, - "mean_token_accuracy": 0.9188123941421509, - "num_tokens": 24688628.0, - "step": 2761 - }, - { - "epoch": 2.0987841945288754, - "grad_norm": 1.8734468221664429, - "learning_rate": 1.1375242781758077e-06, - "loss": 0.34758424758911133, - "mean_token_accuracy": 0.8724187016487122, - "num_tokens": 24698159.0, - "step": 2762 - }, - { - "epoch": 2.099544072948328, - "grad_norm": 3.7156829833984375, - "learning_rate": 1.1357687269350564e-06, - "loss": 0.30014732480049133, - "mean_token_accuracy": 0.9021577835083008, - "num_tokens": 24701797.0, - "step": 2763 - }, - { - "epoch": 2.1003039513677813, - "grad_norm": 1.5196985006332397, - "learning_rate": 1.1340141331643276e-06, - "loss": 0.45747464895248413, - "mean_token_accuracy": 0.839891791343689, - "num_tokens": 24716468.0, - "step": 2764 - }, - { - "epoch": 2.101063829787234, - "grad_norm": 1.978009581565857, - "learning_rate": 1.132260498095062e-06, - "loss": 0.3130183815956116, - "mean_token_accuracy": 0.90610271692276, - "num_tokens": 24723211.0, - "step": 2765 - }, - { - "epoch": 2.101823708206687, - "grad_norm": 1.5883251428604126, - "learning_rate": 1.1305078229580294e-06, - "loss": 0.30493029952049255, - "mean_token_accuracy": 0.8889745473861694, - "num_tokens": 24733839.0, - "step": 2766 - }, - { - "epoch": 2.1025835866261398, - "grad_norm": 1.2397783994674683, - "learning_rate": 1.128756108983325e-06, - "loss": 0.2606407105922699, - "mean_token_accuracy": 0.9061247110366821, - "num_tokens": 24747488.0, - "step": 2767 - }, - { - "epoch": 2.103343465045593, - "grad_norm": 1.3046784400939941, - "learning_rate": 1.1270053574003658e-06, - "loss": 0.38750404119491577, - "mean_token_accuracy": 0.8777017593383789, - "num_tokens": 24763893.0, - "step": 2768 - }, - { - "epoch": 2.1041033434650456, - "grad_norm": 1.499266266822815, - "learning_rate": 1.1252555694379005e-06, - "loss": 0.4804937243461609, - "mean_token_accuracy": 0.8344086408615112, - "num_tokens": 24779323.0, - "step": 2769 - }, - { - "epoch": 2.1048632218844983, - "grad_norm": 1.211094856262207, - "learning_rate": 1.123506746323997e-06, - "loss": 0.3579246997833252, - "mean_token_accuracy": 0.8705919981002808, - "num_tokens": 24794965.0, - "step": 2770 - }, - { - "epoch": 2.1056231003039514, - "grad_norm": 2.490551471710205, - "learning_rate": 1.1217588892860446e-06, - "loss": 0.4084790349006653, - "mean_token_accuracy": 0.8553222417831421, - "num_tokens": 24800614.0, - "step": 2771 - }, - { - "epoch": 2.106382978723404, - "grad_norm": 1.5249632596969604, - "learning_rate": 1.1200119995507572e-06, - "loss": 0.36853182315826416, - "mean_token_accuracy": 0.8847414255142212, - "num_tokens": 24812886.0, - "step": 2772 - }, - { - "epoch": 2.107142857142857, - "grad_norm": 1.8510968685150146, - "learning_rate": 1.1182660783441719e-06, - "loss": 0.2918103337287903, - "mean_token_accuracy": 0.8898224830627441, - "num_tokens": 24821545.0, - "step": 2773 - }, - { - "epoch": 2.10790273556231, - "grad_norm": 1.7721803188323975, - "learning_rate": 1.11652112689164e-06, - "loss": 0.2920452654361725, - "mean_token_accuracy": 0.8879085779190063, - "num_tokens": 24831526.0, - "step": 2774 - }, - { - "epoch": 2.108662613981763, - "grad_norm": 1.3987336158752441, - "learning_rate": 1.1147771464178378e-06, - "loss": 0.4407062828540802, - "mean_token_accuracy": 0.8472493886947632, - "num_tokens": 24845847.0, - "step": 2775 - }, - { - "epoch": 2.1094224924012157, - "grad_norm": 1.8927375078201294, - "learning_rate": 1.1130341381467569e-06, - "loss": 0.36293038725852966, - "mean_token_accuracy": 0.8881135582923889, - "num_tokens": 24854760.0, - "step": 2776 - }, - { - "epoch": 2.110182370820669, - "grad_norm": 3.0480666160583496, - "learning_rate": 1.111292103301708e-06, - "loss": 0.30395108461380005, - "mean_token_accuracy": 0.9036306142807007, - "num_tokens": 24859051.0, - "step": 2777 - }, - { - "epoch": 2.1109422492401215, - "grad_norm": 1.5833618640899658, - "learning_rate": 1.1095510431053176e-06, - "loss": 0.26424330472946167, - "mean_token_accuracy": 0.9020674824714661, - "num_tokens": 24869853.0, - "step": 2778 - }, - { - "epoch": 2.1117021276595747, - "grad_norm": 1.645459532737732, - "learning_rate": 1.1078109587795311e-06, - "loss": 0.3563994765281677, - "mean_token_accuracy": 0.8732106685638428, - "num_tokens": 24880184.0, - "step": 2779 - }, - { - "epoch": 2.1124620060790273, - "grad_norm": 2.2964093685150146, - "learning_rate": 1.1060718515456022e-06, - "loss": 0.19739922881126404, - "mean_token_accuracy": 0.9273765087127686, - "num_tokens": 24885398.0, - "step": 2780 - }, - { - "epoch": 2.11322188449848, - "grad_norm": 2.094024181365967, - "learning_rate": 1.1043337226241075e-06, - "loss": 0.3321923315525055, - "mean_token_accuracy": 0.8865819573402405, - "num_tokens": 24893908.0, - "step": 2781 - }, - { - "epoch": 2.113981762917933, - "grad_norm": 1.9787025451660156, - "learning_rate": 1.1025965732349318e-06, - "loss": 0.37631168961524963, - "mean_token_accuracy": 0.8808693885803223, - "num_tokens": 24901270.0, - "step": 2782 - }, - { - "epoch": 2.114741641337386, - "grad_norm": 2.376060724258423, - "learning_rate": 1.100860404597271e-06, - "loss": 0.2591894268989563, - "mean_token_accuracy": 0.9174780249595642, - "num_tokens": 24906578.0, - "step": 2783 - }, - { - "epoch": 2.115501519756839, - "grad_norm": 1.0967903137207031, - "learning_rate": 1.0991252179296389e-06, - "loss": 0.26626938581466675, - "mean_token_accuracy": 0.9305505752563477, - "num_tokens": 24922329.0, - "step": 2784 - }, - { - "epoch": 2.1162613981762917, - "grad_norm": 3.3701183795928955, - "learning_rate": 1.0973910144498534e-06, - "loss": 0.2710079848766327, - "mean_token_accuracy": 0.9095271825790405, - "num_tokens": 24925777.0, - "step": 2785 - }, - { - "epoch": 2.117021276595745, - "grad_norm": 1.636264681816101, - "learning_rate": 1.0956577953750461e-06, - "loss": 0.2995981276035309, - "mean_token_accuracy": 0.8988568782806396, - "num_tokens": 24934230.0, - "step": 2786 - }, - { - "epoch": 2.1177811550151975, - "grad_norm": 2.3107731342315674, - "learning_rate": 1.093925561921657e-06, - "loss": 0.3424459397792816, - "mean_token_accuracy": 0.9100210070610046, - "num_tokens": 24939830.0, - "step": 2787 - }, - { - "epoch": 2.1185410334346506, - "grad_norm": 1.814764380455017, - "learning_rate": 1.0921943153054343e-06, - "loss": 0.3182154893875122, - "mean_token_accuracy": 0.883027195930481, - "num_tokens": 24947764.0, - "step": 2788 - }, - { - "epoch": 2.1193009118541033, - "grad_norm": 1.693555235862732, - "learning_rate": 1.0904640567414332e-06, - "loss": 0.3685447573661804, - "mean_token_accuracy": 0.8900846242904663, - "num_tokens": 24957680.0, - "step": 2789 - }, - { - "epoch": 2.1200607902735564, - "grad_norm": 1.0726022720336914, - "learning_rate": 1.088734787444017e-06, - "loss": 0.28461548686027527, - "mean_token_accuracy": 0.9026681184768677, - "num_tokens": 24975181.0, - "step": 2790 - }, - { - "epoch": 2.120820668693009, - "grad_norm": 1.3013874292373657, - "learning_rate": 1.0870065086268506e-06, - "loss": 0.28222548961639404, - "mean_token_accuracy": 0.9041857719421387, - "num_tokens": 24993211.0, - "step": 2791 - }, - { - "epoch": 2.121580547112462, - "grad_norm": 2.592106580734253, - "learning_rate": 1.085279221502909e-06, - "loss": 0.31733593344688416, - "mean_token_accuracy": 0.90151047706604, - "num_tokens": 24998151.0, - "step": 2792 - }, - { - "epoch": 2.122340425531915, - "grad_norm": 2.649210214614868, - "learning_rate": 1.0835529272844694e-06, - "loss": 0.341595321893692, - "mean_token_accuracy": 0.8989696502685547, - "num_tokens": 25003399.0, - "step": 2793 - }, - { - "epoch": 2.1231003039513676, - "grad_norm": 2.376619577407837, - "learning_rate": 1.0818276271831094e-06, - "loss": 0.2770065665245056, - "mean_token_accuracy": 0.8967875242233276, - "num_tokens": 25009686.0, - "step": 2794 - }, - { - "epoch": 2.1238601823708207, - "grad_norm": 2.1539604663848877, - "learning_rate": 1.080103322409711e-06, - "loss": 0.37501147389411926, - "mean_token_accuracy": 0.8768513202667236, - "num_tokens": 25016339.0, - "step": 2795 - }, - { - "epoch": 2.1246200607902734, - "grad_norm": 2.5727670192718506, - "learning_rate": 1.0783800141744607e-06, - "loss": 0.31852903962135315, - "mean_token_accuracy": 0.8897477388381958, - "num_tokens": 25021410.0, - "step": 2796 - }, - { - "epoch": 2.1253799392097266, - "grad_norm": 2.1428916454315186, - "learning_rate": 1.0766577036868395e-06, - "loss": 0.2348000407218933, - "mean_token_accuracy": 0.9012142419815063, - "num_tokens": 25027375.0, - "step": 2797 - }, - { - "epoch": 2.1261398176291793, - "grad_norm": 2.4231064319610596, - "learning_rate": 1.074936392155631e-06, - "loss": 0.30580806732177734, - "mean_token_accuracy": 0.8963108658790588, - "num_tokens": 25033211.0, - "step": 2798 - }, - { - "epoch": 2.1268996960486324, - "grad_norm": 2.1027259826660156, - "learning_rate": 1.073216080788921e-06, - "loss": 0.2508814334869385, - "mean_token_accuracy": 0.9095165729522705, - "num_tokens": 25040316.0, - "step": 2799 - }, - { - "epoch": 2.127659574468085, - "grad_norm": 1.6513079404830933, - "learning_rate": 1.0714967707940876e-06, - "loss": 0.40694183111190796, - "mean_token_accuracy": 0.8895826935768127, - "num_tokens": 25054978.0, - "step": 2800 - }, - { - "epoch": 2.128419452887538, - "grad_norm": 2.0551133155822754, - "learning_rate": 1.0697784633778093e-06, - "loss": 0.3452662229537964, - "mean_token_accuracy": 0.8710684776306152, - "num_tokens": 25062755.0, - "step": 2801 - }, - { - "epoch": 2.129179331306991, - "grad_norm": 2.1780688762664795, - "learning_rate": 1.0680611597460607e-06, - "loss": 0.2918209135532379, - "mean_token_accuracy": 0.8689337968826294, - "num_tokens": 25069453.0, - "step": 2802 - }, - { - "epoch": 2.1299392097264436, - "grad_norm": 1.7905635833740234, - "learning_rate": 1.0663448611041114e-06, - "loss": 0.3535313308238983, - "mean_token_accuracy": 0.8762770295143127, - "num_tokens": 25080004.0, - "step": 2803 - }, - { - "epoch": 2.1306990881458967, - "grad_norm": 1.6187241077423096, - "learning_rate": 1.0646295686565258e-06, - "loss": 0.3042716681957245, - "mean_token_accuracy": 0.884156346321106, - "num_tokens": 25089652.0, - "step": 2804 - }, - { - "epoch": 2.1314589665653494, - "grad_norm": 2.667459011077881, - "learning_rate": 1.0629152836071633e-06, - "loss": 0.3904019892215729, - "mean_token_accuracy": 0.8603606224060059, - "num_tokens": 25095556.0, - "step": 2805 - }, - { - "epoch": 2.1322188449848025, - "grad_norm": 1.4227970838546753, - "learning_rate": 1.0612020071591722e-06, - "loss": 0.3765299320220947, - "mean_token_accuracy": 0.8655093908309937, - "num_tokens": 25108963.0, - "step": 2806 - }, - { - "epoch": 2.132978723404255, - "grad_norm": 2.262726068496704, - "learning_rate": 1.0594897405149994e-06, - "loss": 0.2727298140525818, - "mean_token_accuracy": 0.9005513191223145, - "num_tokens": 25115135.0, - "step": 2807 - }, - { - "epoch": 2.1337386018237083, - "grad_norm": 2.0810186862945557, - "learning_rate": 1.0577784848763773e-06, - "loss": 0.4001343250274658, - "mean_token_accuracy": 0.8537896871566772, - "num_tokens": 25123079.0, - "step": 2808 - }, - { - "epoch": 2.134498480243161, - "grad_norm": 1.6573376655578613, - "learning_rate": 1.0560682414443315e-06, - "loss": 0.4197486340999603, - "mean_token_accuracy": 0.8549862504005432, - "num_tokens": 25135398.0, - "step": 2809 - }, - { - "epoch": 2.135258358662614, - "grad_norm": 2.200150489807129, - "learning_rate": 1.0543590114191768e-06, - "loss": 0.32026296854019165, - "mean_token_accuracy": 0.8797904253005981, - "num_tokens": 25141382.0, - "step": 2810 - }, - { - "epoch": 2.136018237082067, - "grad_norm": 2.678558111190796, - "learning_rate": 1.0526507960005164e-06, - "loss": 0.30048054456710815, - "mean_token_accuracy": 0.8849201202392578, - "num_tokens": 25146235.0, - "step": 2811 - }, - { - "epoch": 2.13677811550152, - "grad_norm": 1.5207500457763672, - "learning_rate": 1.0509435963872422e-06, - "loss": 0.3706427216529846, - "mean_token_accuracy": 0.8740214109420776, - "num_tokens": 25157108.0, - "step": 2812 - }, - { - "epoch": 2.1375379939209727, - "grad_norm": 1.4632720947265625, - "learning_rate": 1.049237413777532e-06, - "loss": 0.27156776189804077, - "mean_token_accuracy": 0.8950715661048889, - "num_tokens": 25167937.0, - "step": 2813 - }, - { - "epoch": 2.1382978723404253, - "grad_norm": 2.101048469543457, - "learning_rate": 1.0475322493688506e-06, - "loss": 0.366736501455307, - "mean_token_accuracy": 0.8700850009918213, - "num_tokens": 25177043.0, - "step": 2814 - }, - { - "epoch": 2.1390577507598785, - "grad_norm": 2.54221248626709, - "learning_rate": 1.0458281043579482e-06, - "loss": 0.20383943617343903, - "mean_token_accuracy": 0.9226665496826172, - "num_tokens": 25182105.0, - "step": 2815 - }, - { - "epoch": 2.139817629179331, - "grad_norm": 1.7742674350738525, - "learning_rate": 1.04412497994086e-06, - "loss": 0.26852455735206604, - "mean_token_accuracy": 0.8987031579017639, - "num_tokens": 25190178.0, - "step": 2816 - }, - { - "epoch": 2.1405775075987843, - "grad_norm": 3.2856075763702393, - "learning_rate": 1.0424228773129019e-06, - "loss": 0.24643859267234802, - "mean_token_accuracy": 0.9189155101776123, - "num_tokens": 25194105.0, - "step": 2817 - }, - { - "epoch": 2.141337386018237, - "grad_norm": 3.374311923980713, - "learning_rate": 1.0407217976686777e-06, - "loss": 0.2575511336326599, - "mean_token_accuracy": 0.9143530130386353, - "num_tokens": 25197787.0, - "step": 2818 - }, - { - "epoch": 2.14209726443769, - "grad_norm": 1.4967217445373535, - "learning_rate": 1.03902174220207e-06, - "loss": 0.3054750859737396, - "mean_token_accuracy": 0.8989205360412598, - "num_tokens": 25209150.0, - "step": 2819 - }, - { - "epoch": 2.142857142857143, - "grad_norm": 2.654459238052368, - "learning_rate": 1.0373227121062423e-06, - "loss": 0.27398061752319336, - "mean_token_accuracy": 0.9181102514266968, - "num_tokens": 25214015.0, - "step": 2820 - }, - { - "epoch": 2.143617021276596, - "grad_norm": 1.3205828666687012, - "learning_rate": 1.0356247085736388e-06, - "loss": 0.4085468053817749, - "mean_token_accuracy": 0.8745299577713013, - "num_tokens": 25230588.0, - "step": 2821 - }, - { - "epoch": 2.1443768996960486, - "grad_norm": 1.6965736150741577, - "learning_rate": 1.0339277327959863e-06, - "loss": 0.27269643545150757, - "mean_token_accuracy": 0.9001271724700928, - "num_tokens": 25239298.0, - "step": 2822 - }, - { - "epoch": 2.1451367781155017, - "grad_norm": 2.789114236831665, - "learning_rate": 1.0322317859642852e-06, - "loss": 0.2319176197052002, - "mean_token_accuracy": 0.9237110614776611, - "num_tokens": 25243286.0, - "step": 2823 - }, - { - "epoch": 2.1458966565349544, - "grad_norm": 1.8817718029022217, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.2917990982532501, - "mean_token_accuracy": 0.9211062788963318, - "num_tokens": 25250575.0, - "step": 2824 - }, - { - "epoch": 2.146656534954407, - "grad_norm": 2.1824984550476074, - "learning_rate": 1.0288429838991405e-06, - "loss": 0.39010798931121826, - "mean_token_accuracy": 0.8887852430343628, - "num_tokens": 25257947.0, - "step": 2825 - }, - { - "epoch": 2.1474164133738602, - "grad_norm": 1.302579641342163, - "learning_rate": 1.0271501310440882e-06, - "loss": 0.3511282503604889, - "mean_token_accuracy": 0.8728797435760498, - "num_tokens": 25272846.0, - "step": 2826 - }, - { - "epoch": 2.148176291793313, - "grad_norm": 1.691807746887207, - "learning_rate": 1.0254583118917699e-06, - "loss": 0.34246695041656494, - "mean_token_accuracy": 0.8743435144424438, - "num_tokens": 25283004.0, - "step": 2827 - }, - { - "epoch": 2.148936170212766, - "grad_norm": 1.2483569383621216, - "learning_rate": 1.0237675276295709e-06, - "loss": 0.3346659243106842, - "mean_token_accuracy": 0.8823951482772827, - "num_tokens": 25297786.0, - "step": 2828 - }, - { - "epoch": 2.1496960486322187, - "grad_norm": 3.7242841720581055, - "learning_rate": 1.022077779444145e-06, - "loss": 0.25516486167907715, - "mean_token_accuracy": 0.9189130663871765, - "num_tokens": 25301524.0, - "step": 2829 - }, - { - "epoch": 2.150455927051672, - "grad_norm": 2.5851144790649414, - "learning_rate": 1.020389068521426e-06, - "loss": 0.3543069362640381, - "mean_token_accuracy": 0.8942399621009827, - "num_tokens": 25307277.0, - "step": 2830 - }, - { - "epoch": 2.1512158054711246, - "grad_norm": 1.3453631401062012, - "learning_rate": 1.018701396046616e-06, - "loss": 0.2900702953338623, - "mean_token_accuracy": 0.8847548365592957, - "num_tokens": 25321366.0, - "step": 2831 - }, - { - "epoch": 2.1519756838905777, - "grad_norm": 1.6905686855316162, - "learning_rate": 1.0170147632041858e-06, - "loss": 0.24844832718372345, - "mean_token_accuracy": 0.9167388677597046, - "num_tokens": 25328916.0, - "step": 2832 - }, - { - "epoch": 2.1527355623100304, - "grad_norm": 2.6469411849975586, - "learning_rate": 1.0153291711778825e-06, - "loss": 0.18566903471946716, - "mean_token_accuracy": 0.9346771836280823, - "num_tokens": 25332871.0, - "step": 2833 - }, - { - "epoch": 2.1534954407294835, - "grad_norm": 1.3880906105041504, - "learning_rate": 1.0136446211507175e-06, - "loss": 0.37413570284843445, - "mean_token_accuracy": 0.8685535788536072, - "num_tokens": 25347447.0, - "step": 2834 - }, - { - "epoch": 2.154255319148936, - "grad_norm": 1.1376656293869019, - "learning_rate": 1.0119611143049731e-06, - "loss": 0.2844143509864807, - "mean_token_accuracy": 0.8910006284713745, - "num_tokens": 25365930.0, - "step": 2835 - }, - { - "epoch": 2.155015197568389, - "grad_norm": 2.259666919708252, - "learning_rate": 1.0102786518221997e-06, - "loss": 0.3148176074028015, - "mean_token_accuracy": 0.8851165175437927, - "num_tokens": 25373047.0, - "step": 2836 - }, - { - "epoch": 2.155775075987842, - "grad_norm": 3.304095506668091, - "learning_rate": 1.0085972348832138e-06, - "loss": 0.2042517364025116, - "mean_token_accuracy": 0.9247308969497681, - "num_tokens": 25376348.0, - "step": 2837 - }, - { - "epoch": 2.1565349544072947, - "grad_norm": 1.9856120347976685, - "learning_rate": 1.0069168646680985e-06, - "loss": 0.3547414541244507, - "mean_token_accuracy": 0.8941285610198975, - "num_tokens": 25384675.0, - "step": 2838 - }, - { - "epoch": 2.157294832826748, - "grad_norm": 2.8482213020324707, - "learning_rate": 1.0052375423562038e-06, - "loss": 0.3530133366584778, - "mean_token_accuracy": 0.8789700269699097, - "num_tokens": 25389631.0, - "step": 2839 - }, - { - "epoch": 2.1580547112462005, - "grad_norm": 1.4270408153533936, - "learning_rate": 1.0035592691261395e-06, - "loss": 0.34078776836395264, - "mean_token_accuracy": 0.8648165464401245, - "num_tokens": 25403746.0, - "step": 2840 - }, - { - "epoch": 2.1588145896656536, - "grad_norm": 0.9342723488807678, - "learning_rate": 1.0018820461557852e-06, - "loss": 0.2615935504436493, - "mean_token_accuracy": 0.9082236289978027, - "num_tokens": 25424695.0, - "step": 2841 - }, - { - "epoch": 2.1595744680851063, - "grad_norm": 2.695632219314575, - "learning_rate": 1.0002058746222807e-06, - "loss": 0.2202145904302597, - "mean_token_accuracy": 0.9221563339233398, - "num_tokens": 25428783.0, - "step": 2842 - }, - { - "epoch": 2.1603343465045595, - "grad_norm": 1.5679794549942017, - "learning_rate": 9.985307557020257e-07, - "loss": 0.24275024235248566, - "mean_token_accuracy": 0.9363338351249695, - "num_tokens": 25439104.0, - "step": 2843 - }, - { - "epoch": 2.161094224924012, - "grad_norm": 1.5985528230667114, - "learning_rate": 9.968566905706833e-07, - "loss": 0.2541901171207428, - "mean_token_accuracy": 0.9040743112564087, - "num_tokens": 25448829.0, - "step": 2844 - }, - { - "epoch": 2.161854103343465, - "grad_norm": 2.6022164821624756, - "learning_rate": 9.951836804031795e-07, - "loss": 0.24492180347442627, - "mean_token_accuracy": 0.9109418392181396, - "num_tokens": 25453902.0, - "step": 2845 - }, - { - "epoch": 2.162613981762918, - "grad_norm": 1.6719969511032104, - "learning_rate": 9.935117263736943e-07, - "loss": 0.43255117535591125, - "mean_token_accuracy": 0.868374228477478, - "num_tokens": 25465538.0, - "step": 2846 - }, - { - "epoch": 2.1633738601823707, - "grad_norm": 1.8284894227981567, - "learning_rate": 9.918408296556706e-07, - "loss": 0.32285982370376587, - "mean_token_accuracy": 0.9016412496566772, - "num_tokens": 25473721.0, - "step": 2847 - }, - { - "epoch": 2.164133738601824, - "grad_norm": 1.4488024711608887, - "learning_rate": 9.90170991421808e-07, - "loss": 0.35639309883117676, - "mean_token_accuracy": 0.8861881494522095, - "num_tokens": 25487535.0, - "step": 2848 - }, - { - "epoch": 2.1648936170212765, - "grad_norm": 2.089930534362793, - "learning_rate": 9.88502212844063e-07, - "loss": 0.2588546574115753, - "mean_token_accuracy": 0.9029642939567566, - "num_tokens": 25494567.0, - "step": 2849 - }, - { - "epoch": 2.1656534954407296, - "grad_norm": 1.1274315118789673, - "learning_rate": 9.86834495093649e-07, - "loss": 0.37268880009651184, - "mean_token_accuracy": 0.859347939491272, - "num_tokens": 25518278.0, - "step": 2850 - }, - { - "epoch": 2.1664133738601823, - "grad_norm": 2.3886640071868896, - "learning_rate": 9.851678393410343e-07, - "loss": 0.34938913583755493, - "mean_token_accuracy": 0.8724287748336792, - "num_tokens": 25524001.0, - "step": 2851 - }, - { - "epoch": 2.1671732522796354, - "grad_norm": 2.521230459213257, - "learning_rate": 9.83502246755942e-07, - "loss": 0.34781408309936523, - "mean_token_accuracy": 0.8970093131065369, - "num_tokens": 25529982.0, - "step": 2852 - }, - { - "epoch": 2.167933130699088, - "grad_norm": 2.467618942260742, - "learning_rate": 9.818377185073493e-07, - "loss": 0.29725387692451477, - "mean_token_accuracy": 0.8991899490356445, - "num_tokens": 25535356.0, - "step": 2853 - }, - { - "epoch": 2.1686930091185412, - "grad_norm": 2.335873603820801, - "learning_rate": 9.801742557634872e-07, - "loss": 0.39603036642074585, - "mean_token_accuracy": 0.8755916357040405, - "num_tokens": 25542526.0, - "step": 2854 - }, - { - "epoch": 2.169452887537994, - "grad_norm": 1.8388596773147583, - "learning_rate": 9.78511859691835e-07, - "loss": 0.3414672017097473, - "mean_token_accuracy": 0.8951467275619507, - "num_tokens": 25551904.0, - "step": 2855 - }, - { - "epoch": 2.1702127659574466, - "grad_norm": 1.86272394657135, - "learning_rate": 9.768505314591295e-07, - "loss": 0.45748448371887207, - "mean_token_accuracy": 0.8614354133605957, - "num_tokens": 25562197.0, - "step": 2856 - }, - { - "epoch": 2.1709726443768997, - "grad_norm": 1.9142264127731323, - "learning_rate": 9.751902722313527e-07, - "loss": 0.20877259969711304, - "mean_token_accuracy": 0.9316688179969788, - "num_tokens": 25569403.0, - "step": 2857 - }, - { - "epoch": 2.1717325227963524, - "grad_norm": 2.1138272285461426, - "learning_rate": 9.73531083173739e-07, - "loss": 0.37058722972869873, - "mean_token_accuracy": 0.8654135465621948, - "num_tokens": 25577200.0, - "step": 2858 - }, - { - "epoch": 2.1724924012158056, - "grad_norm": 1.973467469215393, - "learning_rate": 9.718729654507713e-07, - "loss": 0.4106993079185486, - "mean_token_accuracy": 0.8958662152290344, - "num_tokens": 25585694.0, - "step": 2859 - }, - { - "epoch": 2.1732522796352582, - "grad_norm": 1.957513451576233, - "learning_rate": 9.702159202261802e-07, - "loss": 0.2067333608865738, - "mean_token_accuracy": 0.9413473606109619, - "num_tokens": 25591604.0, - "step": 2860 - }, - { - "epoch": 2.1740121580547114, - "grad_norm": 2.7639806270599365, - "learning_rate": 9.685599486629444e-07, - "loss": 0.3446827232837677, - "mean_token_accuracy": 0.8837845325469971, - "num_tokens": 25596528.0, - "step": 2861 - }, - { - "epoch": 2.174772036474164, - "grad_norm": 2.483734607696533, - "learning_rate": 9.669050519232875e-07, - "loss": 0.21230249106884003, - "mean_token_accuracy": 0.9334918856620789, - "num_tokens": 25601182.0, - "step": 2862 - }, - { - "epoch": 2.175531914893617, - "grad_norm": 1.7194870710372925, - "learning_rate": 9.65251231168681e-07, - "loss": 0.2657586932182312, - "mean_token_accuracy": 0.9035707712173462, - "num_tokens": 25610561.0, - "step": 2863 - }, - { - "epoch": 2.17629179331307, - "grad_norm": 2.6709611415863037, - "learning_rate": 9.63598487559839e-07, - "loss": 0.3673030138015747, - "mean_token_accuracy": 0.8976202011108398, - "num_tokens": 25615822.0, - "step": 2864 - }, - { - "epoch": 2.1770516717325226, - "grad_norm": 1.6646889448165894, - "learning_rate": 9.619468222567216e-07, - "loss": 0.2796666622161865, - "mean_token_accuracy": 0.8698215484619141, - "num_tokens": 25626148.0, - "step": 2865 - }, - { - "epoch": 2.1778115501519757, - "grad_norm": 1.8341799974441528, - "learning_rate": 9.602962364185286e-07, - "loss": 0.44835132360458374, - "mean_token_accuracy": 0.84391850233078, - "num_tokens": 25636305.0, - "step": 2866 - }, - { - "epoch": 2.1785714285714284, - "grad_norm": 2.3579823970794678, - "learning_rate": 9.586467312037076e-07, - "loss": 0.2875673472881317, - "mean_token_accuracy": 0.889403223991394, - "num_tokens": 25642593.0, - "step": 2867 - }, - { - "epoch": 2.1793313069908815, - "grad_norm": 1.1284339427947998, - "learning_rate": 9.569983077699447e-07, - "loss": 0.3402171730995178, - "mean_token_accuracy": 0.8795222043991089, - "num_tokens": 25663734.0, - "step": 2868 - }, - { - "epoch": 2.180091185410334, - "grad_norm": 1.4705578088760376, - "learning_rate": 9.553509672741646e-07, - "loss": 0.4216107726097107, - "mean_token_accuracy": 0.845354437828064, - "num_tokens": 25678197.0, - "step": 2869 - }, - { - "epoch": 2.1808510638297873, - "grad_norm": 2.6181085109710693, - "learning_rate": 9.53704710872535e-07, - "loss": 0.2777765393257141, - "mean_token_accuracy": 0.8884872198104858, - "num_tokens": 25683808.0, - "step": 2870 - }, - { - "epoch": 2.18161094224924, - "grad_norm": 2.7285003662109375, - "learning_rate": 9.520595397204643e-07, - "loss": 0.33339786529541016, - "mean_token_accuracy": 0.8892828226089478, - "num_tokens": 25690125.0, - "step": 2871 - }, - { - "epoch": 2.182370820668693, - "grad_norm": 2.200571298599243, - "learning_rate": 9.504154549725944e-07, - "loss": 0.46546393632888794, - "mean_token_accuracy": 0.8389996290206909, - "num_tokens": 25697279.0, - "step": 2872 - }, - { - "epoch": 2.183130699088146, - "grad_norm": 3.491392135620117, - "learning_rate": 9.487724577828081e-07, - "loss": 0.17026299238204956, - "mean_token_accuracy": 0.9410334825515747, - "num_tokens": 25700263.0, - "step": 2873 - }, - { - "epoch": 2.183890577507599, - "grad_norm": 2.7800233364105225, - "learning_rate": 9.471305493042243e-07, - "loss": 0.2309894859790802, - "mean_token_accuracy": 0.9233936071395874, - "num_tokens": 25704486.0, - "step": 2874 - }, - { - "epoch": 2.1846504559270516, - "grad_norm": 2.6505582332611084, - "learning_rate": 9.454897306891972e-07, - "loss": 0.4378674328327179, - "mean_token_accuracy": 0.8846660852432251, - "num_tokens": 25710115.0, - "step": 2875 - }, - { - "epoch": 2.1854103343465043, - "grad_norm": 1.5393849611282349, - "learning_rate": 9.438500030893166e-07, - "loss": 0.42081019282341003, - "mean_token_accuracy": 0.8672939538955688, - "num_tokens": 25724598.0, - "step": 2876 - }, - { - "epoch": 2.1861702127659575, - "grad_norm": 1.911198377609253, - "learning_rate": 9.422113676554073e-07, - "loss": 0.19115394353866577, - "mean_token_accuracy": 0.9201297163963318, - "num_tokens": 25731040.0, - "step": 2877 - }, - { - "epoch": 2.18693009118541, - "grad_norm": 1.371443748474121, - "learning_rate": 9.405738255375243e-07, - "loss": 0.3639947772026062, - "mean_token_accuracy": 0.8653393983840942, - "num_tokens": 25745335.0, - "step": 2878 - }, - { - "epoch": 2.1876899696048633, - "grad_norm": 3.216238498687744, - "learning_rate": 9.389373778849612e-07, - "loss": 0.2623414397239685, - "mean_token_accuracy": 0.9046015739440918, - "num_tokens": 25749223.0, - "step": 2879 - }, - { - "epoch": 2.188449848024316, - "grad_norm": 2.7558846473693848, - "learning_rate": 9.37302025846237e-07, - "loss": 0.31921297311782837, - "mean_token_accuracy": 0.8903186321258545, - "num_tokens": 25754341.0, - "step": 2880 - }, - { - "epoch": 2.189209726443769, - "grad_norm": 2.06365704536438, - "learning_rate": 9.356677705691058e-07, - "loss": 0.357482373714447, - "mean_token_accuracy": 0.8661626577377319, - "num_tokens": 25761199.0, - "step": 2881 - }, - { - "epoch": 2.189969604863222, - "grad_norm": 3.240328550338745, - "learning_rate": 9.340346132005507e-07, - "loss": 0.3157888650894165, - "mean_token_accuracy": 0.8948285579681396, - "num_tokens": 25765099.0, - "step": 2882 - }, - { - "epoch": 2.190729483282675, - "grad_norm": 1.4671967029571533, - "learning_rate": 9.324025548867849e-07, - "loss": 0.32077109813690186, - "mean_token_accuracy": 0.8813248872756958, - "num_tokens": 25777636.0, - "step": 2883 - }, - { - "epoch": 2.1914893617021276, - "grad_norm": 2.6475353240966797, - "learning_rate": 9.307715967732492e-07, - "loss": 0.35567623376846313, - "mean_token_accuracy": 0.8738130331039429, - "num_tokens": 25783737.0, - "step": 2884 - }, - { - "epoch": 2.1922492401215807, - "grad_norm": 1.791491150856018, - "learning_rate": 9.29141740004613e-07, - "loss": 0.2556282877922058, - "mean_token_accuracy": 0.9223519563674927, - "num_tokens": 25792069.0, - "step": 2885 - }, - { - "epoch": 2.1930091185410334, - "grad_norm": 2.3944389820098877, - "learning_rate": 9.275129857247722e-07, - "loss": 0.3145869970321655, - "mean_token_accuracy": 0.8938079476356506, - "num_tokens": 25798400.0, - "step": 2886 - }, - { - "epoch": 2.193768996960486, - "grad_norm": 2.0802059173583984, - "learning_rate": 9.258853350768499e-07, - "loss": 0.37343069911003113, - "mean_token_accuracy": 0.8705670833587646, - "num_tokens": 25806567.0, - "step": 2887 - }, - { - "epoch": 2.1945288753799392, - "grad_norm": 2.10831880569458, - "learning_rate": 9.242587892031945e-07, - "loss": 0.1989251971244812, - "mean_token_accuracy": 0.931064248085022, - "num_tokens": 25812715.0, - "step": 2888 - }, - { - "epoch": 2.195288753799392, - "grad_norm": 2.1305530071258545, - "learning_rate": 9.226333492453759e-07, - "loss": 0.29377204179763794, - "mean_token_accuracy": 0.8942701816558838, - "num_tokens": 25819988.0, - "step": 2889 - }, - { - "epoch": 2.196048632218845, - "grad_norm": 2.179025411605835, - "learning_rate": 9.210090163441928e-07, - "loss": 0.37565115094184875, - "mean_token_accuracy": 0.8700202703475952, - "num_tokens": 25827777.0, - "step": 2890 - }, - { - "epoch": 2.1968085106382977, - "grad_norm": 3.177180290222168, - "learning_rate": 9.19385791639665e-07, - "loss": 0.16646479070186615, - "mean_token_accuracy": 0.9426749348640442, - "num_tokens": 25831724.0, - "step": 2891 - }, - { - "epoch": 2.197568389057751, - "grad_norm": 1.103196620941162, - "learning_rate": 9.177636762710321e-07, - "loss": 0.29140013456344604, - "mean_token_accuracy": 0.8789779543876648, - "num_tokens": 25854707.0, - "step": 2892 - }, - { - "epoch": 2.1983282674772036, - "grad_norm": 1.597692847251892, - "learning_rate": 9.161426713767574e-07, - "loss": 0.37799614667892456, - "mean_token_accuracy": 0.8623079061508179, - "num_tokens": 25868429.0, - "step": 2893 - }, - { - "epoch": 2.1990881458966567, - "grad_norm": 2.227132558822632, - "learning_rate": 9.145227780945265e-07, - "loss": 0.2683261036872864, - "mean_token_accuracy": 0.9092563390731812, - "num_tokens": 25875367.0, - "step": 2894 - }, - { - "epoch": 2.1998480243161094, - "grad_norm": 3.1229634284973145, - "learning_rate": 9.129039975612408e-07, - "loss": 0.21859994530677795, - "mean_token_accuracy": 0.9187530875205994, - "num_tokens": 25879456.0, - "step": 2895 - }, - { - "epoch": 2.2006079027355625, - "grad_norm": 2.3224828243255615, - "learning_rate": 9.112863309130235e-07, - "loss": 0.3557605743408203, - "mean_token_accuracy": 0.8735873103141785, - "num_tokens": 25886477.0, - "step": 2896 - }, - { - "epoch": 2.201367781155015, - "grad_norm": 1.7784863710403442, - "learning_rate": 9.096697792852155e-07, - "loss": 0.334577351808548, - "mean_token_accuracy": 0.8948780298233032, - "num_tokens": 25894977.0, - "step": 2897 - }, - { - "epoch": 2.202127659574468, - "grad_norm": 2.34066104888916, - "learning_rate": 9.080543438123746e-07, - "loss": 0.16479721665382385, - "mean_token_accuracy": 0.9405456781387329, - "num_tokens": 25900015.0, - "step": 2898 - }, - { - "epoch": 2.202887537993921, - "grad_norm": 1.944082498550415, - "learning_rate": 9.064400256282757e-07, - "loss": 0.40259572863578796, - "mean_token_accuracy": 0.8632713556289673, - "num_tokens": 25908749.0, - "step": 2899 - }, - { - "epoch": 2.2036474164133737, - "grad_norm": 1.2758828401565552, - "learning_rate": 9.048268258659098e-07, - "loss": 0.3939874470233917, - "mean_token_accuracy": 0.8652969598770142, - "num_tokens": 25924972.0, - "step": 2900 - }, - { - "epoch": 2.204407294832827, - "grad_norm": 1.4483891725540161, - "learning_rate": 9.032147456574822e-07, - "loss": 0.4132935404777527, - "mean_token_accuracy": 0.868486762046814, - "num_tokens": 25939785.0, - "step": 2901 - }, - { - "epoch": 2.2051671732522795, - "grad_norm": 1.4866713285446167, - "learning_rate": 9.01603786134413e-07, - "loss": 0.3644951581954956, - "mean_token_accuracy": 0.8750203847885132, - "num_tokens": 25952648.0, - "step": 2902 - }, - { - "epoch": 2.2059270516717326, - "grad_norm": 1.6555454730987549, - "learning_rate": 8.999939484273362e-07, - "loss": 0.48656779527664185, - "mean_token_accuracy": 0.8372372984886169, - "num_tokens": 25965062.0, - "step": 2903 - }, - { - "epoch": 2.2066869300911853, - "grad_norm": 2.3154168128967285, - "learning_rate": 8.983852336660959e-07, - "loss": 0.3768891990184784, - "mean_token_accuracy": 0.8614999055862427, - "num_tokens": 25972152.0, - "step": 2904 - }, - { - "epoch": 2.2074468085106385, - "grad_norm": 2.3618056774139404, - "learning_rate": 8.967776429797529e-07, - "loss": 0.24905793368816376, - "mean_token_accuracy": 0.9170958995819092, - "num_tokens": 25977808.0, - "step": 2905 - }, - { - "epoch": 2.208206686930091, - "grad_norm": 1.929051399230957, - "learning_rate": 8.951711774965741e-07, - "loss": 0.38099539279937744, - "mean_token_accuracy": 0.8812143802642822, - "num_tokens": 25987871.0, - "step": 2906 - }, - { - "epoch": 2.2089665653495443, - "grad_norm": 1.6529620885849, - "learning_rate": 8.93565838344039e-07, - "loss": 0.31784749031066895, - "mean_token_accuracy": 0.8929437398910522, - "num_tokens": 25997777.0, - "step": 2907 - }, - { - "epoch": 2.209726443768997, - "grad_norm": 2.1413469314575195, - "learning_rate": 8.919616266488373e-07, - "loss": 0.4043882191181183, - "mean_token_accuracy": 0.8937146663665771, - "num_tokens": 26005213.0, - "step": 2908 - }, - { - "epoch": 2.2104863221884496, - "grad_norm": 1.3838988542556763, - "learning_rate": 8.903585435368658e-07, - "loss": 0.2858969569206238, - "mean_token_accuracy": 0.9084860682487488, - "num_tokens": 26018371.0, - "step": 2909 - }, - { - "epoch": 2.211246200607903, - "grad_norm": 1.2853319644927979, - "learning_rate": 8.887565901332304e-07, - "loss": 0.3178713619709015, - "mean_token_accuracy": 0.872230589389801, - "num_tokens": 26034136.0, - "step": 2910 - }, - { - "epoch": 2.2120060790273555, - "grad_norm": 2.9032399654388428, - "learning_rate": 8.871557675622442e-07, - "loss": 0.20348960161209106, - "mean_token_accuracy": 0.9275314807891846, - "num_tokens": 26038299.0, - "step": 2911 - }, - { - "epoch": 2.2127659574468086, - "grad_norm": 2.4349892139434814, - "learning_rate": 8.855560769474237e-07, - "loss": 0.24282032251358032, - "mean_token_accuracy": 0.9103988409042358, - "num_tokens": 26043427.0, - "step": 2912 - }, - { - "epoch": 2.2135258358662613, - "grad_norm": 2.324664831161499, - "learning_rate": 8.839575194114958e-07, - "loss": 0.3808317184448242, - "mean_token_accuracy": 0.8598989844322205, - "num_tokens": 26049667.0, - "step": 2913 - }, - { - "epoch": 2.2142857142857144, - "grad_norm": 2.594947576522827, - "learning_rate": 8.823600960763901e-07, - "loss": 0.39623332023620605, - "mean_token_accuracy": 0.8738477230072021, - "num_tokens": 26055428.0, - "step": 2914 - }, - { - "epoch": 2.215045592705167, - "grad_norm": 1.674308180809021, - "learning_rate": 8.807638080632375e-07, - "loss": 0.2641369104385376, - "mean_token_accuracy": 0.9119734764099121, - "num_tokens": 26064355.0, - "step": 2915 - }, - { - "epoch": 2.2158054711246202, - "grad_norm": 2.9884912967681885, - "learning_rate": 8.791686564923746e-07, - "loss": 0.19229236245155334, - "mean_token_accuracy": 0.9388723969459534, - "num_tokens": 26067563.0, - "step": 2916 - }, - { - "epoch": 2.216565349544073, - "grad_norm": 1.8513846397399902, - "learning_rate": 8.775746424833428e-07, - "loss": 0.3076218366622925, - "mean_token_accuracy": 0.9165210723876953, - "num_tokens": 26075609.0, - "step": 2917 - }, - { - "epoch": 2.217325227963526, - "grad_norm": 1.229604721069336, - "learning_rate": 8.759817671548801e-07, - "loss": 0.2727023959159851, - "mean_token_accuracy": 0.8931418061256409, - "num_tokens": 26091183.0, - "step": 2918 - }, - { - "epoch": 2.2180851063829787, - "grad_norm": 2.384413957595825, - "learning_rate": 8.743900316249273e-07, - "loss": 0.27312609553337097, - "mean_token_accuracy": 0.8972288370132446, - "num_tokens": 26096677.0, - "step": 2919 - }, - { - "epoch": 2.2188449848024314, - "grad_norm": 2.186370611190796, - "learning_rate": 8.727994370106288e-07, - "loss": 0.36045557260513306, - "mean_token_accuracy": 0.8788503408432007, - "num_tokens": 26104464.0, - "step": 2920 - }, - { - "epoch": 2.2196048632218845, - "grad_norm": 2.769796848297119, - "learning_rate": 8.71209984428322e-07, - "loss": 0.3427591919898987, - "mean_token_accuracy": 0.892108678817749, - "num_tokens": 26109571.0, - "step": 2921 - }, - { - "epoch": 2.2203647416413372, - "grad_norm": 2.9888014793395996, - "learning_rate": 8.696216749935471e-07, - "loss": 0.20137615501880646, - "mean_token_accuracy": 0.9366025924682617, - "num_tokens": 26113165.0, - "step": 2922 - }, - { - "epoch": 2.2211246200607904, - "grad_norm": 1.484858751296997, - "learning_rate": 8.680345098210408e-07, - "loss": 0.2884698510169983, - "mean_token_accuracy": 0.8992507457733154, - "num_tokens": 26124385.0, - "step": 2923 - }, - { - "epoch": 2.221884498480243, - "grad_norm": 1.690119981765747, - "learning_rate": 8.664484900247363e-07, - "loss": 0.34275567531585693, - "mean_token_accuracy": 0.8682634234428406, - "num_tokens": 26134944.0, - "step": 2924 - }, - { - "epoch": 2.222644376899696, - "grad_norm": 1.6171982288360596, - "learning_rate": 8.64863616717764e-07, - "loss": 0.256338506937027, - "mean_token_accuracy": 0.9281957745552063, - "num_tokens": 26143586.0, - "step": 2925 - }, - { - "epoch": 2.223404255319149, - "grad_norm": 2.4853835105895996, - "learning_rate": 8.632798910124493e-07, - "loss": 0.26290056109428406, - "mean_token_accuracy": 0.9119559526443481, - "num_tokens": 26148931.0, - "step": 2926 - }, - { - "epoch": 2.224164133738602, - "grad_norm": 2.0014333724975586, - "learning_rate": 8.616973140203097e-07, - "loss": 0.33400261402130127, - "mean_token_accuracy": 0.8796782493591309, - "num_tokens": 26156246.0, - "step": 2927 - }, - { - "epoch": 2.2249240121580547, - "grad_norm": 1.4637027978897095, - "learning_rate": 8.601158868520617e-07, - "loss": 0.24374958872795105, - "mean_token_accuracy": 0.9116952419281006, - "num_tokens": 26166431.0, - "step": 2928 - }, - { - "epoch": 2.225683890577508, - "grad_norm": 2.2056987285614014, - "learning_rate": 8.585356106176093e-07, - "loss": 0.3419337570667267, - "mean_token_accuracy": 0.8703858852386475, - "num_tokens": 26173974.0, - "step": 2929 - }, - { - "epoch": 2.2264437689969605, - "grad_norm": 1.3687927722930908, - "learning_rate": 8.569564864260524e-07, - "loss": 0.43176111578941345, - "mean_token_accuracy": 0.8616900444030762, - "num_tokens": 26191632.0, - "step": 2930 - }, - { - "epoch": 2.227203647416413, - "grad_norm": 1.4975634813308716, - "learning_rate": 8.553785153856809e-07, - "loss": 0.38525745272636414, - "mean_token_accuracy": 0.8611687421798706, - "num_tokens": 26203300.0, - "step": 2931 - }, - { - "epoch": 2.2279635258358663, - "grad_norm": 1.970109462738037, - "learning_rate": 8.538016986039751e-07, - "loss": 0.31731468439102173, - "mean_token_accuracy": 0.884365975856781, - "num_tokens": 26210037.0, - "step": 2932 - }, - { - "epoch": 2.228723404255319, - "grad_norm": 2.681717872619629, - "learning_rate": 8.522260371876068e-07, - "loss": 0.2770140767097473, - "mean_token_accuracy": 0.9020107984542847, - "num_tokens": 26215460.0, - "step": 2933 - }, - { - "epoch": 2.229483282674772, - "grad_norm": 2.2324795722961426, - "learning_rate": 8.506515322424349e-07, - "loss": 0.30599141120910645, - "mean_token_accuracy": 0.8939633965492249, - "num_tokens": 26221260.0, - "step": 2934 - }, - { - "epoch": 2.230243161094225, - "grad_norm": 2.08915376663208, - "learning_rate": 8.49078184873508e-07, - "loss": 0.3609209954738617, - "mean_token_accuracy": 0.8776482343673706, - "num_tokens": 26228397.0, - "step": 2935 - }, - { - "epoch": 2.231003039513678, - "grad_norm": 1.641366958618164, - "learning_rate": 8.475059961850617e-07, - "loss": 0.2969125509262085, - "mean_token_accuracy": 0.8949217796325684, - "num_tokens": 26238533.0, - "step": 2936 - }, - { - "epoch": 2.2317629179331306, - "grad_norm": 1.082148551940918, - "learning_rate": 8.459349672805198e-07, - "loss": 0.23957109451293945, - "mean_token_accuracy": 0.9255712032318115, - "num_tokens": 26254154.0, - "step": 2937 - }, - { - "epoch": 2.2325227963525838, - "grad_norm": 2.495208740234375, - "learning_rate": 8.443650992624877e-07, - "loss": 0.2879767417907715, - "mean_token_accuracy": 0.8911515474319458, - "num_tokens": 26260812.0, - "step": 2938 - }, - { - "epoch": 2.2332826747720365, - "grad_norm": 3.566549062728882, - "learning_rate": 8.427963932327621e-07, - "loss": 0.31420570611953735, - "mean_token_accuracy": 0.8888009190559387, - "num_tokens": 26264592.0, - "step": 2939 - }, - { - "epoch": 2.2340425531914896, - "grad_norm": 2.217177391052246, - "learning_rate": 8.412288502923211e-07, - "loss": 0.30547618865966797, - "mean_token_accuracy": 0.9065294861793518, - "num_tokens": 26270729.0, - "step": 2940 - }, - { - "epoch": 2.2348024316109423, - "grad_norm": 1.404260277748108, - "learning_rate": 8.396624715413251e-07, - "loss": 0.32485032081604004, - "mean_token_accuracy": 0.8799532651901245, - "num_tokens": 26284280.0, - "step": 2941 - }, - { - "epoch": 2.235562310030395, - "grad_norm": 1.5519827604293823, - "learning_rate": 8.380972580791191e-07, - "loss": 0.3330575227737427, - "mean_token_accuracy": 0.8865892887115479, - "num_tokens": 26293635.0, - "step": 2942 - }, - { - "epoch": 2.236322188449848, - "grad_norm": 2.604766845703125, - "learning_rate": 8.365332110042323e-07, - "loss": 0.18986842036247253, - "mean_token_accuracy": 0.9276989102363586, - "num_tokens": 26298553.0, - "step": 2943 - }, - { - "epoch": 2.237082066869301, - "grad_norm": 2.1750004291534424, - "learning_rate": 8.349703314143712e-07, - "loss": 0.3661153018474579, - "mean_token_accuracy": 0.8879489302635193, - "num_tokens": 26305697.0, - "step": 2944 - }, - { - "epoch": 2.237841945288754, - "grad_norm": 2.247069835662842, - "learning_rate": 8.334086204064254e-07, - "loss": 0.3127560615539551, - "mean_token_accuracy": 0.8846344351768494, - "num_tokens": 26312347.0, - "step": 2945 - }, - { - "epoch": 2.2386018237082066, - "grad_norm": 1.905275821685791, - "learning_rate": 8.318480790764638e-07, - "loss": 0.44245776534080505, - "mean_token_accuracy": 0.87440425157547, - "num_tokens": 26322787.0, - "step": 2946 - }, - { - "epoch": 2.2393617021276597, - "grad_norm": 1.8596254587173462, - "learning_rate": 8.302887085197342e-07, - "loss": 0.30068373680114746, - "mean_token_accuracy": 0.8847110271453857, - "num_tokens": 26330437.0, - "step": 2947 - }, - { - "epoch": 2.2401215805471124, - "grad_norm": 2.0028860569000244, - "learning_rate": 8.28730509830663e-07, - "loss": 0.4276006817817688, - "mean_token_accuracy": 0.8406014442443848, - "num_tokens": 26340100.0, - "step": 2948 - }, - { - "epoch": 2.2408814589665655, - "grad_norm": 2.494434356689453, - "learning_rate": 8.271734841028553e-07, - "loss": 0.3874223232269287, - "mean_token_accuracy": 0.8782174587249756, - "num_tokens": 26345750.0, - "step": 2949 - }, - { - "epoch": 2.2416413373860182, - "grad_norm": 1.955613613128662, - "learning_rate": 8.256176324290885e-07, - "loss": 0.28770074248313904, - "mean_token_accuracy": 0.9004360437393188, - "num_tokens": 26353342.0, - "step": 2950 - }, - { - "epoch": 2.2424012158054714, - "grad_norm": 1.7579785585403442, - "learning_rate": 8.240629559013222e-07, - "loss": 0.2277943640947342, - "mean_token_accuracy": 0.9145861864089966, - "num_tokens": 26361348.0, - "step": 2951 - }, - { - "epoch": 2.243161094224924, - "grad_norm": 1.5848479270935059, - "learning_rate": 8.22509455610688e-07, - "loss": 0.32944542169570923, - "mean_token_accuracy": 0.8662827014923096, - "num_tokens": 26372006.0, - "step": 2952 - }, - { - "epoch": 2.2439209726443767, - "grad_norm": 2.6263222694396973, - "learning_rate": 8.209571326474897e-07, - "loss": 0.34646326303482056, - "mean_token_accuracy": 0.8817736506462097, - "num_tokens": 26377664.0, - "step": 2953 - }, - { - "epoch": 2.24468085106383, - "grad_norm": 2.407590627670288, - "learning_rate": 8.194059881012107e-07, - "loss": 0.41302192211151123, - "mean_token_accuracy": 0.8898757696151733, - "num_tokens": 26384225.0, - "step": 2954 - }, - { - "epoch": 2.2454407294832825, - "grad_norm": 2.5156402587890625, - "learning_rate": 8.178560230605012e-07, - "loss": 0.3468608558177948, - "mean_token_accuracy": 0.8879599571228027, - "num_tokens": 26389374.0, - "step": 2955 - }, - { - "epoch": 2.2462006079027357, - "grad_norm": 1.5076090097427368, - "learning_rate": 8.163072386131876e-07, - "loss": 0.3750625550746918, - "mean_token_accuracy": 0.8712738752365112, - "num_tokens": 26402674.0, - "step": 2956 - }, - { - "epoch": 2.2469604863221884, - "grad_norm": 1.5181068181991577, - "learning_rate": 8.147596358462662e-07, - "loss": 0.19113478064537048, - "mean_token_accuracy": 0.9323463439941406, - "num_tokens": 26411626.0, - "step": 2957 - }, - { - "epoch": 2.2477203647416415, - "grad_norm": 1.0806915760040283, - "learning_rate": 8.132132158459044e-07, - "loss": 0.3411233425140381, - "mean_token_accuracy": 0.8736830949783325, - "num_tokens": 26435891.0, - "step": 2958 - }, - { - "epoch": 2.248480243161094, - "grad_norm": 1.5527247190475464, - "learning_rate": 8.116679796974389e-07, - "loss": 0.425741970539093, - "mean_token_accuracy": 0.8448845148086548, - "num_tokens": 26448134.0, - "step": 2959 - }, - { - "epoch": 2.2492401215805473, - "grad_norm": 1.2390631437301636, - "learning_rate": 8.10123928485377e-07, - "loss": 0.38084933161735535, - "mean_token_accuracy": 0.8656617999076843, - "num_tokens": 26467213.0, - "step": 2960 - }, - { - "epoch": 2.25, - "grad_norm": 3.0672852993011475, - "learning_rate": 8.08581063293391e-07, - "loss": 0.29300111532211304, - "mean_token_accuracy": 0.8933638334274292, - "num_tokens": 26471599.0, - "step": 2961 - }, - { - "epoch": 2.250759878419453, - "grad_norm": 1.2359145879745483, - "learning_rate": 8.070393852043251e-07, - "loss": 0.41337621212005615, - "mean_token_accuracy": 0.854198694229126, - "num_tokens": 26488461.0, - "step": 2962 - }, - { - "epoch": 2.251519756838906, - "grad_norm": 1.8551225662231445, - "learning_rate": 8.054988953001889e-07, - "loss": 0.3036419153213501, - "mean_token_accuracy": 0.8883144855499268, - "num_tokens": 26496398.0, - "step": 2963 - }, - { - "epoch": 2.2522796352583585, - "grad_norm": 1.3691812753677368, - "learning_rate": 8.039595946621551e-07, - "loss": 0.3286219835281372, - "mean_token_accuracy": 0.892130434513092, - "num_tokens": 26510493.0, - "step": 2964 - }, - { - "epoch": 2.2530395136778116, - "grad_norm": 1.7371556758880615, - "learning_rate": 8.024214843705647e-07, - "loss": 0.4105026125907898, - "mean_token_accuracy": 0.8889180421829224, - "num_tokens": 26519148.0, - "step": 2965 - }, - { - "epoch": 2.2537993920972643, - "grad_norm": 2.211665630340576, - "learning_rate": 8.00884565504925e-07, - "loss": 0.3912196159362793, - "mean_token_accuracy": 0.8632891774177551, - "num_tokens": 26526314.0, - "step": 2966 - }, - { - "epoch": 2.2545592705167175, - "grad_norm": 2.476206064224243, - "learning_rate": 7.993488391439025e-07, - "loss": 0.20462508499622345, - "mean_token_accuracy": 0.9276266098022461, - "num_tokens": 26531781.0, - "step": 2967 - }, - { - "epoch": 2.25531914893617, - "grad_norm": 1.4944102764129639, - "learning_rate": 7.978143063653296e-07, - "loss": 0.2694895267486572, - "mean_token_accuracy": 0.9033881425857544, - "num_tokens": 26543780.0, - "step": 2968 - }, - { - "epoch": 2.2560790273556233, - "grad_norm": 1.7570104598999023, - "learning_rate": 7.962809682462008e-07, - "loss": 0.3060353100299835, - "mean_token_accuracy": 0.8908290863037109, - "num_tokens": 26551978.0, - "step": 2969 - }, - { - "epoch": 2.256838905775076, - "grad_norm": 2.215514898300171, - "learning_rate": 7.947488258626718e-07, - "loss": 0.2930528521537781, - "mean_token_accuracy": 0.8989757299423218, - "num_tokens": 26558267.0, - "step": 2970 - }, - { - "epoch": 2.2575987841945286, - "grad_norm": 2.3069000244140625, - "learning_rate": 7.93217880290059e-07, - "loss": 0.18501774966716766, - "mean_token_accuracy": 0.931271493434906, - "num_tokens": 26563286.0, - "step": 2971 - }, - { - "epoch": 2.2583586626139818, - "grad_norm": 1.6555116176605225, - "learning_rate": 7.916881326028387e-07, - "loss": 0.3178265392780304, - "mean_token_accuracy": 0.9016884565353394, - "num_tokens": 26572087.0, - "step": 2972 - }, - { - "epoch": 2.2591185410334345, - "grad_norm": 2.222161054611206, - "learning_rate": 7.901595838746471e-07, - "loss": 0.3013504445552826, - "mean_token_accuracy": 0.8942798376083374, - "num_tokens": 26578159.0, - "step": 2973 - }, - { - "epoch": 2.2598784194528876, - "grad_norm": 1.979411005973816, - "learning_rate": 7.886322351782782e-07, - "loss": 0.42746615409851074, - "mean_token_accuracy": 0.85303795337677, - "num_tokens": 26586252.0, - "step": 2974 - }, - { - "epoch": 2.2606382978723403, - "grad_norm": 1.4925786256790161, - "learning_rate": 7.871060875856854e-07, - "loss": 0.33495625853538513, - "mean_token_accuracy": 0.8911026120185852, - "num_tokens": 26599921.0, - "step": 2975 - }, - { - "epoch": 2.2613981762917934, - "grad_norm": 1.9037046432495117, - "learning_rate": 7.855811421679746e-07, - "loss": 0.31471866369247437, - "mean_token_accuracy": 0.9007552862167358, - "num_tokens": 26607954.0, - "step": 2976 - }, - { - "epoch": 2.262158054711246, - "grad_norm": 2.2751407623291016, - "learning_rate": 7.840573999954154e-07, - "loss": 0.26972368359565735, - "mean_token_accuracy": 0.8992317914962769, - "num_tokens": 26614036.0, - "step": 2977 - }, - { - "epoch": 2.262917933130699, - "grad_norm": 2.680572271347046, - "learning_rate": 7.825348621374257e-07, - "loss": 0.4264066219329834, - "mean_token_accuracy": 0.8547691106796265, - "num_tokens": 26619545.0, - "step": 2978 - }, - { - "epoch": 2.263677811550152, - "grad_norm": 2.3535876274108887, - "learning_rate": 7.810135296625817e-07, - "loss": 0.37871062755584717, - "mean_token_accuracy": 0.8621708750724792, - "num_tokens": 26626248.0, - "step": 2979 - }, - { - "epoch": 2.264437689969605, - "grad_norm": 1.2249537706375122, - "learning_rate": 7.794934036386139e-07, - "loss": 0.3877285122871399, - "mean_token_accuracy": 0.8593572378158569, - "num_tokens": 26648023.0, - "step": 2980 - }, - { - "epoch": 2.2651975683890577, - "grad_norm": 2.43371844291687, - "learning_rate": 7.779744851324048e-07, - "loss": 0.37463510036468506, - "mean_token_accuracy": 0.8646193742752075, - "num_tokens": 26654016.0, - "step": 2981 - }, - { - "epoch": 2.2659574468085104, - "grad_norm": 1.7429327964782715, - "learning_rate": 7.7645677520999e-07, - "loss": 0.4033060669898987, - "mean_token_accuracy": 0.8644014596939087, - "num_tokens": 26664447.0, - "step": 2982 - }, - { - "epoch": 2.2667173252279635, - "grad_norm": 2.4090006351470947, - "learning_rate": 7.749402749365573e-07, - "loss": 0.2981206774711609, - "mean_token_accuracy": 0.8886175751686096, - "num_tokens": 26670355.0, - "step": 2983 - }, - { - "epoch": 2.2674772036474162, - "grad_norm": 1.3855396509170532, - "learning_rate": 7.734249853764428e-07, - "loss": 0.35967472195625305, - "mean_token_accuracy": 0.8652631044387817, - "num_tokens": 26685385.0, - "step": 2984 - }, - { - "epoch": 2.2682370820668694, - "grad_norm": 1.328214168548584, - "learning_rate": 7.719109075931375e-07, - "loss": 0.3571951389312744, - "mean_token_accuracy": 0.8894522190093994, - "num_tokens": 26703265.0, - "step": 2985 - }, - { - "epoch": 2.268996960486322, - "grad_norm": 2.5001046657562256, - "learning_rate": 7.703980426492791e-07, - "loss": 0.3512844741344452, - "mean_token_accuracy": 0.887405514717102, - "num_tokens": 26709095.0, - "step": 2986 - }, - { - "epoch": 2.269756838905775, - "grad_norm": 1.8704569339752197, - "learning_rate": 7.688863916066524e-07, - "loss": 0.2746743857860565, - "mean_token_accuracy": 0.903412401676178, - "num_tokens": 26716815.0, - "step": 2987 - }, - { - "epoch": 2.270516717325228, - "grad_norm": 2.1134285926818848, - "learning_rate": 7.673759555261947e-07, - "loss": 0.38385504484176636, - "mean_token_accuracy": 0.8759124279022217, - "num_tokens": 26724046.0, - "step": 2988 - }, - { - "epoch": 2.271276595744681, - "grad_norm": 1.2651840448379517, - "learning_rate": 7.65866735467988e-07, - "loss": 0.3499506413936615, - "mean_token_accuracy": 0.8704953193664551, - "num_tokens": 26743024.0, - "step": 2989 - }, - { - "epoch": 2.2720364741641337, - "grad_norm": 1.7289817333221436, - "learning_rate": 7.643587324912597e-07, - "loss": 0.3768725097179413, - "mean_token_accuracy": 0.8623670339584351, - "num_tokens": 26754336.0, - "step": 2990 - }, - { - "epoch": 2.272796352583587, - "grad_norm": 1.6121667623519897, - "learning_rate": 7.628519476543839e-07, - "loss": 0.42746737599372864, - "mean_token_accuracy": 0.8425478935241699, - "num_tokens": 26766813.0, - "step": 2991 - }, - { - "epoch": 2.2735562310030395, - "grad_norm": 2.705442428588867, - "learning_rate": 7.613463820148831e-07, - "loss": 0.27137982845306396, - "mean_token_accuracy": 0.9014253616333008, - "num_tokens": 26772565.0, - "step": 2992 - }, - { - "epoch": 2.274316109422492, - "grad_norm": 1.3811960220336914, - "learning_rate": 7.598420366294185e-07, - "loss": 0.2957465350627899, - "mean_token_accuracy": 0.8935354351997375, - "num_tokens": 26787325.0, - "step": 2993 - }, - { - "epoch": 2.2750759878419453, - "grad_norm": 2.469336986541748, - "learning_rate": 7.583389125537982e-07, - "loss": 0.2811780273914337, - "mean_token_accuracy": 0.8956634998321533, - "num_tokens": 26793457.0, - "step": 2994 - }, - { - "epoch": 2.275835866261398, - "grad_norm": 2.945681571960449, - "learning_rate": 7.568370108429732e-07, - "loss": 0.3186708092689514, - "mean_token_accuracy": 0.8817545175552368, - "num_tokens": 26797867.0, - "step": 2995 - }, - { - "epoch": 2.276595744680851, - "grad_norm": 1.7748228311538696, - "learning_rate": 7.553363325510355e-07, - "loss": 0.3279818892478943, - "mean_token_accuracy": 0.884396493434906, - "num_tokens": 26806656.0, - "step": 2996 - }, - { - "epoch": 2.277355623100304, - "grad_norm": 1.312500238418579, - "learning_rate": 7.538368787312186e-07, - "loss": 0.3754822611808777, - "mean_token_accuracy": 0.8653179407119751, - "num_tokens": 26823126.0, - "step": 2997 - }, - { - "epoch": 2.278115501519757, - "grad_norm": 3.1305344104766846, - "learning_rate": 7.523386504358984e-07, - "loss": 0.3293214440345764, - "mean_token_accuracy": 0.8908799886703491, - "num_tokens": 26828250.0, - "step": 2998 - }, - { - "epoch": 2.2788753799392096, - "grad_norm": 2.6449344158172607, - "learning_rate": 7.508416487165862e-07, - "loss": 0.23732036352157593, - "mean_token_accuracy": 0.9029837846755981, - "num_tokens": 26833123.0, - "step": 2999 - }, - { - "epoch": 2.2796352583586628, - "grad_norm": 2.04388427734375, - "learning_rate": 7.49345874623939e-07, - "loss": 0.31240373849868774, - "mean_token_accuracy": 0.8860392570495605, - "num_tokens": 26840878.0, - "step": 3000 - } - ], - "logging_steps": 1.0, - "max_steps": 3948, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 2.925799536381133e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin deleted file mode 100644 index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000 --- a/checkpoint-3000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 -size 6225 diff --git a/checkpoint-3948/chat_template.jinja b/checkpoint-3948/chat_template.jinja deleted file mode 100644 index 70adff8a08fb31e0636f618564838d4bf3c05286..0000000000000000000000000000000000000000 --- a/checkpoint-3948/chat_template.jinja +++ /dev/null @@ -1,61 +0,0 @@ -{%- if tools %} - {{- '<|im_start|>system\n' }} - {%- if messages[0].role == 'system' %} - {{- messages[0].content + '\n\n' }} - {%- endif %} - {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} - {%- for tool in tools %} - {{- "\n" }} - {{- tool | tojson }} - {%- endfor %} - {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} -{%- else %} - {%- if messages[0].role == 'system' %} - {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} - {%- endif %} -{%- endif %} -{%- for message in messages %} - {%- if message.content is string %} - {%- set content = message.content %} - {%- else %} - {%- set content = '' %} - {%- endif %} - {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} - {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} - {%- elif message.role == "assistant" %} - {{- '<|im_start|>' + message.role + '\n' + content }} - {%- if message.tool_calls %} - {%- for tool_call in message.tool_calls %} - {%- if (loop.first and content) or (not loop.first) %} - {{- '\n' }} - {%- endif %} - {%- if tool_call.function %} - {%- set tool_call = tool_call.function %} - {%- endif %} - {{- '\n{"name": "' }} - {{- tool_call.name }} - {{- '", "arguments": ' }} - {%- if tool_call.arguments is string %} - {{- tool_call.arguments }} - {%- else %} - {{- tool_call.arguments | tojson }} - {%- endif %} - {{- '}\n' }} - {%- endfor %} - {%- endif %} - {{- '<|im_end|>\n' }} - {%- elif message.role == "tool" %} - {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} - {{- '<|im_start|>user' }} - {%- endif %} - {{- '\n\n' }} - {{- content }} - {{- '\n' }} - {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} - {{- '<|im_end|>\n' }} - {%- endif %} - {%- endif %} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|im_start|>assistant\n' }} -{%- endif %} \ No newline at end of file diff --git a/checkpoint-3948/config.json b/checkpoint-3948/config.json deleted file mode 100644 index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000 --- a/checkpoint-3948/config.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": null, - "dtype": "float32", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 262144, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "pad_token_id": 151662, - "rms_norm_eps": 1e-06, - "rope_parameters": { - "rope_theta": 5000000, - "rope_type": "default" - }, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "5.5.3", - "use_cache": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/checkpoint-3948/generation_config.json b/checkpoint-3948/generation_config.json deleted file mode 100644 index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000 --- a/checkpoint-3948/generation_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "do_sample": true, - "eos_token_id": [ - 151645, - 151643 - ], - "pad_token_id": 151662, - "temperature": 0.7, - "top_k": 20, - "top_p": 0.8, - "transformers_version": "5.5.3" -} diff --git a/checkpoint-3948/model.safetensors b/checkpoint-3948/model.safetensors deleted file mode 100644 index f787ad62bc7ccc577c324b6d71689c0739123f0c..0000000000000000000000000000000000000000 --- a/checkpoint-3948/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928 -size 17645743048 diff --git a/checkpoint-3948/optimizer.bin b/checkpoint-3948/optimizer.bin deleted file mode 100644 index 90ea9835df74c549d6f6b88c64f00fdc211af5fa..0000000000000000000000000000000000000000 --- a/checkpoint-3948/optimizer.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:656d334c407ae1443fcaeda271d597e51249875fdde8e1a12a024812f6de73ab -size 32180124005 diff --git a/checkpoint-3948/pytorch_model_fsdp.bin b/checkpoint-3948/pytorch_model_fsdp.bin deleted file mode 100644 index a96db7a5fcab43218d82108cacd5f6fc2583929f..0000000000000000000000000000000000000000 --- a/checkpoint-3948/pytorch_model_fsdp.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51d19fbc90bb938bf3c747a8b9c2b23f00398029d4ab146ca0ca0a0ea7d8885c -size 17645897996 diff --git a/checkpoint-3948/rng_state_0.pth b/checkpoint-3948/rng_state_0.pth deleted file mode 100644 index 5379ca97bc0c62d226d0fc37920d4937a7bb8b43..0000000000000000000000000000000000000000 --- a/checkpoint-3948/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61e957b4cd785256be4cb26eb03060ef689e1d58f1766d7f26ca36a62bec4994 -size 14917 diff --git a/checkpoint-3948/rng_state_1.pth b/checkpoint-3948/rng_state_1.pth deleted file mode 100644 index 662ad0d5b30369c825f66c080779973608c5058e..0000000000000000000000000000000000000000 --- a/checkpoint-3948/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:550c54d430b44b77b0abe44c6e3ceba90a155305315c081b7616b35e2c18d1ce -size 14917 diff --git a/checkpoint-3948/scheduler.pt b/checkpoint-3948/scheduler.pt deleted file mode 100644 index 51ed35f90326eb016d2a1c3993d7061549624ca8..0000000000000000000000000000000000000000 --- a/checkpoint-3948/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:deaab1725fa5d6abb332a09b31b7c4d93808c0289cb39a32cd5102547b98e285 -size 1465 diff --git a/checkpoint-3948/tokenizer.json b/checkpoint-3948/tokenizer.json deleted file mode 100644 index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000 --- a/checkpoint-3948/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 -size 11422650 diff --git a/checkpoint-3948/tokenizer_config.json b/checkpoint-3948/tokenizer_config.json deleted file mode 100644 index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000 --- a/checkpoint-3948/tokenizer_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "add_prefix_space": false, - "backend": "tokenizers", - "bos_token": null, - "clean_up_tokenization_spaces": false, - "eos_token": "<|im_end|>", - "errors": "replace", - "extra_special_tokens": [ - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>" - ], - "is_local": false, - "model_max_length": 1010000, - "pad_token": "<|fim_pad|>", - "split_special_tokens": false, - "tokenizer_class": "Qwen2Tokenizer", - "unk_token": null -} diff --git a/checkpoint-3948/trainer_state.json b/checkpoint-3948/trainer_state.json deleted file mode 100644 index 5d447faf10413b9ec27585679ff7a32bdbe441fe..0000000000000000000000000000000000000000 --- a/checkpoint-3948/trainer_state.json +++ /dev/null @@ -1,35566 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 3.0, - "eval_steps": 500, - "global_step": 3948, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0007598784194528875, - "grad_norm": 11.767926216125488, - "learning_rate": 0.0, - "loss": 0.7937269806861877, - "mean_token_accuracy": 0.7822731137275696, - "num_tokens": 10507.0, - "step": 1 - }, - { - "epoch": 0.001519756838905775, - "grad_norm": 14.9199800491333, - "learning_rate": 2.5252525252525256e-08, - "loss": 0.7665389776229858, - "mean_token_accuracy": 0.8342233300209045, - "num_tokens": 14806.0, - "step": 2 - }, - { - "epoch": 0.0022796352583586625, - "grad_norm": 11.991217613220215, - "learning_rate": 5.050505050505051e-08, - "loss": 0.9597002267837524, - "mean_token_accuracy": 0.7054992318153381, - "num_tokens": 27170.0, - "step": 3 - }, - { - "epoch": 0.00303951367781155, - "grad_norm": 12.958333015441895, - "learning_rate": 7.575757575757576e-08, - "loss": 0.9971482753753662, - "mean_token_accuracy": 0.7261134386062622, - "num_tokens": 33729.0, - "step": 4 - }, - { - "epoch": 0.003799392097264438, - "grad_norm": 13.5665283203125, - "learning_rate": 1.0101010101010103e-07, - "loss": 0.9504883885383606, - "mean_token_accuracy": 0.745307445526123, - "num_tokens": 41174.0, - "step": 5 - }, - { - "epoch": 0.004559270516717325, - "grad_norm": 10.09444808959961, - "learning_rate": 1.2626262626262626e-07, - "loss": 0.759548008441925, - "mean_token_accuracy": 0.7842121124267578, - "num_tokens": 47943.0, - "step": 6 - }, - { - "epoch": 0.005319148936170213, - "grad_norm": 10.741650581359863, - "learning_rate": 1.5151515151515152e-07, - "loss": 0.8231598138809204, - "mean_token_accuracy": 0.7550969123840332, - "num_tokens": 56665.0, - "step": 7 - }, - { - "epoch": 0.0060790273556231, - "grad_norm": 12.250170707702637, - "learning_rate": 1.767676767676768e-07, - "loss": 0.8576581478118896, - "mean_token_accuracy": 0.7568671703338623, - "num_tokens": 67606.0, - "step": 8 - }, - { - "epoch": 0.006838905775075988, - "grad_norm": 12.828629493713379, - "learning_rate": 2.0202020202020205e-07, - "loss": 0.9886435866355896, - "mean_token_accuracy": 0.733400285243988, - "num_tokens": 74272.0, - "step": 9 - }, - { - "epoch": 0.007598784194528876, - "grad_norm": 15.966923713684082, - "learning_rate": 2.2727272727272729e-07, - "loss": 1.064985990524292, - "mean_token_accuracy": 0.7101132869720459, - "num_tokens": 80524.0, - "step": 10 - }, - { - "epoch": 0.008358662613981762, - "grad_norm": 10.864850044250488, - "learning_rate": 2.525252525252525e-07, - "loss": 0.8311550617218018, - "mean_token_accuracy": 0.7431639432907104, - "num_tokens": 96292.0, - "step": 11 - }, - { - "epoch": 0.00911854103343465, - "grad_norm": 16.438785552978516, - "learning_rate": 2.7777777777777776e-07, - "loss": 1.0579866170883179, - "mean_token_accuracy": 0.7222976684570312, - "num_tokens": 102992.0, - "step": 12 - }, - { - "epoch": 0.009878419452887538, - "grad_norm": 11.179214477539062, - "learning_rate": 3.0303030303030305e-07, - "loss": 0.9816144704818726, - "mean_token_accuracy": 0.7206371426582336, - "num_tokens": 113571.0, - "step": 13 - }, - { - "epoch": 0.010638297872340425, - "grad_norm": 12.780299186706543, - "learning_rate": 3.2828282828282834e-07, - "loss": 0.847449004650116, - "mean_token_accuracy": 0.7826199531555176, - "num_tokens": 119568.0, - "step": 14 - }, - { - "epoch": 0.011398176291793313, - "grad_norm": 14.800421714782715, - "learning_rate": 3.535353535353536e-07, - "loss": 0.9275516271591187, - "mean_token_accuracy": 0.7655045986175537, - "num_tokens": 126258.0, - "step": 15 - }, - { - "epoch": 0.0121580547112462, - "grad_norm": 11.267602920532227, - "learning_rate": 3.787878787878788e-07, - "loss": 0.8464037179946899, - "mean_token_accuracy": 0.7606508731842041, - "num_tokens": 136831.0, - "step": 16 - }, - { - "epoch": 0.012917933130699088, - "grad_norm": 12.891013145446777, - "learning_rate": 4.040404040404041e-07, - "loss": 0.9903074502944946, - "mean_token_accuracy": 0.7247487306594849, - "num_tokens": 150434.0, - "step": 17 - }, - { - "epoch": 0.013677811550151976, - "grad_norm": 11.13957691192627, - "learning_rate": 4.2929292929292934e-07, - "loss": 0.8287211656570435, - "mean_token_accuracy": 0.7621913552284241, - "num_tokens": 158516.0, - "step": 18 - }, - { - "epoch": 0.014437689969604863, - "grad_norm": 18.39569664001465, - "learning_rate": 4.5454545454545457e-07, - "loss": 1.150015115737915, - "mean_token_accuracy": 0.7349498271942139, - "num_tokens": 162214.0, - "step": 19 - }, - { - "epoch": 0.015197568389057751, - "grad_norm": 9.353750228881836, - "learning_rate": 4.797979797979798e-07, - "loss": 0.7228299379348755, - "mean_token_accuracy": 0.7969573736190796, - "num_tokens": 173035.0, - "step": 20 - }, - { - "epoch": 0.015957446808510637, - "grad_norm": 8.267163276672363, - "learning_rate": 5.05050505050505e-07, - "loss": 0.7358136177062988, - "mean_token_accuracy": 0.7903937101364136, - "num_tokens": 183568.0, - "step": 21 - }, - { - "epoch": 0.016717325227963525, - "grad_norm": 11.137128829956055, - "learning_rate": 5.303030303030304e-07, - "loss": 1.0075397491455078, - "mean_token_accuracy": 0.702807605266571, - "num_tokens": 192759.0, - "step": 22 - }, - { - "epoch": 0.017477203647416412, - "grad_norm": 10.734103202819824, - "learning_rate": 5.555555555555555e-07, - "loss": 0.8925919532775879, - "mean_token_accuracy": 0.7475671768188477, - "num_tokens": 201280.0, - "step": 23 - }, - { - "epoch": 0.0182370820668693, - "grad_norm": 11.945566177368164, - "learning_rate": 5.808080808080809e-07, - "loss": 0.7260514497756958, - "mean_token_accuracy": 0.7859152555465698, - "num_tokens": 218053.0, - "step": 24 - }, - { - "epoch": 0.018996960486322188, - "grad_norm": 18.610652923583984, - "learning_rate": 6.060606060606061e-07, - "loss": 0.8995465636253357, - "mean_token_accuracy": 0.7931990623474121, - "num_tokens": 220953.0, - "step": 25 - }, - { - "epoch": 0.019756838905775075, - "grad_norm": 10.51898193359375, - "learning_rate": 6.313131313131314e-07, - "loss": 0.9532671570777893, - "mean_token_accuracy": 0.7257645726203918, - "num_tokens": 231200.0, - "step": 26 - }, - { - "epoch": 0.020516717325227963, - "grad_norm": 9.581812858581543, - "learning_rate": 6.565656565656567e-07, - "loss": 0.9038010239601135, - "mean_token_accuracy": 0.7390379905700684, - "num_tokens": 237711.0, - "step": 27 - }, - { - "epoch": 0.02127659574468085, - "grad_norm": 12.297484397888184, - "learning_rate": 6.818181818181818e-07, - "loss": 1.048936367034912, - "mean_token_accuracy": 0.7175670862197876, - "num_tokens": 242503.0, - "step": 28 - }, - { - "epoch": 0.022036474164133738, - "grad_norm": 7.437953472137451, - "learning_rate": 7.070707070707071e-07, - "loss": 0.8308826684951782, - "mean_token_accuracy": 0.7415335774421692, - "num_tokens": 250842.0, - "step": 29 - }, - { - "epoch": 0.022796352583586626, - "grad_norm": 6.134475231170654, - "learning_rate": 7.323232323232324e-07, - "loss": 0.647913932800293, - "mean_token_accuracy": 0.8124054670333862, - "num_tokens": 267453.0, - "step": 30 - }, - { - "epoch": 0.023556231003039513, - "grad_norm": 6.678966045379639, - "learning_rate": 7.575757575757576e-07, - "loss": 0.7052810192108154, - "mean_token_accuracy": 0.7908754348754883, - "num_tokens": 284416.0, - "step": 31 - }, - { - "epoch": 0.0243161094224924, - "grad_norm": 7.42232084274292, - "learning_rate": 7.82828282828283e-07, - "loss": 1.022383213043213, - "mean_token_accuracy": 0.7053230404853821, - "num_tokens": 292073.0, - "step": 32 - }, - { - "epoch": 0.02507598784194529, - "grad_norm": 6.463219165802002, - "learning_rate": 8.080808080808082e-07, - "loss": 0.7603012323379517, - "mean_token_accuracy": 0.7728140354156494, - "num_tokens": 298550.0, - "step": 33 - }, - { - "epoch": 0.025835866261398176, - "grad_norm": 5.668411731719971, - "learning_rate": 8.333333333333333e-07, - "loss": 0.7707852721214294, - "mean_token_accuracy": 0.7827773094177246, - "num_tokens": 306683.0, - "step": 34 - }, - { - "epoch": 0.026595744680851064, - "grad_norm": 4.984964847564697, - "learning_rate": 8.585858585858587e-07, - "loss": 0.6317349672317505, - "mean_token_accuracy": 0.8106861114501953, - "num_tokens": 318842.0, - "step": 35 - }, - { - "epoch": 0.02735562310030395, - "grad_norm": 4.421732425689697, - "learning_rate": 8.838383838383839e-07, - "loss": 0.6228617429733276, - "mean_token_accuracy": 0.8023355603218079, - "num_tokens": 329850.0, - "step": 36 - }, - { - "epoch": 0.02811550151975684, - "grad_norm": 5.970808029174805, - "learning_rate": 9.090909090909091e-07, - "loss": 0.8443238139152527, - "mean_token_accuracy": 0.7462409734725952, - "num_tokens": 335844.0, - "step": 37 - }, - { - "epoch": 0.028875379939209727, - "grad_norm": 4.5389084815979, - "learning_rate": 9.343434343434345e-07, - "loss": 0.6976436376571655, - "mean_token_accuracy": 0.790410041809082, - "num_tokens": 348768.0, - "step": 38 - }, - { - "epoch": 0.029635258358662615, - "grad_norm": 4.116631507873535, - "learning_rate": 9.595959595959596e-07, - "loss": 0.6698519587516785, - "mean_token_accuracy": 0.7818127870559692, - "num_tokens": 355460.0, - "step": 39 - }, - { - "epoch": 0.030395136778115502, - "grad_norm": 3.3714773654937744, - "learning_rate": 9.84848484848485e-07, - "loss": 0.5723201036453247, - "mean_token_accuracy": 0.8100086450576782, - "num_tokens": 368507.0, - "step": 40 - }, - { - "epoch": 0.03115501519756839, - "grad_norm": 4.4438347816467285, - "learning_rate": 1.01010101010101e-06, - "loss": 0.7508786916732788, - "mean_token_accuracy": 0.7711942791938782, - "num_tokens": 376467.0, - "step": 41 - }, - { - "epoch": 0.031914893617021274, - "grad_norm": 5.609974384307861, - "learning_rate": 1.0353535353535354e-06, - "loss": 0.566256046295166, - "mean_token_accuracy": 0.8319284319877625, - "num_tokens": 381399.0, - "step": 42 - }, - { - "epoch": 0.03267477203647416, - "grad_norm": 5.124386787414551, - "learning_rate": 1.0606060606060608e-06, - "loss": 0.8151067495346069, - "mean_token_accuracy": 0.7537785768508911, - "num_tokens": 387389.0, - "step": 43 - }, - { - "epoch": 0.03343465045592705, - "grad_norm": 3.6318116188049316, - "learning_rate": 1.085858585858586e-06, - "loss": 0.5989949107170105, - "mean_token_accuracy": 0.8129256963729858, - "num_tokens": 395302.0, - "step": 44 - }, - { - "epoch": 0.03419452887537994, - "grad_norm": 2.694424629211426, - "learning_rate": 1.111111111111111e-06, - "loss": 0.5831396579742432, - "mean_token_accuracy": 0.8056820631027222, - "num_tokens": 409920.0, - "step": 45 - }, - { - "epoch": 0.034954407294832825, - "grad_norm": 2.2949178218841553, - "learning_rate": 1.1363636363636364e-06, - "loss": 0.472550630569458, - "mean_token_accuracy": 0.8343006372451782, - "num_tokens": 428323.0, - "step": 46 - }, - { - "epoch": 0.03571428571428571, - "grad_norm": 3.3930575847625732, - "learning_rate": 1.1616161616161617e-06, - "loss": 0.6246505379676819, - "mean_token_accuracy": 0.783149003982544, - "num_tokens": 435889.0, - "step": 47 - }, - { - "epoch": 0.0364741641337386, - "grad_norm": 3.692598819732666, - "learning_rate": 1.186868686868687e-06, - "loss": 0.46132946014404297, - "mean_token_accuracy": 0.8583089113235474, - "num_tokens": 441192.0, - "step": 48 - }, - { - "epoch": 0.03723404255319149, - "grad_norm": 6.571533203125, - "learning_rate": 1.2121212121212122e-06, - "loss": 0.9351121783256531, - "mean_token_accuracy": 0.7580878734588623, - "num_tokens": 444277.0, - "step": 49 - }, - { - "epoch": 0.037993920972644375, - "grad_norm": 5.029570579528809, - "learning_rate": 1.2373737373737375e-06, - "loss": 0.6921554803848267, - "mean_token_accuracy": 0.8131166100502014, - "num_tokens": 447646.0, - "step": 50 - }, - { - "epoch": 0.03875379939209726, - "grad_norm": 2.9174208641052246, - "learning_rate": 1.2626262626262629e-06, - "loss": 0.591706395149231, - "mean_token_accuracy": 0.8108617067337036, - "num_tokens": 461397.0, - "step": 51 - }, - { - "epoch": 0.03951367781155015, - "grad_norm": 4.315536022186279, - "learning_rate": 1.287878787878788e-06, - "loss": 0.6986310482025146, - "mean_token_accuracy": 0.7710754871368408, - "num_tokens": 472047.0, - "step": 52 - }, - { - "epoch": 0.04027355623100304, - "grad_norm": 2.6216275691986084, - "learning_rate": 1.3131313131313134e-06, - "loss": 0.5553690791130066, - "mean_token_accuracy": 0.8167896866798401, - "num_tokens": 482795.0, - "step": 53 - }, - { - "epoch": 0.041033434650455926, - "grad_norm": 3.0562477111816406, - "learning_rate": 1.3383838383838385e-06, - "loss": 0.6909202337265015, - "mean_token_accuracy": 0.7859863638877869, - "num_tokens": 494818.0, - "step": 54 - }, - { - "epoch": 0.04179331306990881, - "grad_norm": 2.1420412063598633, - "learning_rate": 1.3636363636363636e-06, - "loss": 0.5415265560150146, - "mean_token_accuracy": 0.818886399269104, - "num_tokens": 513695.0, - "step": 55 - }, - { - "epoch": 0.0425531914893617, - "grad_norm": 2.9610488414764404, - "learning_rate": 1.3888888888888892e-06, - "loss": 0.6602212190628052, - "mean_token_accuracy": 0.7830734252929688, - "num_tokens": 523784.0, - "step": 56 - }, - { - "epoch": 0.04331306990881459, - "grad_norm": 2.511972665786743, - "learning_rate": 1.4141414141414143e-06, - "loss": 0.5717809796333313, - "mean_token_accuracy": 0.8053616285324097, - "num_tokens": 546308.0, - "step": 57 - }, - { - "epoch": 0.044072948328267476, - "grad_norm": 3.52642822265625, - "learning_rate": 1.4393939393939396e-06, - "loss": 0.6242594718933105, - "mean_token_accuracy": 0.8162082433700562, - "num_tokens": 552019.0, - "step": 58 - }, - { - "epoch": 0.044832826747720364, - "grad_norm": 3.02362322807312, - "learning_rate": 1.4646464646464648e-06, - "loss": 0.6634255647659302, - "mean_token_accuracy": 0.7682032585144043, - "num_tokens": 560009.0, - "step": 59 - }, - { - "epoch": 0.04559270516717325, - "grad_norm": 2.3910107612609863, - "learning_rate": 1.48989898989899e-06, - "loss": 0.5519146919250488, - "mean_token_accuracy": 0.8270269632339478, - "num_tokens": 571005.0, - "step": 60 - }, - { - "epoch": 0.04635258358662614, - "grad_norm": 4.28154993057251, - "learning_rate": 1.5151515151515152e-06, - "loss": 0.7437789440155029, - "mean_token_accuracy": 0.7782418131828308, - "num_tokens": 574950.0, - "step": 61 - }, - { - "epoch": 0.04711246200607903, - "grad_norm": 3.4078686237335205, - "learning_rate": 1.5404040404040404e-06, - "loss": 0.6345915198326111, - "mean_token_accuracy": 0.7903392314910889, - "num_tokens": 581657.0, - "step": 62 - }, - { - "epoch": 0.047872340425531915, - "grad_norm": 2.6834158897399902, - "learning_rate": 1.565656565656566e-06, - "loss": 0.5981127023696899, - "mean_token_accuracy": 0.7911489605903625, - "num_tokens": 591267.0, - "step": 63 - }, - { - "epoch": 0.0486322188449848, - "grad_norm": 2.1054461002349854, - "learning_rate": 1.590909090909091e-06, - "loss": 0.5523523688316345, - "mean_token_accuracy": 0.8194501399993896, - "num_tokens": 606787.0, - "step": 64 - }, - { - "epoch": 0.04939209726443769, - "grad_norm": 3.322596788406372, - "learning_rate": 1.6161616161616164e-06, - "loss": 0.48417025804519653, - "mean_token_accuracy": 0.8293706178665161, - "num_tokens": 611068.0, - "step": 65 - }, - { - "epoch": 0.05015197568389058, - "grad_norm": 2.302450180053711, - "learning_rate": 1.6414141414141415e-06, - "loss": 0.6498389840126038, - "mean_token_accuracy": 0.7728497385978699, - "num_tokens": 624452.0, - "step": 66 - }, - { - "epoch": 0.050911854103343465, - "grad_norm": 2.680191993713379, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.6347037553787231, - "mean_token_accuracy": 0.8108306527137756, - "num_tokens": 638049.0, - "step": 67 - }, - { - "epoch": 0.05167173252279635, - "grad_norm": 3.0297021865844727, - "learning_rate": 1.6919191919191922e-06, - "loss": 0.5344363451004028, - "mean_token_accuracy": 0.8113535046577454, - "num_tokens": 643892.0, - "step": 68 - }, - { - "epoch": 0.05243161094224924, - "grad_norm": 2.9283676147460938, - "learning_rate": 1.7171717171717173e-06, - "loss": 0.6999260187149048, - "mean_token_accuracy": 0.7782022356987, - "num_tokens": 654418.0, - "step": 69 - }, - { - "epoch": 0.05319148936170213, - "grad_norm": 3.4098572731018066, - "learning_rate": 1.7424242424242427e-06, - "loss": 0.6508946418762207, - "mean_token_accuracy": 0.7942900657653809, - "num_tokens": 659837.0, - "step": 70 - }, - { - "epoch": 0.053951367781155016, - "grad_norm": 2.6756019592285156, - "learning_rate": 1.7676767676767678e-06, - "loss": 0.603486180305481, - "mean_token_accuracy": 0.8015457391738892, - "num_tokens": 668361.0, - "step": 71 - }, - { - "epoch": 0.0547112462006079, - "grad_norm": 2.2630293369293213, - "learning_rate": 1.792929292929293e-06, - "loss": 0.6608274579048157, - "mean_token_accuracy": 0.7753809690475464, - "num_tokens": 679025.0, - "step": 72 - }, - { - "epoch": 0.05547112462006079, - "grad_norm": 2.123962879180908, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.4525482654571533, - "mean_token_accuracy": 0.8425612449645996, - "num_tokens": 688574.0, - "step": 73 - }, - { - "epoch": 0.05623100303951368, - "grad_norm": 7.90519905090332, - "learning_rate": 1.8434343434343434e-06, - "loss": 0.6507195830345154, - "mean_token_accuracy": 0.7714964151382446, - "num_tokens": 694534.0, - "step": 74 - }, - { - "epoch": 0.056990881458966566, - "grad_norm": 2.372203826904297, - "learning_rate": 1.868686868686869e-06, - "loss": 0.4458143413066864, - "mean_token_accuracy": 0.7991449236869812, - "num_tokens": 703114.0, - "step": 75 - }, - { - "epoch": 0.057750759878419454, - "grad_norm": 2.918677568435669, - "learning_rate": 1.8939393939393941e-06, - "loss": 0.5614339113235474, - "mean_token_accuracy": 0.8211464881896973, - "num_tokens": 709038.0, - "step": 76 - }, - { - "epoch": 0.05851063829787234, - "grad_norm": 1.6106709241867065, - "learning_rate": 1.9191919191919192e-06, - "loss": 0.5802098512649536, - "mean_token_accuracy": 0.8055065870285034, - "num_tokens": 730482.0, - "step": 77 - }, - { - "epoch": 0.05927051671732523, - "grad_norm": 2.8069989681243896, - "learning_rate": 1.944444444444445e-06, - "loss": 0.5709059238433838, - "mean_token_accuracy": 0.8024872541427612, - "num_tokens": 751817.0, - "step": 78 - }, - { - "epoch": 0.06003039513677812, - "grad_norm": 2.641667127609253, - "learning_rate": 1.96969696969697e-06, - "loss": 0.6480152606964111, - "mean_token_accuracy": 0.7912271618843079, - "num_tokens": 759236.0, - "step": 79 - }, - { - "epoch": 0.060790273556231005, - "grad_norm": 2.6034350395202637, - "learning_rate": 1.994949494949495e-06, - "loss": 0.5535176396369934, - "mean_token_accuracy": 0.7980542778968811, - "num_tokens": 766496.0, - "step": 80 - }, - { - "epoch": 0.06155015197568389, - "grad_norm": 1.7095069885253906, - "learning_rate": 2.02020202020202e-06, - "loss": 0.4545496106147766, - "mean_token_accuracy": 0.8229660391807556, - "num_tokens": 780124.0, - "step": 81 - }, - { - "epoch": 0.06231003039513678, - "grad_norm": 3.788830518722534, - "learning_rate": 2.0454545454545457e-06, - "loss": 0.6679391264915466, - "mean_token_accuracy": 0.7942397594451904, - "num_tokens": 784555.0, - "step": 82 - }, - { - "epoch": 0.06306990881458967, - "grad_norm": 2.009831666946411, - "learning_rate": 2.070707070707071e-06, - "loss": 0.5067101120948792, - "mean_token_accuracy": 0.8276634216308594, - "num_tokens": 797459.0, - "step": 83 - }, - { - "epoch": 0.06382978723404255, - "grad_norm": 2.201627731323242, - "learning_rate": 2.095959595959596e-06, - "loss": 0.5012127161026001, - "mean_token_accuracy": 0.8432504534721375, - "num_tokens": 810817.0, - "step": 84 - }, - { - "epoch": 0.06458966565349544, - "grad_norm": 2.492568016052246, - "learning_rate": 2.1212121212121216e-06, - "loss": 0.6142797470092773, - "mean_token_accuracy": 0.8338661193847656, - "num_tokens": 818191.0, - "step": 85 - }, - { - "epoch": 0.06534954407294832, - "grad_norm": 2.8360862731933594, - "learning_rate": 2.1464646464646467e-06, - "loss": 0.5569300651550293, - "mean_token_accuracy": 0.8121030330657959, - "num_tokens": 825325.0, - "step": 86 - }, - { - "epoch": 0.06610942249240122, - "grad_norm": 2.407548427581787, - "learning_rate": 2.171717171717172e-06, - "loss": 0.6442930102348328, - "mean_token_accuracy": 0.792514443397522, - "num_tokens": 834439.0, - "step": 87 - }, - { - "epoch": 0.0668693009118541, - "grad_norm": 2.340728759765625, - "learning_rate": 2.196969696969697e-06, - "loss": 0.6494365930557251, - "mean_token_accuracy": 0.7746615409851074, - "num_tokens": 843078.0, - "step": 88 - }, - { - "epoch": 0.067629179331307, - "grad_norm": 1.7703697681427002, - "learning_rate": 2.222222222222222e-06, - "loss": 0.598991870880127, - "mean_token_accuracy": 0.7992157340049744, - "num_tokens": 860171.0, - "step": 89 - }, - { - "epoch": 0.06838905775075987, - "grad_norm": 2.5779271125793457, - "learning_rate": 2.2474747474747476e-06, - "loss": 0.5693082809448242, - "mean_token_accuracy": 0.8093700408935547, - "num_tokens": 866669.0, - "step": 90 - }, - { - "epoch": 0.06914893617021277, - "grad_norm": 2.014092206954956, - "learning_rate": 2.2727272727272728e-06, - "loss": 0.5346695780754089, - "mean_token_accuracy": 0.8165590763092041, - "num_tokens": 876698.0, - "step": 91 - }, - { - "epoch": 0.06990881458966565, - "grad_norm": 1.7555919885635376, - "learning_rate": 2.2979797979797983e-06, - "loss": 0.5321458578109741, - "mean_token_accuracy": 0.8166656494140625, - "num_tokens": 889488.0, - "step": 92 - }, - { - "epoch": 0.07066869300911854, - "grad_norm": 1.8631824254989624, - "learning_rate": 2.3232323232323234e-06, - "loss": 0.5246532559394836, - "mean_token_accuracy": 0.8088107705116272, - "num_tokens": 901322.0, - "step": 93 - }, - { - "epoch": 0.07142857142857142, - "grad_norm": 3.2332139015197754, - "learning_rate": 2.348484848484849e-06, - "loss": 0.5141711235046387, - "mean_token_accuracy": 0.8382217884063721, - "num_tokens": 905792.0, - "step": 94 - }, - { - "epoch": 0.07218844984802432, - "grad_norm": 1.7806555032730103, - "learning_rate": 2.373737373737374e-06, - "loss": 0.5233149528503418, - "mean_token_accuracy": 0.8101529479026794, - "num_tokens": 917320.0, - "step": 95 - }, - { - "epoch": 0.0729483282674772, - "grad_norm": 1.8169859647750854, - "learning_rate": 2.3989898989898993e-06, - "loss": 0.578881561756134, - "mean_token_accuracy": 0.8044873476028442, - "num_tokens": 931062.0, - "step": 96 - }, - { - "epoch": 0.0737082066869301, - "grad_norm": 4.677402496337891, - "learning_rate": 2.4242424242424244e-06, - "loss": 0.7842556238174438, - "mean_token_accuracy": 0.7579764127731323, - "num_tokens": 934712.0, - "step": 97 - }, - { - "epoch": 0.07446808510638298, - "grad_norm": 2.6987264156341553, - "learning_rate": 2.4494949494949495e-06, - "loss": 0.5669287443161011, - "mean_token_accuracy": 0.8186933994293213, - "num_tokens": 941058.0, - "step": 98 - }, - { - "epoch": 0.07522796352583587, - "grad_norm": 1.6906023025512695, - "learning_rate": 2.474747474747475e-06, - "loss": 0.4976363778114319, - "mean_token_accuracy": 0.8198553323745728, - "num_tokens": 956509.0, - "step": 99 - }, - { - "epoch": 0.07598784194528875, - "grad_norm": 2.7256152629852295, - "learning_rate": 2.5e-06, - "loss": 0.7138420343399048, - "mean_token_accuracy": 0.7752805948257446, - "num_tokens": 963920.0, - "step": 100 - }, - { - "epoch": 0.07674772036474165, - "grad_norm": 2.174870491027832, - "learning_rate": 2.5252525252525258e-06, - "loss": 0.6733541488647461, - "mean_token_accuracy": 0.7745175361633301, - "num_tokens": 975268.0, - "step": 101 - }, - { - "epoch": 0.07750759878419453, - "grad_norm": 1.5587213039398193, - "learning_rate": 2.5505050505050505e-06, - "loss": 0.44223445653915405, - "mean_token_accuracy": 0.8278359174728394, - "num_tokens": 991837.0, - "step": 102 - }, - { - "epoch": 0.07826747720364742, - "grad_norm": 2.181840658187866, - "learning_rate": 2.575757575757576e-06, - "loss": 0.625128448009491, - "mean_token_accuracy": 0.7941786050796509, - "num_tokens": 1004325.0, - "step": 103 - }, - { - "epoch": 0.0790273556231003, - "grad_norm": 1.4986687898635864, - "learning_rate": 2.601010101010101e-06, - "loss": 0.39262527227401733, - "mean_token_accuracy": 0.8412648439407349, - "num_tokens": 1018331.0, - "step": 104 - }, - { - "epoch": 0.0797872340425532, - "grad_norm": 2.3416061401367188, - "learning_rate": 2.6262626262626267e-06, - "loss": 0.5495132803916931, - "mean_token_accuracy": 0.8193322420120239, - "num_tokens": 1026090.0, - "step": 105 - }, - { - "epoch": 0.08054711246200608, - "grad_norm": 3.8168859481811523, - "learning_rate": 2.6515151515151514e-06, - "loss": 0.4898706376552582, - "mean_token_accuracy": 0.8467956185340881, - "num_tokens": 1029955.0, - "step": 106 - }, - { - "epoch": 0.08130699088145897, - "grad_norm": 4.113908767700195, - "learning_rate": 2.676767676767677e-06, - "loss": 0.6189584732055664, - "mean_token_accuracy": 0.8019394278526306, - "num_tokens": 1033598.0, - "step": 107 - }, - { - "epoch": 0.08206686930091185, - "grad_norm": 2.50003981590271, - "learning_rate": 2.7020202020202025e-06, - "loss": 0.6479471921920776, - "mean_token_accuracy": 0.7790026664733887, - "num_tokens": 1042533.0, - "step": 108 - }, - { - "epoch": 0.08282674772036475, - "grad_norm": 1.408934473991394, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.3909248113632202, - "mean_token_accuracy": 0.8477586507797241, - "num_tokens": 1061755.0, - "step": 109 - }, - { - "epoch": 0.08358662613981763, - "grad_norm": 3.360633611679077, - "learning_rate": 2.7525252525252528e-06, - "loss": 0.6952459812164307, - "mean_token_accuracy": 0.777535080909729, - "num_tokens": 1067316.0, - "step": 110 - }, - { - "epoch": 0.08434650455927052, - "grad_norm": 1.8631696701049805, - "learning_rate": 2.7777777777777783e-06, - "loss": 0.5420593023300171, - "mean_token_accuracy": 0.8157662749290466, - "num_tokens": 1079930.0, - "step": 111 - }, - { - "epoch": 0.0851063829787234, - "grad_norm": 2.4308314323425293, - "learning_rate": 2.803030303030303e-06, - "loss": 0.5863882303237915, - "mean_token_accuracy": 0.8206346035003662, - "num_tokens": 1088069.0, - "step": 112 - }, - { - "epoch": 0.0858662613981763, - "grad_norm": 2.922808885574341, - "learning_rate": 2.8282828282828286e-06, - "loss": 0.5217319130897522, - "mean_token_accuracy": 0.8253234028816223, - "num_tokens": 1093607.0, - "step": 113 - }, - { - "epoch": 0.08662613981762918, - "grad_norm": 2.3596107959747314, - "learning_rate": 2.8535353535353537e-06, - "loss": 0.5070714950561523, - "mean_token_accuracy": 0.8258323669433594, - "num_tokens": 1100405.0, - "step": 114 - }, - { - "epoch": 0.08738601823708207, - "grad_norm": 3.0853066444396973, - "learning_rate": 2.8787878787878793e-06, - "loss": 0.591964840888977, - "mean_token_accuracy": 0.8047322630882263, - "num_tokens": 1107535.0, - "step": 115 - }, - { - "epoch": 0.08814589665653495, - "grad_norm": 1.9251092672348022, - "learning_rate": 2.904040404040404e-06, - "loss": 0.5226191878318787, - "mean_token_accuracy": 0.8022720217704773, - "num_tokens": 1118716.0, - "step": 116 - }, - { - "epoch": 0.08890577507598785, - "grad_norm": 1.9692988395690918, - "learning_rate": 2.9292929292929295e-06, - "loss": 0.5462069511413574, - "mean_token_accuracy": 0.8157015442848206, - "num_tokens": 1131917.0, - "step": 117 - }, - { - "epoch": 0.08966565349544073, - "grad_norm": 1.4738909006118774, - "learning_rate": 2.954545454545455e-06, - "loss": 0.4564219117164612, - "mean_token_accuracy": 0.849632978439331, - "num_tokens": 1148534.0, - "step": 118 - }, - { - "epoch": 0.09042553191489362, - "grad_norm": 2.72646164894104, - "learning_rate": 2.97979797979798e-06, - "loss": 0.6654808521270752, - "mean_token_accuracy": 0.7752684354782104, - "num_tokens": 1155438.0, - "step": 119 - }, - { - "epoch": 0.0911854103343465, - "grad_norm": 2.7843852043151855, - "learning_rate": 3.0050505050505054e-06, - "loss": 0.5354680418968201, - "mean_token_accuracy": 0.8196378946304321, - "num_tokens": 1161815.0, - "step": 120 - }, - { - "epoch": 0.0919452887537994, - "grad_norm": 2.8052573204040527, - "learning_rate": 3.0303030303030305e-06, - "loss": 0.6366757154464722, - "mean_token_accuracy": 0.7967483997344971, - "num_tokens": 1168295.0, - "step": 121 - }, - { - "epoch": 0.09270516717325228, - "grad_norm": 2.7462735176086426, - "learning_rate": 3.055555555555556e-06, - "loss": 0.59470534324646, - "mean_token_accuracy": 0.8023771047592163, - "num_tokens": 1174502.0, - "step": 122 - }, - { - "epoch": 0.09346504559270517, - "grad_norm": 2.2743821144104004, - "learning_rate": 3.0808080808080807e-06, - "loss": 0.5720560550689697, - "mean_token_accuracy": 0.8162771463394165, - "num_tokens": 1183615.0, - "step": 123 - }, - { - "epoch": 0.09422492401215805, - "grad_norm": 1.8669533729553223, - "learning_rate": 3.1060606060606063e-06, - "loss": 0.4655378758907318, - "mean_token_accuracy": 0.8360732793807983, - "num_tokens": 1193761.0, - "step": 124 - }, - { - "epoch": 0.09498480243161095, - "grad_norm": 1.7666901350021362, - "learning_rate": 3.131313131313132e-06, - "loss": 0.5524153709411621, - "mean_token_accuracy": 0.8252713680267334, - "num_tokens": 1207870.0, - "step": 125 - }, - { - "epoch": 0.09574468085106383, - "grad_norm": 2.4720070362091064, - "learning_rate": 3.1565656565656566e-06, - "loss": 0.5003011226654053, - "mean_token_accuracy": 0.8491042852401733, - "num_tokens": 1214603.0, - "step": 126 - }, - { - "epoch": 0.09650455927051672, - "grad_norm": 1.6500422954559326, - "learning_rate": 3.181818181818182e-06, - "loss": 0.5137069225311279, - "mean_token_accuracy": 0.8273531198501587, - "num_tokens": 1228717.0, - "step": 127 - }, - { - "epoch": 0.0972644376899696, - "grad_norm": 3.402543067932129, - "learning_rate": 3.2070707070707072e-06, - "loss": 0.708167552947998, - "mean_token_accuracy": 0.7705385684967041, - "num_tokens": 1234361.0, - "step": 128 - }, - { - "epoch": 0.0980243161094225, - "grad_norm": 2.547285795211792, - "learning_rate": 3.232323232323233e-06, - "loss": 0.6020137071609497, - "mean_token_accuracy": 0.7981340289115906, - "num_tokens": 1244169.0, - "step": 129 - }, - { - "epoch": 0.09878419452887538, - "grad_norm": 2.0578792095184326, - "learning_rate": 3.257575757575758e-06, - "loss": 0.4425000250339508, - "mean_token_accuracy": 0.8567807674407959, - "num_tokens": 1252709.0, - "step": 130 - }, - { - "epoch": 0.09954407294832827, - "grad_norm": 1.672614336013794, - "learning_rate": 3.282828282828283e-06, - "loss": 0.4860966205596924, - "mean_token_accuracy": 0.8393139243125916, - "num_tokens": 1265766.0, - "step": 131 - }, - { - "epoch": 0.10030395136778116, - "grad_norm": 3.2560198307037354, - "learning_rate": 3.3080808080808086e-06, - "loss": 0.624736487865448, - "mean_token_accuracy": 0.7875322699546814, - "num_tokens": 1270779.0, - "step": 132 - }, - { - "epoch": 0.10106382978723404, - "grad_norm": 2.4468185901641846, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.5062227249145508, - "mean_token_accuracy": 0.8217229843139648, - "num_tokens": 1277113.0, - "step": 133 - }, - { - "epoch": 0.10182370820668693, - "grad_norm": 2.6371328830718994, - "learning_rate": 3.358585858585859e-06, - "loss": 0.477113276720047, - "mean_token_accuracy": 0.8605583906173706, - "num_tokens": 1282514.0, - "step": 134 - }, - { - "epoch": 0.10258358662613981, - "grad_norm": 2.48421311378479, - "learning_rate": 3.3838383838383844e-06, - "loss": 0.40855684876441956, - "mean_token_accuracy": 0.864548921585083, - "num_tokens": 1287859.0, - "step": 135 - }, - { - "epoch": 0.1033434650455927, - "grad_norm": 1.993099331855774, - "learning_rate": 3.409090909090909e-06, - "loss": 0.5913145542144775, - "mean_token_accuracy": 0.8248485922813416, - "num_tokens": 1301074.0, - "step": 136 - }, - { - "epoch": 0.10410334346504559, - "grad_norm": 3.5947680473327637, - "learning_rate": 3.4343434343434347e-06, - "loss": 0.5028599500656128, - "mean_token_accuracy": 0.8367215394973755, - "num_tokens": 1305219.0, - "step": 137 - }, - { - "epoch": 0.10486322188449848, - "grad_norm": 2.5778582096099854, - "learning_rate": 3.45959595959596e-06, - "loss": 0.5297672748565674, - "mean_token_accuracy": 0.8232187032699585, - "num_tokens": 1312482.0, - "step": 138 - }, - { - "epoch": 0.10562310030395136, - "grad_norm": 1.8961588144302368, - "learning_rate": 3.4848484848484854e-06, - "loss": 0.39954107999801636, - "mean_token_accuracy": 0.8605833053588867, - "num_tokens": 1323404.0, - "step": 139 - }, - { - "epoch": 0.10638297872340426, - "grad_norm": 1.9687960147857666, - "learning_rate": 3.51010101010101e-06, - "loss": 0.48791587352752686, - "mean_token_accuracy": 0.8200347423553467, - "num_tokens": 1333027.0, - "step": 140 - }, - { - "epoch": 0.10714285714285714, - "grad_norm": 2.520242691040039, - "learning_rate": 3.5353535353535356e-06, - "loss": 0.6106002330780029, - "mean_token_accuracy": 0.790692150592804, - "num_tokens": 1340999.0, - "step": 141 - }, - { - "epoch": 0.10790273556231003, - "grad_norm": 3.751617431640625, - "learning_rate": 3.560606060606061e-06, - "loss": 0.48141729831695557, - "mean_token_accuracy": 0.8421382904052734, - "num_tokens": 1344687.0, - "step": 142 - }, - { - "epoch": 0.10866261398176291, - "grad_norm": 2.7101709842681885, - "learning_rate": 3.585858585858586e-06, - "loss": 0.5375241637229919, - "mean_token_accuracy": 0.8061438202857971, - "num_tokens": 1350192.0, - "step": 143 - }, - { - "epoch": 0.1094224924012158, - "grad_norm": 2.583484411239624, - "learning_rate": 3.6111111111111115e-06, - "loss": 0.6492470502853394, - "mean_token_accuracy": 0.7863001823425293, - "num_tokens": 1358148.0, - "step": 144 - }, - { - "epoch": 0.11018237082066869, - "grad_norm": 1.792561650276184, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.48480600118637085, - "mean_token_accuracy": 0.8358709812164307, - "num_tokens": 1369519.0, - "step": 145 - }, - { - "epoch": 0.11094224924012158, - "grad_norm": 2.6480472087860107, - "learning_rate": 3.661616161616162e-06, - "loss": 0.5268933176994324, - "mean_token_accuracy": 0.8214013576507568, - "num_tokens": 1375862.0, - "step": 146 - }, - { - "epoch": 0.11170212765957446, - "grad_norm": 2.3174469470977783, - "learning_rate": 3.686868686868687e-06, - "loss": 0.42517897486686707, - "mean_token_accuracy": 0.8523461222648621, - "num_tokens": 1381546.0, - "step": 147 - }, - { - "epoch": 0.11246200607902736, - "grad_norm": 3.0090949535369873, - "learning_rate": 3.7121212121212124e-06, - "loss": 0.4042336940765381, - "mean_token_accuracy": 0.8670448064804077, - "num_tokens": 1385896.0, - "step": 148 - }, - { - "epoch": 0.11322188449848024, - "grad_norm": 2.4928104877471924, - "learning_rate": 3.737373737373738e-06, - "loss": 0.6498878598213196, - "mean_token_accuracy": 0.7967068552970886, - "num_tokens": 1394169.0, - "step": 149 - }, - { - "epoch": 0.11398176291793313, - "grad_norm": 1.5984913110733032, - "learning_rate": 3.7626262626262627e-06, - "loss": 0.546096920967102, - "mean_token_accuracy": 0.8035850524902344, - "num_tokens": 1408785.0, - "step": 150 - }, - { - "epoch": 0.11474164133738601, - "grad_norm": 2.3663532733917236, - "learning_rate": 3.7878787878787882e-06, - "loss": 0.6111721992492676, - "mean_token_accuracy": 0.8015355467796326, - "num_tokens": 1417510.0, - "step": 151 - }, - { - "epoch": 0.11550151975683891, - "grad_norm": 2.518932819366455, - "learning_rate": 3.8131313131313138e-06, - "loss": 0.5274964570999146, - "mean_token_accuracy": 0.8155480623245239, - "num_tokens": 1424186.0, - "step": 152 - }, - { - "epoch": 0.11626139817629179, - "grad_norm": 2.14353609085083, - "learning_rate": 3.8383838383838385e-06, - "loss": 0.5283297896385193, - "mean_token_accuracy": 0.8275758028030396, - "num_tokens": 1432630.0, - "step": 153 - }, - { - "epoch": 0.11702127659574468, - "grad_norm": 1.8243604898452759, - "learning_rate": 3.863636363636364e-06, - "loss": 0.41854870319366455, - "mean_token_accuracy": 0.8222295045852661, - "num_tokens": 1442691.0, - "step": 154 - }, - { - "epoch": 0.11778115501519756, - "grad_norm": 2.088212251663208, - "learning_rate": 3.88888888888889e-06, - "loss": 0.6062943339347839, - "mean_token_accuracy": 0.8009427785873413, - "num_tokens": 1456890.0, - "step": 155 - }, - { - "epoch": 0.11854103343465046, - "grad_norm": 1.3469511270523071, - "learning_rate": 3.914141414141415e-06, - "loss": 0.4390433728694916, - "mean_token_accuracy": 0.8436295986175537, - "num_tokens": 1475349.0, - "step": 156 - }, - { - "epoch": 0.11930091185410334, - "grad_norm": 3.247023105621338, - "learning_rate": 3.93939393939394e-06, - "loss": 0.6490433216094971, - "mean_token_accuracy": 0.8037861585617065, - "num_tokens": 1479952.0, - "step": 157 - }, - { - "epoch": 0.12006079027355623, - "grad_norm": 2.6610445976257324, - "learning_rate": 3.964646464646465e-06, - "loss": 0.6221826076507568, - "mean_token_accuracy": 0.7848749160766602, - "num_tokens": 1487306.0, - "step": 158 - }, - { - "epoch": 0.12082066869300911, - "grad_norm": 2.3060810565948486, - "learning_rate": 3.98989898989899e-06, - "loss": 0.5052388310432434, - "mean_token_accuracy": 0.8281195759773254, - "num_tokens": 1495367.0, - "step": 159 - }, - { - "epoch": 0.12158054711246201, - "grad_norm": 2.504448652267456, - "learning_rate": 4.015151515151515e-06, - "loss": 0.5005477666854858, - "mean_token_accuracy": 0.8408058881759644, - "num_tokens": 1502069.0, - "step": 160 - }, - { - "epoch": 0.12234042553191489, - "grad_norm": 3.993938446044922, - "learning_rate": 4.04040404040404e-06, - "loss": 0.5569638013839722, - "mean_token_accuracy": 0.8095242977142334, - "num_tokens": 1510224.0, - "step": 161 - }, - { - "epoch": 0.12310030395136778, - "grad_norm": 2.2287683486938477, - "learning_rate": 4.065656565656566e-06, - "loss": 0.524042546749115, - "mean_token_accuracy": 0.8102203607559204, - "num_tokens": 1518364.0, - "step": 162 - }, - { - "epoch": 0.12386018237082067, - "grad_norm": 1.9531738758087158, - "learning_rate": 4.0909090909090915e-06, - "loss": 0.45794573426246643, - "mean_token_accuracy": 0.8560376167297363, - "num_tokens": 1528097.0, - "step": 163 - }, - { - "epoch": 0.12462006079027356, - "grad_norm": 1.5841206312179565, - "learning_rate": 4.116161616161617e-06, - "loss": 0.5420972108840942, - "mean_token_accuracy": 0.8092726469039917, - "num_tokens": 1544119.0, - "step": 164 - }, - { - "epoch": 0.12537993920972645, - "grad_norm": 1.7536218166351318, - "learning_rate": 4.141414141414142e-06, - "loss": 0.554668664932251, - "mean_token_accuracy": 0.8193825483322144, - "num_tokens": 1559140.0, - "step": 165 - }, - { - "epoch": 0.12613981762917933, - "grad_norm": 3.545454740524292, - "learning_rate": 4.166666666666667e-06, - "loss": 0.580947995185852, - "mean_token_accuracy": 0.8286383152008057, - "num_tokens": 1563625.0, - "step": 166 - }, - { - "epoch": 0.12689969604863222, - "grad_norm": 1.6608915328979492, - "learning_rate": 4.191919191919192e-06, - "loss": 0.5523324012756348, - "mean_token_accuracy": 0.8155215978622437, - "num_tokens": 1574945.0, - "step": 167 - }, - { - "epoch": 0.1276595744680851, - "grad_norm": 1.4832708835601807, - "learning_rate": 4.217171717171717e-06, - "loss": 0.5133191347122192, - "mean_token_accuracy": 0.8367571830749512, - "num_tokens": 1595865.0, - "step": 168 - }, - { - "epoch": 0.128419452887538, - "grad_norm": 1.7807520627975464, - "learning_rate": 4.242424242424243e-06, - "loss": 0.5131410360336304, - "mean_token_accuracy": 0.8129367232322693, - "num_tokens": 1608723.0, - "step": 169 - }, - { - "epoch": 0.12917933130699089, - "grad_norm": 2.707569122314453, - "learning_rate": 4.267676767676767e-06, - "loss": 0.6129013299942017, - "mean_token_accuracy": 0.7926048040390015, - "num_tokens": 1616136.0, - "step": 170 - }, - { - "epoch": 0.12993920972644377, - "grad_norm": 2.5831644535064697, - "learning_rate": 4.292929292929293e-06, - "loss": 0.6264227628707886, - "mean_token_accuracy": 0.8074911236763, - "num_tokens": 1624228.0, - "step": 171 - }, - { - "epoch": 0.13069908814589665, - "grad_norm": 3.1124250888824463, - "learning_rate": 4.3181818181818185e-06, - "loss": 0.41763827204704285, - "mean_token_accuracy": 0.8565453290939331, - "num_tokens": 1628098.0, - "step": 172 - }, - { - "epoch": 0.13145896656534956, - "grad_norm": 2.3214211463928223, - "learning_rate": 4.343434343434344e-06, - "loss": 0.421974778175354, - "mean_token_accuracy": 0.8391546010971069, - "num_tokens": 1634950.0, - "step": 173 - }, - { - "epoch": 0.13221884498480244, - "grad_norm": 2.1010327339172363, - "learning_rate": 4.368686868686869e-06, - "loss": 0.5307331681251526, - "mean_token_accuracy": 0.8139588236808777, - "num_tokens": 1644132.0, - "step": 174 - }, - { - "epoch": 0.13297872340425532, - "grad_norm": 2.533612012863159, - "learning_rate": 4.393939393939394e-06, - "loss": 0.5626664161682129, - "mean_token_accuracy": 0.8029808402061462, - "num_tokens": 1651637.0, - "step": 175 - }, - { - "epoch": 0.1337386018237082, - "grad_norm": 1.669508457183838, - "learning_rate": 4.41919191919192e-06, - "loss": 0.5351508259773254, - "mean_token_accuracy": 0.8281655311584473, - "num_tokens": 1666776.0, - "step": 176 - }, - { - "epoch": 0.1344984802431611, - "grad_norm": 1.7579659223556519, - "learning_rate": 4.444444444444444e-06, - "loss": 0.5235031247138977, - "mean_token_accuracy": 0.8143284320831299, - "num_tokens": 1679241.0, - "step": 177 - }, - { - "epoch": 0.135258358662614, - "grad_norm": 3.123563528060913, - "learning_rate": 4.46969696969697e-06, - "loss": 0.43051332235336304, - "mean_token_accuracy": 0.8518186211585999, - "num_tokens": 1683317.0, - "step": 178 - }, - { - "epoch": 0.13601823708206687, - "grad_norm": 2.2411575317382812, - "learning_rate": 4.494949494949495e-06, - "loss": 0.5471380949020386, - "mean_token_accuracy": 0.8267596960067749, - "num_tokens": 1691366.0, - "step": 179 - }, - { - "epoch": 0.13677811550151975, - "grad_norm": 2.621973991394043, - "learning_rate": 4.520202020202021e-06, - "loss": 0.5685839653015137, - "mean_token_accuracy": 0.8260642290115356, - "num_tokens": 1698148.0, - "step": 180 - }, - { - "epoch": 0.13753799392097266, - "grad_norm": 2.1553852558135986, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.5703883171081543, - "mean_token_accuracy": 0.8219090700149536, - "num_tokens": 1707225.0, - "step": 181 - }, - { - "epoch": 0.13829787234042554, - "grad_norm": 5.1767897605896, - "learning_rate": 4.5707070707070715e-06, - "loss": 0.32704639434814453, - "mean_token_accuracy": 0.8754568099975586, - "num_tokens": 1712748.0, - "step": 182 - }, - { - "epoch": 0.13905775075987842, - "grad_norm": 2.609168291091919, - "learning_rate": 4.595959595959597e-06, - "loss": 0.5939987301826477, - "mean_token_accuracy": 0.8034975528717041, - "num_tokens": 1719932.0, - "step": 183 - }, - { - "epoch": 0.1398176291793313, - "grad_norm": 2.2059099674224854, - "learning_rate": 4.621212121212122e-06, - "loss": 0.5310720205307007, - "mean_token_accuracy": 0.8177368640899658, - "num_tokens": 1727640.0, - "step": 184 - }, - { - "epoch": 0.1405775075987842, - "grad_norm": 2.6367759704589844, - "learning_rate": 4.646464646464647e-06, - "loss": 0.522086501121521, - "mean_token_accuracy": 0.826233983039856, - "num_tokens": 1733609.0, - "step": 185 - }, - { - "epoch": 0.1413373860182371, - "grad_norm": 3.326732873916626, - "learning_rate": 4.671717171717172e-06, - "loss": 0.4127829074859619, - "mean_token_accuracy": 0.8551101684570312, - "num_tokens": 1737256.0, - "step": 186 - }, - { - "epoch": 0.14209726443768997, - "grad_norm": 1.828412413597107, - "learning_rate": 4.696969696969698e-06, - "loss": 0.5444269180297852, - "mean_token_accuracy": 0.8350818157196045, - "num_tokens": 1750196.0, - "step": 187 - }, - { - "epoch": 0.14285714285714285, - "grad_norm": 3.209203004837036, - "learning_rate": 4.722222222222222e-06, - "loss": 0.5087994933128357, - "mean_token_accuracy": 0.8349015712738037, - "num_tokens": 1754836.0, - "step": 188 - }, - { - "epoch": 0.14361702127659576, - "grad_norm": 1.7339166402816772, - "learning_rate": 4.747474747474748e-06, - "loss": 0.5151352286338806, - "mean_token_accuracy": 0.8321266174316406, - "num_tokens": 1766015.0, - "step": 189 - }, - { - "epoch": 0.14437689969604864, - "grad_norm": 2.699068069458008, - "learning_rate": 4.772727272727273e-06, - "loss": 0.4406203031539917, - "mean_token_accuracy": 0.8425000905990601, - "num_tokens": 1771684.0, - "step": 190 - }, - { - "epoch": 0.14513677811550152, - "grad_norm": 2.8117282390594482, - "learning_rate": 4.7979797979797985e-06, - "loss": 0.40428489446640015, - "mean_token_accuracy": 0.8654326796531677, - "num_tokens": 1776301.0, - "step": 191 - }, - { - "epoch": 0.1458966565349544, - "grad_norm": 2.9204647541046143, - "learning_rate": 4.823232323232324e-06, - "loss": 0.4191770553588867, - "mean_token_accuracy": 0.8574687242507935, - "num_tokens": 1781678.0, - "step": 192 - }, - { - "epoch": 0.1466565349544073, - "grad_norm": 2.1648988723754883, - "learning_rate": 4.848484848484849e-06, - "loss": 0.5839012861251831, - "mean_token_accuracy": 0.8053664565086365, - "num_tokens": 1792516.0, - "step": 193 - }, - { - "epoch": 0.1474164133738602, - "grad_norm": 2.3221631050109863, - "learning_rate": 4.873737373737374e-06, - "loss": 0.5037894248962402, - "mean_token_accuracy": 0.8427227139472961, - "num_tokens": 1800192.0, - "step": 194 - }, - { - "epoch": 0.14817629179331307, - "grad_norm": 2.4536430835723877, - "learning_rate": 4.898989898989899e-06, - "loss": 0.42326074838638306, - "mean_token_accuracy": 0.8510633111000061, - "num_tokens": 1806159.0, - "step": 195 - }, - { - "epoch": 0.14893617021276595, - "grad_norm": 2.4875805377960205, - "learning_rate": 4.924242424242425e-06, - "loss": 0.539531409740448, - "mean_token_accuracy": 0.8060250282287598, - "num_tokens": 1813392.0, - "step": 196 - }, - { - "epoch": 0.14969604863221886, - "grad_norm": 2.1664798259735107, - "learning_rate": 4.94949494949495e-06, - "loss": 0.42502015829086304, - "mean_token_accuracy": 0.8503251075744629, - "num_tokens": 1821424.0, - "step": 197 - }, - { - "epoch": 0.15045592705167174, - "grad_norm": 2.568808078765869, - "learning_rate": 4.974747474747475e-06, - "loss": 0.5025098323822021, - "mean_token_accuracy": 0.8182311058044434, - "num_tokens": 1827225.0, - "step": 198 - }, - { - "epoch": 0.15121580547112462, - "grad_norm": 1.9116802215576172, - "learning_rate": 5e-06, - "loss": 0.4907258450984955, - "mean_token_accuracy": 0.8310189843177795, - "num_tokens": 1836297.0, - "step": 199 - }, - { - "epoch": 0.1519756838905775, - "grad_norm": 3.150765895843506, - "learning_rate": 4.999999122701883e-06, - "loss": 0.390616774559021, - "mean_token_accuracy": 0.8626647591590881, - "num_tokens": 1839984.0, - "step": 200 - }, - { - "epoch": 0.15273556231003038, - "grad_norm": 3.2229044437408447, - "learning_rate": 4.999996490808146e-06, - "loss": 0.48009657859802246, - "mean_token_accuracy": 0.825214147567749, - "num_tokens": 1844610.0, - "step": 201 - }, - { - "epoch": 0.1534954407294833, - "grad_norm": 1.4473289251327515, - "learning_rate": 4.9999921043206356e-06, - "loss": 0.40135183930397034, - "mean_token_accuracy": 0.8537827730178833, - "num_tokens": 1859573.0, - "step": 202 - }, - { - "epoch": 0.15425531914893617, - "grad_norm": 4.072319507598877, - "learning_rate": 4.999985963242432e-06, - "loss": 0.6158689260482788, - "mean_token_accuracy": 0.8075432777404785, - "num_tokens": 1863147.0, - "step": 203 - }, - { - "epoch": 0.15501519756838905, - "grad_norm": 3.15741229057312, - "learning_rate": 4.999978067577844e-06, - "loss": 0.4603108763694763, - "mean_token_accuracy": 0.8418779373168945, - "num_tokens": 1867201.0, - "step": 204 - }, - { - "epoch": 0.15577507598784193, - "grad_norm": 2.1925418376922607, - "learning_rate": 4.999968417332415e-06, - "loss": 0.5552488565444946, - "mean_token_accuracy": 0.8216016292572021, - "num_tokens": 1874837.0, - "step": 205 - }, - { - "epoch": 0.15653495440729484, - "grad_norm": 2.2518117427825928, - "learning_rate": 4.999957012512916e-06, - "loss": 0.4912569522857666, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 1881842.0, - "step": 206 - }, - { - "epoch": 0.15729483282674772, - "grad_norm": 1.8223762512207031, - "learning_rate": 4.999943853127351e-06, - "loss": 0.47709137201309204, - "mean_token_accuracy": 0.8311659097671509, - "num_tokens": 1890805.0, - "step": 207 - }, - { - "epoch": 0.1580547112462006, - "grad_norm": 2.066499948501587, - "learning_rate": 4.999928939184958e-06, - "loss": 0.44794657826423645, - "mean_token_accuracy": 0.8513424396514893, - "num_tokens": 1898264.0, - "step": 208 - }, - { - "epoch": 0.15881458966565348, - "grad_norm": 3.53865909576416, - "learning_rate": 4.999912270696202e-06, - "loss": 0.5978270769119263, - "mean_token_accuracy": 0.8080137968063354, - "num_tokens": 1902435.0, - "step": 209 - }, - { - "epoch": 0.1595744680851064, - "grad_norm": 2.0760679244995117, - "learning_rate": 4.999893847672783e-06, - "loss": 0.5930601358413696, - "mean_token_accuracy": 0.8028650283813477, - "num_tokens": 1912252.0, - "step": 210 - }, - { - "epoch": 0.16033434650455927, - "grad_norm": 2.21551513671875, - "learning_rate": 4.99987367012763e-06, - "loss": 0.6336753964424133, - "mean_token_accuracy": 0.7902286648750305, - "num_tokens": 1922095.0, - "step": 211 - }, - { - "epoch": 0.16109422492401215, - "grad_norm": 1.7654480934143066, - "learning_rate": 4.999851738074904e-06, - "loss": 0.6373403668403625, - "mean_token_accuracy": 0.7802424430847168, - "num_tokens": 1938962.0, - "step": 212 - }, - { - "epoch": 0.16185410334346503, - "grad_norm": 2.852834701538086, - "learning_rate": 4.9998280515300006e-06, - "loss": 0.6418683528900146, - "mean_token_accuracy": 0.7895716428756714, - "num_tokens": 1944668.0, - "step": 213 - }, - { - "epoch": 0.16261398176291794, - "grad_norm": 3.4737212657928467, - "learning_rate": 4.999802610509541e-06, - "loss": 0.6323273181915283, - "mean_token_accuracy": 0.7982614636421204, - "num_tokens": 1949142.0, - "step": 214 - }, - { - "epoch": 0.16337386018237082, - "grad_norm": 3.0802664756774902, - "learning_rate": 4.999775415031381e-06, - "loss": 0.5929068326950073, - "mean_token_accuracy": 0.8112219572067261, - "num_tokens": 1954141.0, - "step": 215 - }, - { - "epoch": 0.1641337386018237, - "grad_norm": 2.9808855056762695, - "learning_rate": 4.999746465114609e-06, - "loss": 0.5556406378746033, - "mean_token_accuracy": 0.8117628693580627, - "num_tokens": 1959406.0, - "step": 216 - }, - { - "epoch": 0.16489361702127658, - "grad_norm": 1.7346166372299194, - "learning_rate": 4.999715760779541e-06, - "loss": 0.5122925043106079, - "mean_token_accuracy": 0.8040724992752075, - "num_tokens": 1971921.0, - "step": 217 - }, - { - "epoch": 0.1656534954407295, - "grad_norm": 1.4183907508850098, - "learning_rate": 4.999683302047729e-06, - "loss": 0.46471893787384033, - "mean_token_accuracy": 0.8381330966949463, - "num_tokens": 1988863.0, - "step": 218 - }, - { - "epoch": 0.16641337386018237, - "grad_norm": 1.6797802448272705, - "learning_rate": 4.999649088941951e-06, - "loss": 0.38348832726478577, - "mean_token_accuracy": 0.8344278931617737, - "num_tokens": 2000003.0, - "step": 219 - }, - { - "epoch": 0.16717325227963525, - "grad_norm": 3.036963939666748, - "learning_rate": 4.999613121486222e-06, - "loss": 0.6062780618667603, - "mean_token_accuracy": 0.8217900991439819, - "num_tokens": 2004813.0, - "step": 220 - }, - { - "epoch": 0.16793313069908813, - "grad_norm": 2.0343217849731445, - "learning_rate": 4.999575399705782e-06, - "loss": 0.5052450895309448, - "mean_token_accuracy": 0.8368623852729797, - "num_tokens": 2013565.0, - "step": 221 - }, - { - "epoch": 0.16869300911854104, - "grad_norm": 2.1162009239196777, - "learning_rate": 4.9995359236271094e-06, - "loss": 0.5169756412506104, - "mean_token_accuracy": 0.8339958190917969, - "num_tokens": 2025763.0, - "step": 222 - }, - { - "epoch": 0.16945288753799392, - "grad_norm": 2.055333375930786, - "learning_rate": 4.9994946932779076e-06, - "loss": 0.6327048540115356, - "mean_token_accuracy": 0.8078711032867432, - "num_tokens": 2037005.0, - "step": 223 - }, - { - "epoch": 0.1702127659574468, - "grad_norm": 3.334620475769043, - "learning_rate": 4.999451708687114e-06, - "loss": 0.5688358545303345, - "mean_token_accuracy": 0.8015589714050293, - "num_tokens": 2041473.0, - "step": 224 - }, - { - "epoch": 0.17097264437689969, - "grad_norm": 2.3734676837921143, - "learning_rate": 4.999406969884897e-06, - "loss": 0.5673821568489075, - "mean_token_accuracy": 0.8054057359695435, - "num_tokens": 2049397.0, - "step": 225 - }, - { - "epoch": 0.1717325227963526, - "grad_norm": 1.807358980178833, - "learning_rate": 4.999360476902656e-06, - "loss": 0.4376158118247986, - "mean_token_accuracy": 0.8456039428710938, - "num_tokens": 2058721.0, - "step": 226 - }, - { - "epoch": 0.17249240121580547, - "grad_norm": 3.231638193130493, - "learning_rate": 4.999312229773022e-06, - "loss": 0.5592809915542603, - "mean_token_accuracy": 0.8170154094696045, - "num_tokens": 2063455.0, - "step": 227 - }, - { - "epoch": 0.17325227963525835, - "grad_norm": 2.2717151641845703, - "learning_rate": 4.999262228529855e-06, - "loss": 0.6144396066665649, - "mean_token_accuracy": 0.7948470115661621, - "num_tokens": 2071686.0, - "step": 228 - }, - { - "epoch": 0.17401215805471124, - "grad_norm": 1.4171342849731445, - "learning_rate": 4.99921047320825e-06, - "loss": 0.43680912256240845, - "mean_token_accuracy": 0.84850013256073, - "num_tokens": 2086999.0, - "step": 229 - }, - { - "epoch": 0.17477203647416414, - "grad_norm": 3.162736654281616, - "learning_rate": 4.99915696384453e-06, - "loss": 0.6025407910346985, - "mean_token_accuracy": 0.8042335510253906, - "num_tokens": 2092001.0, - "step": 230 - }, - { - "epoch": 0.17553191489361702, - "grad_norm": 1.8672804832458496, - "learning_rate": 4.99910170047625e-06, - "loss": 0.5843087434768677, - "mean_token_accuracy": 0.8016980886459351, - "num_tokens": 2103372.0, - "step": 231 - }, - { - "epoch": 0.1762917933130699, - "grad_norm": 2.967587471008301, - "learning_rate": 4.999044683142196e-06, - "loss": 0.5123642086982727, - "mean_token_accuracy": 0.8216149806976318, - "num_tokens": 2108008.0, - "step": 232 - }, - { - "epoch": 0.1770516717325228, - "grad_norm": 1.9651981592178345, - "learning_rate": 4.998985911882383e-06, - "loss": 0.5868178606033325, - "mean_token_accuracy": 0.7904198169708252, - "num_tokens": 2119009.0, - "step": 233 - }, - { - "epoch": 0.1778115501519757, - "grad_norm": 2.7785449028015137, - "learning_rate": 4.998925386738063e-06, - "loss": 0.5075510144233704, - "mean_token_accuracy": 0.8280210494995117, - "num_tokens": 2124915.0, - "step": 234 - }, - { - "epoch": 0.17857142857142858, - "grad_norm": 2.957470417022705, - "learning_rate": 4.998863107751711e-06, - "loss": 0.5351958274841309, - "mean_token_accuracy": 0.846825122833252, - "num_tokens": 2129905.0, - "step": 235 - }, - { - "epoch": 0.17933130699088146, - "grad_norm": 3.207671880722046, - "learning_rate": 4.99879907496704e-06, - "loss": 0.6209091544151306, - "mean_token_accuracy": 0.789960503578186, - "num_tokens": 2135027.0, - "step": 236 - }, - { - "epoch": 0.18009118541033434, - "grad_norm": 2.018953800201416, - "learning_rate": 4.998733288428987e-06, - "loss": 0.601510763168335, - "mean_token_accuracy": 0.8136930465698242, - "num_tokens": 2147016.0, - "step": 237 - }, - { - "epoch": 0.18085106382978725, - "grad_norm": 2.437281847000122, - "learning_rate": 4.998665748183727e-06, - "loss": 0.5813639163970947, - "mean_token_accuracy": 0.8116716146469116, - "num_tokens": 2155386.0, - "step": 238 - }, - { - "epoch": 0.18161094224924013, - "grad_norm": 1.5708180665969849, - "learning_rate": 4.998596454278661e-06, - "loss": 0.5252395272254944, - "mean_token_accuracy": 0.8193864822387695, - "num_tokens": 2170295.0, - "step": 239 - }, - { - "epoch": 0.182370820668693, - "grad_norm": 1.9921495914459229, - "learning_rate": 4.998525406762422e-06, - "loss": 0.5335029363632202, - "mean_token_accuracy": 0.8120872974395752, - "num_tokens": 2180012.0, - "step": 240 - }, - { - "epoch": 0.1831306990881459, - "grad_norm": 2.6562681198120117, - "learning_rate": 4.998452605684874e-06, - "loss": 0.48021435737609863, - "mean_token_accuracy": 0.8388714790344238, - "num_tokens": 2185607.0, - "step": 241 - }, - { - "epoch": 0.1838905775075988, - "grad_norm": 2.2535853385925293, - "learning_rate": 4.998378051097111e-06, - "loss": 0.5747300386428833, - "mean_token_accuracy": 0.8004639148712158, - "num_tokens": 2194105.0, - "step": 242 - }, - { - "epoch": 0.18465045592705168, - "grad_norm": 1.6151788234710693, - "learning_rate": 4.998301743051459e-06, - "loss": 0.6190565824508667, - "mean_token_accuracy": 0.7816627621650696, - "num_tokens": 2210629.0, - "step": 243 - }, - { - "epoch": 0.18541033434650456, - "grad_norm": 2.1088173389434814, - "learning_rate": 4.9982236816014735e-06, - "loss": 0.4715560972690582, - "mean_token_accuracy": 0.8485721349716187, - "num_tokens": 2218958.0, - "step": 244 - }, - { - "epoch": 0.18617021276595744, - "grad_norm": 2.6168735027313232, - "learning_rate": 4.998143866801941e-06, - "loss": 0.6077103018760681, - "mean_token_accuracy": 0.8057924509048462, - "num_tokens": 2226368.0, - "step": 245 - }, - { - "epoch": 0.18693009118541035, - "grad_norm": 2.5988616943359375, - "learning_rate": 4.99806229870888e-06, - "loss": 0.5021637678146362, - "mean_token_accuracy": 0.8361666202545166, - "num_tokens": 2232485.0, - "step": 246 - }, - { - "epoch": 0.18768996960486323, - "grad_norm": 2.015887498855591, - "learning_rate": 4.9979789773795365e-06, - "loss": 0.4309737980365753, - "mean_token_accuracy": 0.8508044481277466, - "num_tokens": 2240819.0, - "step": 247 - }, - { - "epoch": 0.1884498480243161, - "grad_norm": 2.3115265369415283, - "learning_rate": 4.997893902872389e-06, - "loss": 0.5776500701904297, - "mean_token_accuracy": 0.8079549074172974, - "num_tokens": 2249460.0, - "step": 248 - }, - { - "epoch": 0.189209726443769, - "grad_norm": 1.7387021780014038, - "learning_rate": 4.997807075247147e-06, - "loss": 0.430944561958313, - "mean_token_accuracy": 0.8483544588088989, - "num_tokens": 2259124.0, - "step": 249 - }, - { - "epoch": 0.1899696048632219, - "grad_norm": 1.6378381252288818, - "learning_rate": 4.997718494564747e-06, - "loss": 0.4123363792896271, - "mean_token_accuracy": 0.8557409644126892, - "num_tokens": 2269899.0, - "step": 250 - }, - { - "epoch": 0.19072948328267478, - "grad_norm": 1.336282730102539, - "learning_rate": 4.997628160887361e-06, - "loss": 0.502329409122467, - "mean_token_accuracy": 0.8186938166618347, - "num_tokens": 2292821.0, - "step": 251 - }, - { - "epoch": 0.19148936170212766, - "grad_norm": 3.3335583209991455, - "learning_rate": 4.997536074278388e-06, - "loss": 0.584446907043457, - "mean_token_accuracy": 0.8062717318534851, - "num_tokens": 2297175.0, - "step": 252 - }, - { - "epoch": 0.19224924012158054, - "grad_norm": 2.246727228164673, - "learning_rate": 4.9974422348024565e-06, - "loss": 0.5683060884475708, - "mean_token_accuracy": 0.8193703293800354, - "num_tokens": 2305456.0, - "step": 253 - }, - { - "epoch": 0.19300911854103345, - "grad_norm": 2.3520865440368652, - "learning_rate": 4.997346642525429e-06, - "loss": 0.4724946618080139, - "mean_token_accuracy": 0.8426719307899475, - "num_tokens": 2312241.0, - "step": 254 - }, - { - "epoch": 0.19376899696048633, - "grad_norm": 2.7115702629089355, - "learning_rate": 4.9972492975143936e-06, - "loss": 0.5019032955169678, - "mean_token_accuracy": 0.8253573179244995, - "num_tokens": 2318094.0, - "step": 255 - }, - { - "epoch": 0.1945288753799392, - "grad_norm": 1.705528974533081, - "learning_rate": 4.997150199837671e-06, - "loss": 0.45588475465774536, - "mean_token_accuracy": 0.836666464805603, - "num_tokens": 2329025.0, - "step": 256 - }, - { - "epoch": 0.1952887537993921, - "grad_norm": 2.161400318145752, - "learning_rate": 4.997049349564814e-06, - "loss": 0.5170183777809143, - "mean_token_accuracy": 0.8287534117698669, - "num_tokens": 2337448.0, - "step": 257 - }, - { - "epoch": 0.196048632218845, - "grad_norm": 2.629669189453125, - "learning_rate": 4.996946746766602e-06, - "loss": 0.44650501012802124, - "mean_token_accuracy": 0.850114107131958, - "num_tokens": 2343207.0, - "step": 258 - }, - { - "epoch": 0.19680851063829788, - "grad_norm": 1.6735503673553467, - "learning_rate": 4.996842391515045e-06, - "loss": 0.5247820019721985, - "mean_token_accuracy": 0.8285071849822998, - "num_tokens": 2356801.0, - "step": 259 - }, - { - "epoch": 0.19756838905775076, - "grad_norm": 1.2753115892410278, - "learning_rate": 4.996736283883382e-06, - "loss": 0.41870927810668945, - "mean_token_accuracy": 0.8448047637939453, - "num_tokens": 2377306.0, - "step": 260 - }, - { - "epoch": 0.19832826747720364, - "grad_norm": 2.6947314739227295, - "learning_rate": 4.9966284239460875e-06, - "loss": 0.5059205889701843, - "mean_token_accuracy": 0.8430814743041992, - "num_tokens": 2383352.0, - "step": 261 - }, - { - "epoch": 0.19908814589665655, - "grad_norm": 2.0509963035583496, - "learning_rate": 4.996518811778858e-06, - "loss": 0.4565388560295105, - "mean_token_accuracy": 0.8453130722045898, - "num_tokens": 2391149.0, - "step": 262 - }, - { - "epoch": 0.19984802431610943, - "grad_norm": 2.1856348514556885, - "learning_rate": 4.996407447458626e-06, - "loss": 0.531380832195282, - "mean_token_accuracy": 0.8387004137039185, - "num_tokens": 2399875.0, - "step": 263 - }, - { - "epoch": 0.2006079027355623, - "grad_norm": 2.7348573207855225, - "learning_rate": 4.99629433106355e-06, - "loss": 0.5242817401885986, - "mean_token_accuracy": 0.8177423477172852, - "num_tokens": 2406586.0, - "step": 264 - }, - { - "epoch": 0.2013677811550152, - "grad_norm": 1.76587975025177, - "learning_rate": 4.99617946267302e-06, - "loss": 0.49298471212387085, - "mean_token_accuracy": 0.8271149396896362, - "num_tokens": 2418683.0, - "step": 265 - }, - { - "epoch": 0.20212765957446807, - "grad_norm": 2.8129730224609375, - "learning_rate": 4.996062842367655e-06, - "loss": 0.46420302987098694, - "mean_token_accuracy": 0.8453244566917419, - "num_tokens": 2422929.0, - "step": 266 - }, - { - "epoch": 0.20288753799392098, - "grad_norm": 2.575744152069092, - "learning_rate": 4.9959444702293025e-06, - "loss": 0.43208545446395874, - "mean_token_accuracy": 0.8494843244552612, - "num_tokens": 2429567.0, - "step": 267 - }, - { - "epoch": 0.20364741641337386, - "grad_norm": 2.7586750984191895, - "learning_rate": 4.995824346341041e-06, - "loss": 0.4390473961830139, - "mean_token_accuracy": 0.8348895311355591, - "num_tokens": 2434700.0, - "step": 268 - }, - { - "epoch": 0.20440729483282674, - "grad_norm": 1.972145438194275, - "learning_rate": 4.99570247078718e-06, - "loss": 0.6219544410705566, - "mean_token_accuracy": 0.7939999103546143, - "num_tokens": 2447007.0, - "step": 269 - }, - { - "epoch": 0.20516717325227962, - "grad_norm": 2.2963485717773438, - "learning_rate": 4.995578843653255e-06, - "loss": 0.5008970499038696, - "mean_token_accuracy": 0.8255308866500854, - "num_tokens": 2453936.0, - "step": 270 - }, - { - "epoch": 0.20592705167173253, - "grad_norm": 1.8897721767425537, - "learning_rate": 4.995453465026033e-06, - "loss": 0.5436089038848877, - "mean_token_accuracy": 0.819086492061615, - "num_tokens": 2464494.0, - "step": 271 - }, - { - "epoch": 0.2066869300911854, - "grad_norm": 2.319728374481201, - "learning_rate": 4.995326334993508e-06, - "loss": 0.5136368870735168, - "mean_token_accuracy": 0.820817232131958, - "num_tokens": 2470938.0, - "step": 272 - }, - { - "epoch": 0.2074468085106383, - "grad_norm": 2.230414390563965, - "learning_rate": 4.9951974536449055e-06, - "loss": 0.5272846817970276, - "mean_token_accuracy": 0.8203279972076416, - "num_tokens": 2478629.0, - "step": 273 - }, - { - "epoch": 0.20820668693009117, - "grad_norm": 3.401937484741211, - "learning_rate": 4.9950668210706795e-06, - "loss": 0.4389592111110687, - "mean_token_accuracy": 0.8647899031639099, - "num_tokens": 2482193.0, - "step": 274 - }, - { - "epoch": 0.20896656534954408, - "grad_norm": 2.1278507709503174, - "learning_rate": 4.994934437362513e-06, - "loss": 0.598863422870636, - "mean_token_accuracy": 0.7945119738578796, - "num_tokens": 2492465.0, - "step": 275 - }, - { - "epoch": 0.20972644376899696, - "grad_norm": 1.9259960651397705, - "learning_rate": 4.994800302613318e-06, - "loss": 0.49520939588546753, - "mean_token_accuracy": 0.8371536135673523, - "num_tokens": 2500825.0, - "step": 276 - }, - { - "epoch": 0.21048632218844984, - "grad_norm": 2.346418857574463, - "learning_rate": 4.994664416917236e-06, - "loss": 0.5412614345550537, - "mean_token_accuracy": 0.810661792755127, - "num_tokens": 2509513.0, - "step": 277 - }, - { - "epoch": 0.21124620060790272, - "grad_norm": 1.3092039823532104, - "learning_rate": 4.994526780369636e-06, - "loss": 0.46305379271507263, - "mean_token_accuracy": 0.8358527421951294, - "num_tokens": 2531405.0, - "step": 278 - }, - { - "epoch": 0.21200607902735563, - "grad_norm": 2.924611806869507, - "learning_rate": 4.9943873930671175e-06, - "loss": 0.6134544610977173, - "mean_token_accuracy": 0.7947378754615784, - "num_tokens": 2536744.0, - "step": 279 - }, - { - "epoch": 0.2127659574468085, - "grad_norm": 2.8290598392486572, - "learning_rate": 4.994246255107506e-06, - "loss": 0.465520441532135, - "mean_token_accuracy": 0.8440108299255371, - "num_tokens": 2541184.0, - "step": 280 - }, - { - "epoch": 0.2135258358662614, - "grad_norm": 3.8081259727478027, - "learning_rate": 4.994103366589859e-06, - "loss": 0.43394139409065247, - "mean_token_accuracy": 0.8579148054122925, - "num_tokens": 2545395.0, - "step": 281 - }, - { - "epoch": 0.21428571428571427, - "grad_norm": 1.7994529008865356, - "learning_rate": 4.993958727614462e-06, - "loss": 0.5076484680175781, - "mean_token_accuracy": 0.8270803093910217, - "num_tokens": 2556541.0, - "step": 282 - }, - { - "epoch": 0.21504559270516718, - "grad_norm": 2.5582659244537354, - "learning_rate": 4.993812338282826e-06, - "loss": 0.4453684389591217, - "mean_token_accuracy": 0.8488293886184692, - "num_tokens": 2562949.0, - "step": 283 - }, - { - "epoch": 0.21580547112462006, - "grad_norm": 1.6448938846588135, - "learning_rate": 4.993664198697694e-06, - "loss": 0.461971640586853, - "mean_token_accuracy": 0.824763298034668, - "num_tokens": 2576407.0, - "step": 284 - }, - { - "epoch": 0.21656534954407294, - "grad_norm": 2.1264469623565674, - "learning_rate": 4.993514308963037e-06, - "loss": 0.6241602897644043, - "mean_token_accuracy": 0.7916014790534973, - "num_tokens": 2585695.0, - "step": 285 - }, - { - "epoch": 0.21732522796352582, - "grad_norm": 3.629991292953491, - "learning_rate": 4.993362669184051e-06, - "loss": 0.610355019569397, - "mean_token_accuracy": 0.7847568988800049, - "num_tokens": 2589778.0, - "step": 286 - }, - { - "epoch": 0.21808510638297873, - "grad_norm": 1.9070756435394287, - "learning_rate": 4.993209279467164e-06, - "loss": 0.5513623952865601, - "mean_token_accuracy": 0.7911607027053833, - "num_tokens": 2600920.0, - "step": 287 - }, - { - "epoch": 0.2188449848024316, - "grad_norm": 1.761062741279602, - "learning_rate": 4.993054139920031e-06, - "loss": 0.4579957127571106, - "mean_token_accuracy": 0.8189530372619629, - "num_tokens": 2611856.0, - "step": 288 - }, - { - "epoch": 0.2196048632218845, - "grad_norm": 1.7264713048934937, - "learning_rate": 4.992897250651535e-06, - "loss": 0.5871305465698242, - "mean_token_accuracy": 0.7918527126312256, - "num_tokens": 2624730.0, - "step": 289 - }, - { - "epoch": 0.22036474164133737, - "grad_norm": 1.7455977201461792, - "learning_rate": 4.992738611771787e-06, - "loss": 0.5475119948387146, - "mean_token_accuracy": 0.8226917386054993, - "num_tokens": 2635705.0, - "step": 290 - }, - { - "epoch": 0.22112462006079028, - "grad_norm": 2.095095157623291, - "learning_rate": 4.992578223392124e-06, - "loss": 0.5952225923538208, - "mean_token_accuracy": 0.8078469038009644, - "num_tokens": 2643954.0, - "step": 291 - }, - { - "epoch": 0.22188449848024316, - "grad_norm": 2.994664192199707, - "learning_rate": 4.992416085625115e-06, - "loss": 0.5432442426681519, - "mean_token_accuracy": 0.8329008221626282, - "num_tokens": 2648800.0, - "step": 292 - }, - { - "epoch": 0.22264437689969604, - "grad_norm": 2.796790361404419, - "learning_rate": 4.992252198584554e-06, - "loss": 0.5168961882591248, - "mean_token_accuracy": 0.8393474817276001, - "num_tokens": 2653546.0, - "step": 293 - }, - { - "epoch": 0.22340425531914893, - "grad_norm": 1.8610522747039795, - "learning_rate": 4.992086562385462e-06, - "loss": 0.5728024244308472, - "mean_token_accuracy": 0.797406792640686, - "num_tokens": 2667483.0, - "step": 294 - }, - { - "epoch": 0.22416413373860183, - "grad_norm": 1.695472002029419, - "learning_rate": 4.9919191771440905e-06, - "loss": 0.5460028648376465, - "mean_token_accuracy": 0.8123016357421875, - "num_tokens": 2683574.0, - "step": 295 - }, - { - "epoch": 0.22492401215805471, - "grad_norm": 2.8627376556396484, - "learning_rate": 4.9917500429779165e-06, - "loss": 0.5566985011100769, - "mean_token_accuracy": 0.815531313419342, - "num_tokens": 2688985.0, - "step": 296 - }, - { - "epoch": 0.2256838905775076, - "grad_norm": 2.73323655128479, - "learning_rate": 4.991579160005644e-06, - "loss": 0.48197102546691895, - "mean_token_accuracy": 0.8471829295158386, - "num_tokens": 2694799.0, - "step": 297 - }, - { - "epoch": 0.22644376899696048, - "grad_norm": 1.8436161279678345, - "learning_rate": 4.991406528347206e-06, - "loss": 0.4528339207172394, - "mean_token_accuracy": 0.8603188395500183, - "num_tokens": 2707321.0, - "step": 298 - }, - { - "epoch": 0.22720364741641338, - "grad_norm": 2.6231515407562256, - "learning_rate": 4.9912321481237616e-06, - "loss": 0.5916541814804077, - "mean_token_accuracy": 0.8050242066383362, - "num_tokens": 2714233.0, - "step": 299 - }, - { - "epoch": 0.22796352583586627, - "grad_norm": 3.08776593208313, - "learning_rate": 4.991056019457697e-06, - "loss": 0.4860580563545227, - "mean_token_accuracy": 0.8464088439941406, - "num_tokens": 2718443.0, - "step": 300 - }, - { - "epoch": 0.22872340425531915, - "grad_norm": 2.2537803649902344, - "learning_rate": 4.990878142472628e-06, - "loss": 0.5158311128616333, - "mean_token_accuracy": 0.824694812297821, - "num_tokens": 2726158.0, - "step": 301 - }, - { - "epoch": 0.22948328267477203, - "grad_norm": 2.1122705936431885, - "learning_rate": 4.990698517293394e-06, - "loss": 0.495265394449234, - "mean_token_accuracy": 0.8343238830566406, - "num_tokens": 2735022.0, - "step": 302 - }, - { - "epoch": 0.23024316109422494, - "grad_norm": 3.5503528118133545, - "learning_rate": 4.9905171440460645e-06, - "loss": 0.46063232421875, - "mean_token_accuracy": 0.8420047760009766, - "num_tokens": 2738550.0, - "step": 303 - }, - { - "epoch": 0.23100303951367782, - "grad_norm": 3.9858486652374268, - "learning_rate": 4.990334022857932e-06, - "loss": 0.5832710266113281, - "mean_token_accuracy": 0.8144199848175049, - "num_tokens": 2741720.0, - "step": 304 - }, - { - "epoch": 0.2317629179331307, - "grad_norm": 2.407231330871582, - "learning_rate": 4.990149153857519e-06, - "loss": 0.4692630171775818, - "mean_token_accuracy": 0.8429223299026489, - "num_tokens": 2748693.0, - "step": 305 - }, - { - "epoch": 0.23252279635258358, - "grad_norm": 1.6996397972106934, - "learning_rate": 4.989962537174573e-06, - "loss": 0.49143946170806885, - "mean_token_accuracy": 0.8340128064155579, - "num_tokens": 2761254.0, - "step": 306 - }, - { - "epoch": 0.23328267477203649, - "grad_norm": 3.746432065963745, - "learning_rate": 4.989774172940071e-06, - "loss": 0.6282026767730713, - "mean_token_accuracy": 0.775698184967041, - "num_tokens": 2765115.0, - "step": 307 - }, - { - "epoch": 0.23404255319148937, - "grad_norm": 2.212872266769409, - "learning_rate": 4.989584061286211e-06, - "loss": 0.5193763971328735, - "mean_token_accuracy": 0.8168246746063232, - "num_tokens": 2772345.0, - "step": 308 - }, - { - "epoch": 0.23480243161094225, - "grad_norm": 1.752297282218933, - "learning_rate": 4.989392202346423e-06, - "loss": 0.4437984824180603, - "mean_token_accuracy": 0.8451256155967712, - "num_tokens": 2783072.0, - "step": 309 - }, - { - "epoch": 0.23556231003039513, - "grad_norm": 2.386019706726074, - "learning_rate": 4.989198596255361e-06, - "loss": 0.4090752899646759, - "mean_token_accuracy": 0.8480085134506226, - "num_tokens": 2788757.0, - "step": 310 - }, - { - "epoch": 0.23632218844984804, - "grad_norm": 3.9981489181518555, - "learning_rate": 4.989003243148904e-06, - "loss": 0.5149132013320923, - "mean_token_accuracy": 0.8179056644439697, - "num_tokens": 2792096.0, - "step": 311 - }, - { - "epoch": 0.23708206686930092, - "grad_norm": 1.8723100423812866, - "learning_rate": 4.988806143164159e-06, - "loss": 0.4531487822532654, - "mean_token_accuracy": 0.8400167226791382, - "num_tokens": 2802210.0, - "step": 312 - }, - { - "epoch": 0.2378419452887538, - "grad_norm": 2.3415136337280273, - "learning_rate": 4.988607296439459e-06, - "loss": 0.5974439978599548, - "mean_token_accuracy": 0.8035976886749268, - "num_tokens": 2810088.0, - "step": 313 - }, - { - "epoch": 0.23860182370820668, - "grad_norm": 1.5317577123641968, - "learning_rate": 4.98840670311436e-06, - "loss": 0.49247145652770996, - "mean_token_accuracy": 0.8292540311813354, - "num_tokens": 2824005.0, - "step": 314 - }, - { - "epoch": 0.2393617021276596, - "grad_norm": 2.170772075653076, - "learning_rate": 4.988204363329648e-06, - "loss": 0.6359974145889282, - "mean_token_accuracy": 0.7785564661026001, - "num_tokens": 2834680.0, - "step": 315 - }, - { - "epoch": 0.24012158054711247, - "grad_norm": 3.2655932903289795, - "learning_rate": 4.988000277227334e-06, - "loss": 0.5080196857452393, - "mean_token_accuracy": 0.8295877575874329, - "num_tokens": 2838735.0, - "step": 316 - }, - { - "epoch": 0.24088145896656535, - "grad_norm": 3.406589984893799, - "learning_rate": 4.987794444950651e-06, - "loss": 0.3939085006713867, - "mean_token_accuracy": 0.8700719475746155, - "num_tokens": 2842127.0, - "step": 317 - }, - { - "epoch": 0.24164133738601823, - "grad_norm": 1.8211106061935425, - "learning_rate": 4.987586866644061e-06, - "loss": 0.5270540118217468, - "mean_token_accuracy": 0.826683521270752, - "num_tokens": 2853656.0, - "step": 318 - }, - { - "epoch": 0.24240121580547114, - "grad_norm": 1.8429969549179077, - "learning_rate": 4.9873775424532515e-06, - "loss": 0.4705049991607666, - "mean_token_accuracy": 0.8355701565742493, - "num_tokens": 2863513.0, - "step": 319 - }, - { - "epoch": 0.24316109422492402, - "grad_norm": 2.2425320148468018, - "learning_rate": 4.9871664725251314e-06, - "loss": 0.485736608505249, - "mean_token_accuracy": 0.835182785987854, - "num_tokens": 2871556.0, - "step": 320 - }, - { - "epoch": 0.2439209726443769, - "grad_norm": 1.6202056407928467, - "learning_rate": 4.986953657007841e-06, - "loss": 0.4437887370586395, - "mean_token_accuracy": 0.8282591700553894, - "num_tokens": 2884335.0, - "step": 321 - }, - { - "epoch": 0.24468085106382978, - "grad_norm": 1.1027268171310425, - "learning_rate": 4.98673909605074e-06, - "loss": 0.3770800828933716, - "mean_token_accuracy": 0.8325437307357788, - "num_tokens": 2904286.0, - "step": 322 - }, - { - "epoch": 0.2454407294832827, - "grad_norm": 2.3239076137542725, - "learning_rate": 4.986522789804417e-06, - "loss": 0.5387254953384399, - "mean_token_accuracy": 0.806242823600769, - "num_tokens": 2910975.0, - "step": 323 - }, - { - "epoch": 0.24620060790273557, - "grad_norm": 2.243482828140259, - "learning_rate": 4.986304738420684e-06, - "loss": 0.4396553039550781, - "mean_token_accuracy": 0.8561904430389404, - "num_tokens": 2917087.0, - "step": 324 - }, - { - "epoch": 0.24696048632218845, - "grad_norm": 2.537264347076416, - "learning_rate": 4.986084942052577e-06, - "loss": 0.395110160112381, - "mean_token_accuracy": 0.8636915683746338, - "num_tokens": 2921887.0, - "step": 325 - }, - { - "epoch": 0.24772036474164133, - "grad_norm": 2.319399118423462, - "learning_rate": 4.9858634008543574e-06, - "loss": 0.581517219543457, - "mean_token_accuracy": 0.8157487511634827, - "num_tokens": 2928996.0, - "step": 326 - }, - { - "epoch": 0.24848024316109424, - "grad_norm": 1.9787474870681763, - "learning_rate": 4.985640114981513e-06, - "loss": 0.5084106922149658, - "mean_token_accuracy": 0.835221529006958, - "num_tokens": 2940302.0, - "step": 327 - }, - { - "epoch": 0.24924012158054712, - "grad_norm": 2.4783265590667725, - "learning_rate": 4.985415084590752e-06, - "loss": 0.6062222719192505, - "mean_token_accuracy": 0.7885516285896301, - "num_tokens": 2946386.0, - "step": 328 - }, - { - "epoch": 0.25, - "grad_norm": 2.4081411361694336, - "learning_rate": 4.985188309840012e-06, - "loss": 0.5079880356788635, - "mean_token_accuracy": 0.8313904404640198, - "num_tokens": 2952323.0, - "step": 329 - }, - { - "epoch": 0.2507598784194529, - "grad_norm": 2.64993953704834, - "learning_rate": 4.984959790888451e-06, - "loss": 0.5461447834968567, - "mean_token_accuracy": 0.8125468492507935, - "num_tokens": 2958119.0, - "step": 330 - }, - { - "epoch": 0.25151975683890576, - "grad_norm": 2.549734115600586, - "learning_rate": 4.984729527896451e-06, - "loss": 0.5998573303222656, - "mean_token_accuracy": 0.8076666593551636, - "num_tokens": 2964947.0, - "step": 331 - }, - { - "epoch": 0.25227963525835867, - "grad_norm": 3.2185161113739014, - "learning_rate": 4.984497521025622e-06, - "loss": 0.4232945442199707, - "mean_token_accuracy": 0.8543803095817566, - "num_tokens": 2968598.0, - "step": 332 - }, - { - "epoch": 0.2530395136778115, - "grad_norm": 2.588994264602661, - "learning_rate": 4.984263770438793e-06, - "loss": 0.460967481136322, - "mean_token_accuracy": 0.8416207432746887, - "num_tokens": 2974510.0, - "step": 333 - }, - { - "epoch": 0.25379939209726443, - "grad_norm": 2.1373162269592285, - "learning_rate": 4.984028276300021e-06, - "loss": 0.49382102489471436, - "mean_token_accuracy": 0.8388048410415649, - "num_tokens": 2981632.0, - "step": 334 - }, - { - "epoch": 0.25455927051671734, - "grad_norm": 2.2524826526641846, - "learning_rate": 4.983791038774585e-06, - "loss": 0.4947671890258789, - "mean_token_accuracy": 0.8066365122795105, - "num_tokens": 2988736.0, - "step": 335 - }, - { - "epoch": 0.2553191489361702, - "grad_norm": 1.7244199514389038, - "learning_rate": 4.983552058028985e-06, - "loss": 0.48096776008605957, - "mean_token_accuracy": 0.830735445022583, - "num_tokens": 3003576.0, - "step": 336 - }, - { - "epoch": 0.2560790273556231, - "grad_norm": 3.0628933906555176, - "learning_rate": 4.9833113342309495e-06, - "loss": 0.6027032136917114, - "mean_token_accuracy": 0.8008694648742676, - "num_tokens": 3009549.0, - "step": 337 - }, - { - "epoch": 0.256838905775076, - "grad_norm": 2.438674211502075, - "learning_rate": 4.983068867549427e-06, - "loss": 0.517090916633606, - "mean_token_accuracy": 0.827893853187561, - "num_tokens": 3015236.0, - "step": 338 - }, - { - "epoch": 0.25759878419452886, - "grad_norm": 2.131535053253174, - "learning_rate": 4.982824658154589e-06, - "loss": 0.6656812429428101, - "mean_token_accuracy": 0.7772425413131714, - "num_tokens": 3028142.0, - "step": 339 - }, - { - "epoch": 0.25835866261398177, - "grad_norm": 2.3206584453582764, - "learning_rate": 4.9825787062178315e-06, - "loss": 0.5757625699043274, - "mean_token_accuracy": 0.8073873519897461, - "num_tokens": 3040996.0, - "step": 340 - }, - { - "epoch": 0.2591185410334346, - "grad_norm": 1.3905521631240845, - "learning_rate": 4.982331011911774e-06, - "loss": 0.4193805456161499, - "mean_token_accuracy": 0.8399466872215271, - "num_tokens": 3061931.0, - "step": 341 - }, - { - "epoch": 0.25987841945288753, - "grad_norm": 2.184173345565796, - "learning_rate": 4.982081575410256e-06, - "loss": 0.4751223921775818, - "mean_token_accuracy": 0.8409271240234375, - "num_tokens": 3069081.0, - "step": 342 - }, - { - "epoch": 0.26063829787234044, - "grad_norm": 3.538764238357544, - "learning_rate": 4.9818303968883445e-06, - "loss": 0.8119601011276245, - "mean_token_accuracy": 0.7442739009857178, - "num_tokens": 3073628.0, - "step": 343 - }, - { - "epoch": 0.2613981762917933, - "grad_norm": 1.8063762187957764, - "learning_rate": 4.981577476522323e-06, - "loss": 0.5615730881690979, - "mean_token_accuracy": 0.8207751512527466, - "num_tokens": 3086596.0, - "step": 344 - }, - { - "epoch": 0.2621580547112462, - "grad_norm": 2.4346961975097656, - "learning_rate": 4.981322814489703e-06, - "loss": 0.5266709327697754, - "mean_token_accuracy": 0.8211277723312378, - "num_tokens": 3092631.0, - "step": 345 - }, - { - "epoch": 0.2629179331306991, - "grad_norm": 1.91289484500885, - "learning_rate": 4.981066410969215e-06, - "loss": 0.5047177672386169, - "mean_token_accuracy": 0.8356877565383911, - "num_tokens": 3101102.0, - "step": 346 - }, - { - "epoch": 0.26367781155015196, - "grad_norm": 2.1495707035064697, - "learning_rate": 4.980808266140813e-06, - "loss": 0.47876280546188354, - "mean_token_accuracy": 0.8364313244819641, - "num_tokens": 3107998.0, - "step": 347 - }, - { - "epoch": 0.26443768996960487, - "grad_norm": 2.5961992740631104, - "learning_rate": 4.9805483801856744e-06, - "loss": 0.5512958765029907, - "mean_token_accuracy": 0.8181467652320862, - "num_tokens": 3113848.0, - "step": 348 - }, - { - "epoch": 0.2651975683890577, - "grad_norm": 3.2828900814056396, - "learning_rate": 4.980286753286196e-06, - "loss": 0.4217945635318756, - "mean_token_accuracy": 0.8617103099822998, - "num_tokens": 3117652.0, - "step": 349 - }, - { - "epoch": 0.26595744680851063, - "grad_norm": 1.425554871559143, - "learning_rate": 4.980023385625996e-06, - "loss": 0.4042487144470215, - "mean_token_accuracy": 0.8492785692214966, - "num_tokens": 3132336.0, - "step": 350 - }, - { - "epoch": 0.26671732522796354, - "grad_norm": 2.933504104614258, - "learning_rate": 4.979758277389919e-06, - "loss": 0.5406704545021057, - "mean_token_accuracy": 0.8035423755645752, - "num_tokens": 3137544.0, - "step": 351 - }, - { - "epoch": 0.2674772036474164, - "grad_norm": 1.9958966970443726, - "learning_rate": 4.9794914287640264e-06, - "loss": 0.5857555270195007, - "mean_token_accuracy": 0.7965140342712402, - "num_tokens": 3149705.0, - "step": 352 - }, - { - "epoch": 0.2682370820668693, - "grad_norm": 2.467694044113159, - "learning_rate": 4.979222839935602e-06, - "loss": 0.6404043436050415, - "mean_token_accuracy": 0.7823755741119385, - "num_tokens": 3158353.0, - "step": 353 - }, - { - "epoch": 0.2689969604863222, - "grad_norm": 2.0102720260620117, - "learning_rate": 4.9789525110931545e-06, - "loss": 0.5681496858596802, - "mean_token_accuracy": 0.8108169436454773, - "num_tokens": 3167121.0, - "step": 354 - }, - { - "epoch": 0.26975683890577506, - "grad_norm": 2.6017866134643555, - "learning_rate": 4.978680442426409e-06, - "loss": 0.6309828162193298, - "mean_token_accuracy": 0.7742617130279541, - "num_tokens": 3175012.0, - "step": 355 - }, - { - "epoch": 0.270516717325228, - "grad_norm": 1.8799268007278442, - "learning_rate": 4.978406634126315e-06, - "loss": 0.524029016494751, - "mean_token_accuracy": 0.8317689895629883, - "num_tokens": 3185331.0, - "step": 356 - }, - { - "epoch": 0.2712765957446808, - "grad_norm": 1.508332371711731, - "learning_rate": 4.978131086385041e-06, - "loss": 0.46656402945518494, - "mean_token_accuracy": 0.8339117765426636, - "num_tokens": 3198813.0, - "step": 357 - }, - { - "epoch": 0.27203647416413373, - "grad_norm": 3.595707654953003, - "learning_rate": 4.977853799395976e-06, - "loss": 0.5101234912872314, - "mean_token_accuracy": 0.8251723051071167, - "num_tokens": 3206557.0, - "step": 358 - }, - { - "epoch": 0.27279635258358664, - "grad_norm": 3.5317916870117188, - "learning_rate": 4.977574773353732e-06, - "loss": 0.5684665441513062, - "mean_token_accuracy": 0.8124493360519409, - "num_tokens": 3210912.0, - "step": 359 - }, - { - "epoch": 0.2735562310030395, - "grad_norm": 2.8606204986572266, - "learning_rate": 4.97729400845414e-06, - "loss": 0.4746384620666504, - "mean_token_accuracy": 0.8195606470108032, - "num_tokens": 3215365.0, - "step": 360 - }, - { - "epoch": 0.2743161094224924, - "grad_norm": 1.8214033842086792, - "learning_rate": 4.977011504894253e-06, - "loss": 0.4842769503593445, - "mean_token_accuracy": 0.82928866147995, - "num_tokens": 3224037.0, - "step": 361 - }, - { - "epoch": 0.2750759878419453, - "grad_norm": 1.628746509552002, - "learning_rate": 4.97672726287234e-06, - "loss": 0.4397493302822113, - "mean_token_accuracy": 0.8606528043746948, - "num_tokens": 3235589.0, - "step": 362 - }, - { - "epoch": 0.27583586626139817, - "grad_norm": 3.557973861694336, - "learning_rate": 4.976441282587894e-06, - "loss": 0.5732032060623169, - "mean_token_accuracy": 0.8041545748710632, - "num_tokens": 3239958.0, - "step": 363 - }, - { - "epoch": 0.2765957446808511, - "grad_norm": 1.3467901945114136, - "learning_rate": 4.9761535642416284e-06, - "loss": 0.4525323510169983, - "mean_token_accuracy": 0.8281061053276062, - "num_tokens": 3257703.0, - "step": 364 - }, - { - "epoch": 0.2773556231003039, - "grad_norm": 2.2649986743927, - "learning_rate": 4.9758641080354745e-06, - "loss": 0.5074734687805176, - "mean_token_accuracy": 0.8447474241256714, - "num_tokens": 3264334.0, - "step": 365 - }, - { - "epoch": 0.27811550151975684, - "grad_norm": 2.8667566776275635, - "learning_rate": 4.975572914172581e-06, - "loss": 0.5759559869766235, - "mean_token_accuracy": 0.7976793050765991, - "num_tokens": 3269314.0, - "step": 366 - }, - { - "epoch": 0.27887537993920974, - "grad_norm": 2.2514986991882324, - "learning_rate": 4.975279982857324e-06, - "loss": 0.5786465406417847, - "mean_token_accuracy": 0.8058781623840332, - "num_tokens": 3277324.0, - "step": 367 - }, - { - "epoch": 0.2796352583586626, - "grad_norm": 1.3826723098754883, - "learning_rate": 4.97498531429529e-06, - "loss": 0.40801727771759033, - "mean_token_accuracy": 0.8601310849189758, - "num_tokens": 3290530.0, - "step": 368 - }, - { - "epoch": 0.2803951367781155, - "grad_norm": 2.084092617034912, - "learning_rate": 4.97468890869329e-06, - "loss": 0.47076648473739624, - "mean_token_accuracy": 0.8310186862945557, - "num_tokens": 3298325.0, - "step": 369 - }, - { - "epoch": 0.2811550151975684, - "grad_norm": 1.3467998504638672, - "learning_rate": 4.974390766259353e-06, - "loss": 0.44668465852737427, - "mean_token_accuracy": 0.8275353908538818, - "num_tokens": 3314302.0, - "step": 370 - }, - { - "epoch": 0.28191489361702127, - "grad_norm": 2.5921075344085693, - "learning_rate": 4.974090887202726e-06, - "loss": 0.5343953967094421, - "mean_token_accuracy": 0.8110706806182861, - "num_tokens": 3320963.0, - "step": 371 - }, - { - "epoch": 0.2826747720364742, - "grad_norm": 2.042781352996826, - "learning_rate": 4.973789271733877e-06, - "loss": 0.6293343305587769, - "mean_token_accuracy": 0.7800243496894836, - "num_tokens": 3332742.0, - "step": 372 - }, - { - "epoch": 0.28343465045592703, - "grad_norm": 4.822193145751953, - "learning_rate": 4.973485920064491e-06, - "loss": 0.6256728768348694, - "mean_token_accuracy": 0.7962433099746704, - "num_tokens": 3335872.0, - "step": 373 - }, - { - "epoch": 0.28419452887537994, - "grad_norm": 1.260988473892212, - "learning_rate": 4.973180832407471e-06, - "loss": 0.38731223344802856, - "mean_token_accuracy": 0.8385066986083984, - "num_tokens": 3351884.0, - "step": 374 - }, - { - "epoch": 0.28495440729483285, - "grad_norm": 2.669966697692871, - "learning_rate": 4.97287400897694e-06, - "loss": 0.5594710111618042, - "mean_token_accuracy": 0.8097212314605713, - "num_tokens": 3358197.0, - "step": 375 - }, - { - "epoch": 0.2857142857142857, - "grad_norm": 3.0344486236572266, - "learning_rate": 4.972565449988238e-06, - "loss": 0.34449583292007446, - "mean_token_accuracy": 0.8813316822052002, - "num_tokens": 3362133.0, - "step": 376 - }, - { - "epoch": 0.2864741641337386, - "grad_norm": 2.562251091003418, - "learning_rate": 4.972255155657925e-06, - "loss": 0.5331522822380066, - "mean_token_accuracy": 0.8212941288948059, - "num_tokens": 3370346.0, - "step": 377 - }, - { - "epoch": 0.2872340425531915, - "grad_norm": 2.7083740234375, - "learning_rate": 4.9719431262037755e-06, - "loss": 0.5403046011924744, - "mean_token_accuracy": 0.8108335733413696, - "num_tokens": 3375588.0, - "step": 378 - }, - { - "epoch": 0.28799392097264437, - "grad_norm": 1.396430492401123, - "learning_rate": 4.971629361844785e-06, - "loss": 0.4041529893875122, - "mean_token_accuracy": 0.8588063716888428, - "num_tokens": 3390749.0, - "step": 379 - }, - { - "epoch": 0.2887537993920973, - "grad_norm": 1.9872784614562988, - "learning_rate": 4.971313862801166e-06, - "loss": 0.4336993098258972, - "mean_token_accuracy": 0.8511303663253784, - "num_tokens": 3399064.0, - "step": 380 - }, - { - "epoch": 0.28951367781155013, - "grad_norm": 1.9652575254440308, - "learning_rate": 4.9709966292943455e-06, - "loss": 0.4578358232975006, - "mean_token_accuracy": 0.8229440450668335, - "num_tokens": 3407229.0, - "step": 381 - }, - { - "epoch": 0.29027355623100304, - "grad_norm": 1.6626898050308228, - "learning_rate": 4.970677661546972e-06, - "loss": 0.5427594184875488, - "mean_token_accuracy": 0.815427303314209, - "num_tokens": 3422321.0, - "step": 382 - }, - { - "epoch": 0.29103343465045595, - "grad_norm": 3.5265562534332275, - "learning_rate": 4.970356959782909e-06, - "loss": 0.6661460995674133, - "mean_token_accuracy": 0.7856965065002441, - "num_tokens": 3427442.0, - "step": 383 - }, - { - "epoch": 0.2917933130699088, - "grad_norm": 1.667205572128296, - "learning_rate": 4.970034524227239e-06, - "loss": 0.36256325244903564, - "mean_token_accuracy": 0.8711205720901489, - "num_tokens": 3436662.0, - "step": 384 - }, - { - "epoch": 0.2925531914893617, - "grad_norm": 1.3389486074447632, - "learning_rate": 4.969710355106256e-06, - "loss": 0.4282698631286621, - "mean_token_accuracy": 0.838951587677002, - "num_tokens": 3450060.0, - "step": 385 - }, - { - "epoch": 0.2933130699088146, - "grad_norm": 2.5163397789001465, - "learning_rate": 4.969384452647477e-06, - "loss": 0.5176984071731567, - "mean_token_accuracy": 0.8235267996788025, - "num_tokens": 3456990.0, - "step": 386 - }, - { - "epoch": 0.29407294832826747, - "grad_norm": 1.7588495016098022, - "learning_rate": 4.969056817079633e-06, - "loss": 0.49710947275161743, - "mean_token_accuracy": 0.818520724773407, - "num_tokens": 3468098.0, - "step": 387 - }, - { - "epoch": 0.2948328267477204, - "grad_norm": 2.6381046772003174, - "learning_rate": 4.968727448632669e-06, - "loss": 0.4425308108329773, - "mean_token_accuracy": 0.8451643586158752, - "num_tokens": 3472899.0, - "step": 388 - }, - { - "epoch": 0.29559270516717323, - "grad_norm": 1.6345038414001465, - "learning_rate": 4.968396347537751e-06, - "loss": 0.4177059829235077, - "mean_token_accuracy": 0.8498886227607727, - "num_tokens": 3484826.0, - "step": 389 - }, - { - "epoch": 0.29635258358662614, - "grad_norm": 3.0466468334198, - "learning_rate": 4.968063514027258e-06, - "loss": 0.4274463951587677, - "mean_token_accuracy": 0.8387278318405151, - "num_tokens": 3488610.0, - "step": 390 - }, - { - "epoch": 0.29711246200607905, - "grad_norm": 2.6509406566619873, - "learning_rate": 4.967728948334784e-06, - "loss": 0.5401753783226013, - "mean_token_accuracy": 0.8252490162849426, - "num_tokens": 3493657.0, - "step": 391 - }, - { - "epoch": 0.2978723404255319, - "grad_norm": 1.6372219324111938, - "learning_rate": 4.967392650695141e-06, - "loss": 0.3862472176551819, - "mean_token_accuracy": 0.8555525541305542, - "num_tokens": 3505588.0, - "step": 392 - }, - { - "epoch": 0.2986322188449848, - "grad_norm": 2.1615452766418457, - "learning_rate": 4.967054621344356e-06, - "loss": 0.57850581407547, - "mean_token_accuracy": 0.8222678899765015, - "num_tokens": 3514396.0, - "step": 393 - }, - { - "epoch": 0.2993920972644377, - "grad_norm": 1.8610916137695312, - "learning_rate": 4.96671486051967e-06, - "loss": 0.5440595149993896, - "mean_token_accuracy": 0.8196715116500854, - "num_tokens": 3523604.0, - "step": 394 - }, - { - "epoch": 0.30015197568389057, - "grad_norm": 2.9585862159729004, - "learning_rate": 4.966373368459542e-06, - "loss": 0.6921588182449341, - "mean_token_accuracy": 0.7816659808158875, - "num_tokens": 3529849.0, - "step": 395 - }, - { - "epoch": 0.3009118541033435, - "grad_norm": 1.9374035596847534, - "learning_rate": 4.966030145403642e-06, - "loss": 0.5494055151939392, - "mean_token_accuracy": 0.8126792907714844, - "num_tokens": 3539529.0, - "step": 396 - }, - { - "epoch": 0.30167173252279633, - "grad_norm": 1.730530023574829, - "learning_rate": 4.965685191592859e-06, - "loss": 0.4271572232246399, - "mean_token_accuracy": 0.8383668661117554, - "num_tokens": 3550972.0, - "step": 397 - }, - { - "epoch": 0.30243161094224924, - "grad_norm": 3.9635560512542725, - "learning_rate": 4.9653385072692935e-06, - "loss": 0.5576210021972656, - "mean_token_accuracy": 0.799404501914978, - "num_tokens": 3554147.0, - "step": 398 - }, - { - "epoch": 0.30319148936170215, - "grad_norm": 2.5731968879699707, - "learning_rate": 4.964990092676263e-06, - "loss": 0.5478942394256592, - "mean_token_accuracy": 0.8220961093902588, - "num_tokens": 3559972.0, - "step": 399 - }, - { - "epoch": 0.303951367781155, - "grad_norm": 2.2096588611602783, - "learning_rate": 4.964639948058297e-06, - "loss": 0.35461270809173584, - "mean_token_accuracy": 0.8640927076339722, - "num_tokens": 3565770.0, - "step": 400 - }, - { - "epoch": 0.3047112462006079, - "grad_norm": 1.7874189615249634, - "learning_rate": 4.964288073661142e-06, - "loss": 0.38849619030952454, - "mean_token_accuracy": 0.8443037271499634, - "num_tokens": 3574514.0, - "step": 401 - }, - { - "epoch": 0.30547112462006076, - "grad_norm": 1.5583146810531616, - "learning_rate": 4.963934469731756e-06, - "loss": 0.48909449577331543, - "mean_token_accuracy": 0.8429768681526184, - "num_tokens": 3585877.0, - "step": 402 - }, - { - "epoch": 0.30623100303951367, - "grad_norm": 3.026599645614624, - "learning_rate": 4.963579136518312e-06, - "loss": 0.5138992071151733, - "mean_token_accuracy": 0.8283728361129761, - "num_tokens": 3590412.0, - "step": 403 - }, - { - "epoch": 0.3069908814589666, - "grad_norm": 2.777505874633789, - "learning_rate": 4.963222074270197e-06, - "loss": 0.6241534948348999, - "mean_token_accuracy": 0.8130464553833008, - "num_tokens": 3596246.0, - "step": 404 - }, - { - "epoch": 0.30775075987841943, - "grad_norm": 2.4772839546203613, - "learning_rate": 4.962863283238011e-06, - "loss": 0.5930814146995544, - "mean_token_accuracy": 0.8036394715309143, - "num_tokens": 3602878.0, - "step": 405 - }, - { - "epoch": 0.30851063829787234, - "grad_norm": 1.5049982070922852, - "learning_rate": 4.962502763673566e-06, - "loss": 0.4903082549571991, - "mean_token_accuracy": 0.8184912204742432, - "num_tokens": 3617018.0, - "step": 406 - }, - { - "epoch": 0.30927051671732525, - "grad_norm": 2.453155040740967, - "learning_rate": 4.96214051582989e-06, - "loss": 0.5138067603111267, - "mean_token_accuracy": 0.8336835503578186, - "num_tokens": 3624188.0, - "step": 407 - }, - { - "epoch": 0.3100303951367781, - "grad_norm": 2.4038336277008057, - "learning_rate": 4.961776539961222e-06, - "loss": 0.5752760171890259, - "mean_token_accuracy": 0.8054730892181396, - "num_tokens": 3634152.0, - "step": 408 - }, - { - "epoch": 0.310790273556231, - "grad_norm": 2.629068374633789, - "learning_rate": 4.961410836323014e-06, - "loss": 0.5580606460571289, - "mean_token_accuracy": 0.8121089935302734, - "num_tokens": 3639528.0, - "step": 409 - }, - { - "epoch": 0.31155015197568386, - "grad_norm": 1.4245928525924683, - "learning_rate": 4.961043405171931e-06, - "loss": 0.5399882793426514, - "mean_token_accuracy": 0.812280535697937, - "num_tokens": 3655744.0, - "step": 410 - }, - { - "epoch": 0.3123100303951368, - "grad_norm": 1.5236459970474243, - "learning_rate": 4.9606742467658505e-06, - "loss": 0.5234690308570862, - "mean_token_accuracy": 0.8188928365707397, - "num_tokens": 3675010.0, - "step": 411 - }, - { - "epoch": 0.3130699088145897, - "grad_norm": 2.27961802482605, - "learning_rate": 4.960303361363863e-06, - "loss": 0.5502505898475647, - "mean_token_accuracy": 0.8161963224411011, - "num_tokens": 3682328.0, - "step": 412 - }, - { - "epoch": 0.31382978723404253, - "grad_norm": 1.554518222808838, - "learning_rate": 4.959930749226269e-06, - "loss": 0.420867919921875, - "mean_token_accuracy": 0.8499157428741455, - "num_tokens": 3694980.0, - "step": 413 - }, - { - "epoch": 0.31458966565349544, - "grad_norm": 2.609218120574951, - "learning_rate": 4.9595564106145825e-06, - "loss": 0.4706704318523407, - "mean_token_accuracy": 0.8412490487098694, - "num_tokens": 3700033.0, - "step": 414 - }, - { - "epoch": 0.31534954407294835, - "grad_norm": 1.5303231477737427, - "learning_rate": 4.959180345791528e-06, - "loss": 0.4668654799461365, - "mean_token_accuracy": 0.8125015497207642, - "num_tokens": 3715012.0, - "step": 415 - }, - { - "epoch": 0.3161094224924012, - "grad_norm": 1.2774665355682373, - "learning_rate": 4.958802555021042e-06, - "loss": 0.4339369237422943, - "mean_token_accuracy": 0.8442851901054382, - "num_tokens": 3733928.0, - "step": 416 - }, - { - "epoch": 0.3168693009118541, - "grad_norm": 2.1240181922912598, - "learning_rate": 4.958423038568274e-06, - "loss": 0.4029104709625244, - "mean_token_accuracy": 0.8627674579620361, - "num_tokens": 3740202.0, - "step": 417 - }, - { - "epoch": 0.31762917933130697, - "grad_norm": 2.00538969039917, - "learning_rate": 4.958041796699583e-06, - "loss": 0.5229607820510864, - "mean_token_accuracy": 0.8282366394996643, - "num_tokens": 3749308.0, - "step": 418 - }, - { - "epoch": 0.3183890577507599, - "grad_norm": 2.6555092334747314, - "learning_rate": 4.957658829682539e-06, - "loss": 0.5344101190567017, - "mean_token_accuracy": 0.8183202743530273, - "num_tokens": 3754595.0, - "step": 419 - }, - { - "epoch": 0.3191489361702128, - "grad_norm": 1.7468839883804321, - "learning_rate": 4.9572741377859225e-06, - "loss": 0.5667245984077454, - "mean_token_accuracy": 0.8080123662948608, - "num_tokens": 3765761.0, - "step": 420 - }, - { - "epoch": 0.31990881458966564, - "grad_norm": 2.9612457752227783, - "learning_rate": 4.956887721279726e-06, - "loss": 0.5389559864997864, - "mean_token_accuracy": 0.8019476532936096, - "num_tokens": 3770844.0, - "step": 421 - }, - { - "epoch": 0.32066869300911854, - "grad_norm": 1.842403769493103, - "learning_rate": 4.95649958043515e-06, - "loss": 0.38279837369918823, - "mean_token_accuracy": 0.858866810798645, - "num_tokens": 3778094.0, - "step": 422 - }, - { - "epoch": 0.32142857142857145, - "grad_norm": 2.3108131885528564, - "learning_rate": 4.956109715524609e-06, - "loss": 0.5453893542289734, - "mean_token_accuracy": 0.8085013031959534, - "num_tokens": 3785015.0, - "step": 423 - }, - { - "epoch": 0.3221884498480243, - "grad_norm": 3.0326945781707764, - "learning_rate": 4.9557181268217225e-06, - "loss": 0.5550523400306702, - "mean_token_accuracy": 0.8125876188278198, - "num_tokens": 3789830.0, - "step": 424 - }, - { - "epoch": 0.3229483282674772, - "grad_norm": 1.8851977586746216, - "learning_rate": 4.955324814601324e-06, - "loss": 0.4902324974536896, - "mean_token_accuracy": 0.8205406665802002, - "num_tokens": 3799862.0, - "step": 425 - }, - { - "epoch": 0.32370820668693007, - "grad_norm": 2.6018171310424805, - "learning_rate": 4.954929779139455e-06, - "loss": 0.5920133590698242, - "mean_token_accuracy": 0.8340690732002258, - "num_tokens": 3806617.0, - "step": 426 - }, - { - "epoch": 0.324468085106383, - "grad_norm": 2.4283878803253174, - "learning_rate": 4.954533020713367e-06, - "loss": 0.5305854082107544, - "mean_token_accuracy": 0.8137468099594116, - "num_tokens": 3813843.0, - "step": 427 - }, - { - "epoch": 0.3252279635258359, - "grad_norm": 2.667978525161743, - "learning_rate": 4.954134539601519e-06, - "loss": 0.5333638787269592, - "mean_token_accuracy": 0.8402629494667053, - "num_tokens": 3819450.0, - "step": 428 - }, - { - "epoch": 0.32598784194528874, - "grad_norm": 1.7302523851394653, - "learning_rate": 4.953734336083582e-06, - "loss": 0.422895610332489, - "mean_token_accuracy": 0.8709704875946045, - "num_tokens": 3831027.0, - "step": 429 - }, - { - "epoch": 0.32674772036474165, - "grad_norm": 2.427192211151123, - "learning_rate": 4.953332410440434e-06, - "loss": 0.6334598064422607, - "mean_token_accuracy": 0.7817479968070984, - "num_tokens": 3841776.0, - "step": 430 - }, - { - "epoch": 0.32750759878419455, - "grad_norm": 1.460949182510376, - "learning_rate": 4.952928762954161e-06, - "loss": 0.3654777705669403, - "mean_token_accuracy": 0.8780122995376587, - "num_tokens": 3852213.0, - "step": 431 - }, - { - "epoch": 0.3282674772036474, - "grad_norm": 1.9855005741119385, - "learning_rate": 4.952523393908059e-06, - "loss": 0.5117089748382568, - "mean_token_accuracy": 0.811911404132843, - "num_tokens": 3861176.0, - "step": 432 - }, - { - "epoch": 0.3290273556231003, - "grad_norm": 2.2653207778930664, - "learning_rate": 4.952116303586631e-06, - "loss": 0.42514950037002563, - "mean_token_accuracy": 0.8448518514633179, - "num_tokens": 3867164.0, - "step": 433 - }, - { - "epoch": 0.32978723404255317, - "grad_norm": 1.9780964851379395, - "learning_rate": 4.951707492275589e-06, - "loss": 0.5095293521881104, - "mean_token_accuracy": 0.8262748718261719, - "num_tokens": 3876406.0, - "step": 434 - }, - { - "epoch": 0.3305471124620061, - "grad_norm": 2.9480233192443848, - "learning_rate": 4.951296960261853e-06, - "loss": 0.3494448959827423, - "mean_token_accuracy": 0.8781307935714722, - "num_tokens": 3880298.0, - "step": 435 - }, - { - "epoch": 0.331306990881459, - "grad_norm": 2.335571527481079, - "learning_rate": 4.95088470783355e-06, - "loss": 0.5456914901733398, - "mean_token_accuracy": 0.816297173500061, - "num_tokens": 3886487.0, - "step": 436 - }, - { - "epoch": 0.33206686930091184, - "grad_norm": 2.3046419620513916, - "learning_rate": 4.950470735280013e-06, - "loss": 0.4835948944091797, - "mean_token_accuracy": 0.8539175391197205, - "num_tokens": 3892706.0, - "step": 437 - }, - { - "epoch": 0.33282674772036475, - "grad_norm": 2.44047474861145, - "learning_rate": 4.950055042891786e-06, - "loss": 0.5154092907905579, - "mean_token_accuracy": 0.8579919338226318, - "num_tokens": 3899532.0, - "step": 438 - }, - { - "epoch": 0.33358662613981765, - "grad_norm": 4.826764106750488, - "learning_rate": 4.949637630960618e-06, - "loss": 0.5270259976387024, - "mean_token_accuracy": 0.8172192573547363, - "num_tokens": 3902260.0, - "step": 439 - }, - { - "epoch": 0.3343465045592705, - "grad_norm": 2.001574754714966, - "learning_rate": 4.949218499779462e-06, - "loss": 0.5413002967834473, - "mean_token_accuracy": 0.8162837028503418, - "num_tokens": 3911706.0, - "step": 440 - }, - { - "epoch": 0.3351063829787234, - "grad_norm": 1.7998944520950317, - "learning_rate": 4.948797649642484e-06, - "loss": 0.5131614208221436, - "mean_token_accuracy": 0.8367440700531006, - "num_tokens": 3923490.0, - "step": 441 - }, - { - "epoch": 0.33586626139817627, - "grad_norm": 3.4566173553466797, - "learning_rate": 4.94837508084505e-06, - "loss": 0.7258909940719604, - "mean_token_accuracy": 0.771377444267273, - "num_tokens": 3928099.0, - "step": 442 - }, - { - "epoch": 0.3366261398176292, - "grad_norm": 2.0040442943573, - "learning_rate": 4.9479507936837364e-06, - "loss": 0.482135534286499, - "mean_token_accuracy": 0.8339327573776245, - "num_tokens": 3937328.0, - "step": 443 - }, - { - "epoch": 0.3373860182370821, - "grad_norm": 2.949502944946289, - "learning_rate": 4.947524788456325e-06, - "loss": 0.6474795341491699, - "mean_token_accuracy": 0.7951677441596985, - "num_tokens": 3942529.0, - "step": 444 - }, - { - "epoch": 0.33814589665653494, - "grad_norm": 1.5528364181518555, - "learning_rate": 4.947097065461801e-06, - "loss": 0.48791584372520447, - "mean_token_accuracy": 0.8425545692443848, - "num_tokens": 3955200.0, - "step": 445 - }, - { - "epoch": 0.33890577507598785, - "grad_norm": 1.8813284635543823, - "learning_rate": 4.946667625000358e-06, - "loss": 0.45922309160232544, - "mean_token_accuracy": 0.8206527233123779, - "num_tokens": 3962975.0, - "step": 446 - }, - { - "epoch": 0.33966565349544076, - "grad_norm": 1.7157847881317139, - "learning_rate": 4.946236467373392e-06, - "loss": 0.5454182028770447, - "mean_token_accuracy": 0.8049604892730713, - "num_tokens": 3973956.0, - "step": 447 - }, - { - "epoch": 0.3404255319148936, - "grad_norm": 2.008857250213623, - "learning_rate": 4.945803592883509e-06, - "loss": 0.5151860117912292, - "mean_token_accuracy": 0.8262045383453369, - "num_tokens": 3982853.0, - "step": 448 - }, - { - "epoch": 0.3411854103343465, - "grad_norm": 1.6632496118545532, - "learning_rate": 4.9453690018345144e-06, - "loss": 0.42710691690444946, - "mean_token_accuracy": 0.8521314859390259, - "num_tokens": 3993838.0, - "step": 449 - }, - { - "epoch": 0.34194528875379937, - "grad_norm": 1.365234375, - "learning_rate": 4.944932694531423e-06, - "loss": 0.5172526836395264, - "mean_token_accuracy": 0.8277045488357544, - "num_tokens": 4014179.0, - "step": 450 - }, - { - "epoch": 0.3427051671732523, - "grad_norm": 1.7610243558883667, - "learning_rate": 4.94449467128045e-06, - "loss": 0.42104798555374146, - "mean_token_accuracy": 0.8552065491676331, - "num_tokens": 4023663.0, - "step": 451 - }, - { - "epoch": 0.3434650455927052, - "grad_norm": 2.3732354640960693, - "learning_rate": 4.944054932389018e-06, - "loss": 0.5471175909042358, - "mean_token_accuracy": 0.8487317562103271, - "num_tokens": 4030100.0, - "step": 452 - }, - { - "epoch": 0.34422492401215804, - "grad_norm": 1.5973623991012573, - "learning_rate": 4.943613478165753e-06, - "loss": 0.419813871383667, - "mean_token_accuracy": 0.8484025001525879, - "num_tokens": 4041124.0, - "step": 453 - }, - { - "epoch": 0.34498480243161095, - "grad_norm": 2.966381549835205, - "learning_rate": 4.943170308920484e-06, - "loss": 0.5370652675628662, - "mean_token_accuracy": 0.8439491987228394, - "num_tokens": 4045675.0, - "step": 454 - }, - { - "epoch": 0.34574468085106386, - "grad_norm": 2.5097248554229736, - "learning_rate": 4.9427254249642445e-06, - "loss": 0.5776349306106567, - "mean_token_accuracy": 0.8060523867607117, - "num_tokens": 4053250.0, - "step": 455 - }, - { - "epoch": 0.3465045592705167, - "grad_norm": 1.6779125928878784, - "learning_rate": 4.942278826609272e-06, - "loss": 0.5245476961135864, - "mean_token_accuracy": 0.8168526887893677, - "num_tokens": 4064106.0, - "step": 456 - }, - { - "epoch": 0.3472644376899696, - "grad_norm": 1.5945546627044678, - "learning_rate": 4.9418305141690045e-06, - "loss": 0.4972047209739685, - "mean_token_accuracy": 0.8257735967636108, - "num_tokens": 4077687.0, - "step": 457 - }, - { - "epoch": 0.34802431610942247, - "grad_norm": 2.864778757095337, - "learning_rate": 4.9413804879580865e-06, - "loss": 0.5372499823570251, - "mean_token_accuracy": 0.8423776626586914, - "num_tokens": 4082632.0, - "step": 458 - }, - { - "epoch": 0.3487841945288754, - "grad_norm": 1.4797078371047974, - "learning_rate": 4.940928748292363e-06, - "loss": 0.5903409719467163, - "mean_token_accuracy": 0.8061295747756958, - "num_tokens": 4104218.0, - "step": 459 - }, - { - "epoch": 0.3495440729483283, - "grad_norm": 2.4376983642578125, - "learning_rate": 4.940475295488882e-06, - "loss": 0.4534894824028015, - "mean_token_accuracy": 0.8395825028419495, - "num_tokens": 4110530.0, - "step": 460 - }, - { - "epoch": 0.35030395136778114, - "grad_norm": 1.2955626249313354, - "learning_rate": 4.940020129865895e-06, - "loss": 0.47155818343162537, - "mean_token_accuracy": 0.8253582715988159, - "num_tokens": 4128398.0, - "step": 461 - }, - { - "epoch": 0.35106382978723405, - "grad_norm": 2.066575527191162, - "learning_rate": 4.9395632517428546e-06, - "loss": 0.5555641651153564, - "mean_token_accuracy": 0.814624547958374, - "num_tokens": 4137623.0, - "step": 462 - }, - { - "epoch": 0.3518237082066869, - "grad_norm": 1.6407525539398193, - "learning_rate": 4.939104661440415e-06, - "loss": 0.4361790418624878, - "mean_token_accuracy": 0.8544459342956543, - "num_tokens": 4152803.0, - "step": 463 - }, - { - "epoch": 0.3525835866261398, - "grad_norm": 2.1685116291046143, - "learning_rate": 4.938644359280433e-06, - "loss": 0.5347012877464294, - "mean_token_accuracy": 0.853853702545166, - "num_tokens": 4160778.0, - "step": 464 - }, - { - "epoch": 0.3533434650455927, - "grad_norm": 1.8824869394302368, - "learning_rate": 4.938182345585967e-06, - "loss": 0.5512481927871704, - "mean_token_accuracy": 0.7985891699790955, - "num_tokens": 4170380.0, - "step": 465 - }, - { - "epoch": 0.3541033434650456, - "grad_norm": 2.2229504585266113, - "learning_rate": 4.937718620681273e-06, - "loss": 0.516828179359436, - "mean_token_accuracy": 0.8265621066093445, - "num_tokens": 4178179.0, - "step": 466 - }, - { - "epoch": 0.3548632218844985, - "grad_norm": 1.955990195274353, - "learning_rate": 4.9372531848918145e-06, - "loss": 0.5586158037185669, - "mean_token_accuracy": 0.8367916345596313, - "num_tokens": 4188626.0, - "step": 467 - }, - { - "epoch": 0.3556231003039514, - "grad_norm": 1.9687023162841797, - "learning_rate": 4.936786038544251e-06, - "loss": 0.5517531633377075, - "mean_token_accuracy": 0.8134098052978516, - "num_tokens": 4198144.0, - "step": 468 - }, - { - "epoch": 0.35638297872340424, - "grad_norm": 1.405516505241394, - "learning_rate": 4.9363171819664434e-06, - "loss": 0.5305492877960205, - "mean_token_accuracy": 0.8014427423477173, - "num_tokens": 4222818.0, - "step": 469 - }, - { - "epoch": 0.35714285714285715, - "grad_norm": 2.6355695724487305, - "learning_rate": 4.9358466154874535e-06, - "loss": 0.5303391218185425, - "mean_token_accuracy": 0.8028861284255981, - "num_tokens": 4228318.0, - "step": 470 - }, - { - "epoch": 0.35790273556231, - "grad_norm": 1.5133824348449707, - "learning_rate": 4.935374339437543e-06, - "loss": 0.5329189300537109, - "mean_token_accuracy": 0.8479441404342651, - "num_tokens": 4244527.0, - "step": 471 - }, - { - "epoch": 0.3586626139817629, - "grad_norm": 3.4356725215911865, - "learning_rate": 4.934900354148173e-06, - "loss": 0.5431582927703857, - "mean_token_accuracy": 0.8328983783721924, - "num_tokens": 4248034.0, - "step": 472 - }, - { - "epoch": 0.3594224924012158, - "grad_norm": 2.5789499282836914, - "learning_rate": 4.934424659952006e-06, - "loss": 0.4141455292701721, - "mean_token_accuracy": 0.8658635020256042, - "num_tokens": 4252953.0, - "step": 473 - }, - { - "epoch": 0.3601823708206687, - "grad_norm": 1.145262598991394, - "learning_rate": 4.933947257182901e-06, - "loss": 0.40294092893600464, - "mean_token_accuracy": 0.8565847277641296, - "num_tokens": 4277813.0, - "step": 474 - }, - { - "epoch": 0.3609422492401216, - "grad_norm": 1.7242133617401123, - "learning_rate": 4.933468146175918e-06, - "loss": 0.6036738753318787, - "mean_token_accuracy": 0.8072597980499268, - "num_tokens": 4291088.0, - "step": 475 - }, - { - "epoch": 0.3617021276595745, - "grad_norm": 2.3490941524505615, - "learning_rate": 4.932987327267317e-06, - "loss": 0.49456146359443665, - "mean_token_accuracy": 0.8372673988342285, - "num_tokens": 4297376.0, - "step": 476 - }, - { - "epoch": 0.36246200607902734, - "grad_norm": 1.3605526685714722, - "learning_rate": 4.932504800794553e-06, - "loss": 0.43595948815345764, - "mean_token_accuracy": 0.8415953516960144, - "num_tokens": 4312054.0, - "step": 477 - }, - { - "epoch": 0.36322188449848025, - "grad_norm": 1.4525885581970215, - "learning_rate": 4.9320205670962815e-06, - "loss": 0.5390371680259705, - "mean_token_accuracy": 0.8101649284362793, - "num_tokens": 4328701.0, - "step": 478 - }, - { - "epoch": 0.3639817629179331, - "grad_norm": 1.9862419366836548, - "learning_rate": 4.931534626512359e-06, - "loss": 0.45436930656433105, - "mean_token_accuracy": 0.8352861404418945, - "num_tokens": 4338372.0, - "step": 479 - }, - { - "epoch": 0.364741641337386, - "grad_norm": 1.7804961204528809, - "learning_rate": 4.931046979383836e-06, - "loss": 0.4677754044532776, - "mean_token_accuracy": 0.840467095375061, - "num_tokens": 4347897.0, - "step": 480 - }, - { - "epoch": 0.3655015197568389, - "grad_norm": 2.066632032394409, - "learning_rate": 4.930557626052961e-06, - "loss": 0.42418140172958374, - "mean_token_accuracy": 0.8528275489807129, - "num_tokens": 4354061.0, - "step": 481 - }, - { - "epoch": 0.3662613981762918, - "grad_norm": 1.6155282258987427, - "learning_rate": 4.930066566863182e-06, - "loss": 0.5424284934997559, - "mean_token_accuracy": 0.825040876865387, - "num_tokens": 4370400.0, - "step": 482 - }, - { - "epoch": 0.3670212765957447, - "grad_norm": 2.1452953815460205, - "learning_rate": 4.929573802159143e-06, - "loss": 0.5105804204940796, - "mean_token_accuracy": 0.8284053802490234, - "num_tokens": 4377579.0, - "step": 483 - }, - { - "epoch": 0.3677811550151976, - "grad_norm": 1.8940945863723755, - "learning_rate": 4.929079332286685e-06, - "loss": 0.43478304147720337, - "mean_token_accuracy": 0.8505665063858032, - "num_tokens": 4385686.0, - "step": 484 - }, - { - "epoch": 0.36854103343465044, - "grad_norm": 1.6785860061645508, - "learning_rate": 4.928583157592846e-06, - "loss": 0.40227848291397095, - "mean_token_accuracy": 0.8623573780059814, - "num_tokens": 4396128.0, - "step": 485 - }, - { - "epoch": 0.36930091185410335, - "grad_norm": 1.6416733264923096, - "learning_rate": 4.928085278425862e-06, - "loss": 0.526267409324646, - "mean_token_accuracy": 0.8284667730331421, - "num_tokens": 4407963.0, - "step": 486 - }, - { - "epoch": 0.3700607902735562, - "grad_norm": 1.8882389068603516, - "learning_rate": 4.927585695135162e-06, - "loss": 0.5555213093757629, - "mean_token_accuracy": 0.8115293979644775, - "num_tokens": 4418057.0, - "step": 487 - }, - { - "epoch": 0.3708206686930091, - "grad_norm": 2.300248384475708, - "learning_rate": 4.9270844080713735e-06, - "loss": 0.5812339186668396, - "mean_token_accuracy": 0.800270676612854, - "num_tokens": 4425358.0, - "step": 488 - }, - { - "epoch": 0.371580547112462, - "grad_norm": 1.6802922487258911, - "learning_rate": 4.926581417586319e-06, - "loss": 0.5134941935539246, - "mean_token_accuracy": 0.8247408866882324, - "num_tokens": 4437702.0, - "step": 489 - }, - { - "epoch": 0.3723404255319149, - "grad_norm": 1.7620291709899902, - "learning_rate": 4.926076724033016e-06, - "loss": 0.5233973264694214, - "mean_token_accuracy": 0.8102161884307861, - "num_tokens": 4448584.0, - "step": 490 - }, - { - "epoch": 0.3731003039513678, - "grad_norm": 1.6911998987197876, - "learning_rate": 4.925570327765678e-06, - "loss": 0.5337274074554443, - "mean_token_accuracy": 0.845306396484375, - "num_tokens": 4462651.0, - "step": 491 - }, - { - "epoch": 0.3738601823708207, - "grad_norm": 1.7991242408752441, - "learning_rate": 4.9250622291397144e-06, - "loss": 0.31018948554992676, - "mean_token_accuracy": 0.8857606053352356, - "num_tokens": 4469971.0, - "step": 492 - }, - { - "epoch": 0.37462006079027355, - "grad_norm": 4.9776835441589355, - "learning_rate": 4.924552428511727e-06, - "loss": 0.44114983081817627, - "mean_token_accuracy": 0.8429906368255615, - "num_tokens": 4478275.0, - "step": 493 - }, - { - "epoch": 0.37537993920972645, - "grad_norm": 1.8007272481918335, - "learning_rate": 4.924040926239515e-06, - "loss": 0.574328601360321, - "mean_token_accuracy": 0.7669196128845215, - "num_tokens": 4491551.0, - "step": 494 - }, - { - "epoch": 0.3761398176291793, - "grad_norm": 2.021300792694092, - "learning_rate": 4.92352772268207e-06, - "loss": 0.45636120438575745, - "mean_token_accuracy": 0.840438723564148, - "num_tokens": 4498658.0, - "step": 495 - }, - { - "epoch": 0.3768996960486322, - "grad_norm": 2.369748592376709, - "learning_rate": 4.923012818199576e-06, - "loss": 0.5206376910209656, - "mean_token_accuracy": 0.8521823287010193, - "num_tokens": 4504648.0, - "step": 496 - }, - { - "epoch": 0.3776595744680851, - "grad_norm": 2.733485221862793, - "learning_rate": 4.922496213153416e-06, - "loss": 0.5067723989486694, - "mean_token_accuracy": 0.8168281316757202, - "num_tokens": 4509990.0, - "step": 497 - }, - { - "epoch": 0.378419452887538, - "grad_norm": 2.3751676082611084, - "learning_rate": 4.921977907906161e-06, - "loss": 0.49757206439971924, - "mean_token_accuracy": 0.8325017690658569, - "num_tokens": 4518373.0, - "step": 498 - }, - { - "epoch": 0.3791793313069909, - "grad_norm": 2.1672775745391846, - "learning_rate": 4.921457902821578e-06, - "loss": 0.4237566590309143, - "mean_token_accuracy": 0.8404698371887207, - "num_tokens": 4524338.0, - "step": 499 - }, - { - "epoch": 0.3799392097264438, - "grad_norm": 1.8374360799789429, - "learning_rate": 4.9209361982646275e-06, - "loss": 0.4995468854904175, - "mean_token_accuracy": 0.8299649953842163, - "num_tokens": 4533396.0, - "step": 500 - }, - { - "epoch": 0.38069908814589665, - "grad_norm": 2.083967924118042, - "learning_rate": 4.920412794601461e-06, - "loss": 0.489935040473938, - "mean_token_accuracy": 0.8315291404724121, - "num_tokens": 4540941.0, - "step": 501 - }, - { - "epoch": 0.38145896656534956, - "grad_norm": 2.2075610160827637, - "learning_rate": 4.919887692199423e-06, - "loss": 0.5233147740364075, - "mean_token_accuracy": 0.804171085357666, - "num_tokens": 4548215.0, - "step": 502 - }, - { - "epoch": 0.3822188449848024, - "grad_norm": 2.076775312423706, - "learning_rate": 4.9193608914270515e-06, - "loss": 0.5785550475120544, - "mean_token_accuracy": 0.7993186116218567, - "num_tokens": 4558204.0, - "step": 503 - }, - { - "epoch": 0.3829787234042553, - "grad_norm": 2.238546133041382, - "learning_rate": 4.918832392654075e-06, - "loss": 0.5287384390830994, - "mean_token_accuracy": 0.8214945793151855, - "num_tokens": 4565407.0, - "step": 504 - }, - { - "epoch": 0.3837386018237082, - "grad_norm": 1.6783074140548706, - "learning_rate": 4.9183021962514145e-06, - "loss": 0.6063359379768372, - "mean_token_accuracy": 0.7914625406265259, - "num_tokens": 4580991.0, - "step": 505 - }, - { - "epoch": 0.3844984802431611, - "grad_norm": 1.6287449598312378, - "learning_rate": 4.917770302591183e-06, - "loss": 0.3598247766494751, - "mean_token_accuracy": 0.8706809878349304, - "num_tokens": 4590579.0, - "step": 506 - }, - { - "epoch": 0.385258358662614, - "grad_norm": 1.5432041883468628, - "learning_rate": 4.917236712046682e-06, - "loss": 0.5267890095710754, - "mean_token_accuracy": 0.8032117486000061, - "num_tokens": 4608380.0, - "step": 507 - }, - { - "epoch": 0.3860182370820669, - "grad_norm": 1.7664037942886353, - "learning_rate": 4.9167014249924075e-06, - "loss": 0.3552354574203491, - "mean_token_accuracy": 0.8569793701171875, - "num_tokens": 4616426.0, - "step": 508 - }, - { - "epoch": 0.38677811550151975, - "grad_norm": 2.1147472858428955, - "learning_rate": 4.916164441804044e-06, - "loss": 0.5212404727935791, - "mean_token_accuracy": 0.8196578025817871, - "num_tokens": 4623908.0, - "step": 509 - }, - { - "epoch": 0.38753799392097266, - "grad_norm": 2.1092333793640137, - "learning_rate": 4.915625762858467e-06, - "loss": 0.5197038650512695, - "mean_token_accuracy": 0.8245604634284973, - "num_tokens": 4630956.0, - "step": 510 - }, - { - "epoch": 0.3882978723404255, - "grad_norm": 1.23331880569458, - "learning_rate": 4.915085388533743e-06, - "loss": 0.4759839177131653, - "mean_token_accuracy": 0.8192248344421387, - "num_tokens": 4651269.0, - "step": 511 - }, - { - "epoch": 0.3890577507598784, - "grad_norm": 2.424199104309082, - "learning_rate": 4.914543319209126e-06, - "loss": 0.5576270818710327, - "mean_token_accuracy": 0.8203302621841431, - "num_tokens": 4657296.0, - "step": 512 - }, - { - "epoch": 0.3898176291793313, - "grad_norm": 2.725156307220459, - "learning_rate": 4.913999555265062e-06, - "loss": 0.4337949752807617, - "mean_token_accuracy": 0.8382406234741211, - "num_tokens": 4661850.0, - "step": 513 - }, - { - "epoch": 0.3905775075987842, - "grad_norm": 2.3120534420013428, - "learning_rate": 4.913454097083185e-06, - "loss": 0.4941597580909729, - "mean_token_accuracy": 0.8302834033966064, - "num_tokens": 4667769.0, - "step": 514 - }, - { - "epoch": 0.3913373860182371, - "grad_norm": 2.3111207485198975, - "learning_rate": 4.912906945046319e-06, - "loss": 0.5253715515136719, - "mean_token_accuracy": 0.84515380859375, - "num_tokens": 4674537.0, - "step": 515 - }, - { - "epoch": 0.39209726443769, - "grad_norm": 1.4117841720581055, - "learning_rate": 4.912358099538476e-06, - "loss": 0.4521017074584961, - "mean_token_accuracy": 0.8208256959915161, - "num_tokens": 4690605.0, - "step": 516 - }, - { - "epoch": 0.39285714285714285, - "grad_norm": 2.3742799758911133, - "learning_rate": 4.911807560944858e-06, - "loss": 0.41572901606559753, - "mean_token_accuracy": 0.8550551533699036, - "num_tokens": 4706437.0, - "step": 517 - }, - { - "epoch": 0.39361702127659576, - "grad_norm": 2.4052202701568604, - "learning_rate": 4.911255329651852e-06, - "loss": 0.6003736257553101, - "mean_token_accuracy": 0.8247885704040527, - "num_tokens": 4712746.0, - "step": 518 - }, - { - "epoch": 0.3943768996960486, - "grad_norm": 1.9335490465164185, - "learning_rate": 4.910701406047037e-06, - "loss": 0.5457713603973389, - "mean_token_accuracy": 0.787429690361023, - "num_tokens": 4731937.0, - "step": 519 - }, - { - "epoch": 0.3951367781155015, - "grad_norm": 2.257706880569458, - "learning_rate": 4.910145790519177e-06, - "loss": 0.5300652980804443, - "mean_token_accuracy": 0.8192912936210632, - "num_tokens": 4739422.0, - "step": 520 - }, - { - "epoch": 0.3958966565349544, - "grad_norm": 1.2099462747573853, - "learning_rate": 4.9095884834582256e-06, - "loss": 0.45872747898101807, - "mean_token_accuracy": 0.8362667560577393, - "num_tokens": 4757113.0, - "step": 521 - }, - { - "epoch": 0.3966565349544073, - "grad_norm": 2.7991135120391846, - "learning_rate": 4.909029485255321e-06, - "loss": 0.49039560556411743, - "mean_token_accuracy": 0.8260016441345215, - "num_tokens": 4761709.0, - "step": 522 - }, - { - "epoch": 0.3974164133738602, - "grad_norm": 2.2360129356384277, - "learning_rate": 4.90846879630279e-06, - "loss": 0.49556830525398254, - "mean_token_accuracy": 0.827864408493042, - "num_tokens": 4769048.0, - "step": 523 - }, - { - "epoch": 0.3981762917933131, - "grad_norm": 2.5953688621520996, - "learning_rate": 4.907906416994146e-06, - "loss": 0.387208491563797, - "mean_token_accuracy": 0.8467001914978027, - "num_tokens": 4774637.0, - "step": 524 - }, - { - "epoch": 0.39893617021276595, - "grad_norm": 2.1046814918518066, - "learning_rate": 4.907342347724088e-06, - "loss": 0.5477259755134583, - "mean_token_accuracy": 0.8060322999954224, - "num_tokens": 4782774.0, - "step": 525 - }, - { - "epoch": 0.39969604863221886, - "grad_norm": 2.5622646808624268, - "learning_rate": 4.906776588888502e-06, - "loss": 0.5684159398078918, - "mean_token_accuracy": 0.8095303177833557, - "num_tokens": 4788766.0, - "step": 526 - }, - { - "epoch": 0.4004559270516717, - "grad_norm": 1.9027913808822632, - "learning_rate": 4.906209140884459e-06, - "loss": 0.535524845123291, - "mean_token_accuracy": 0.815237820148468, - "num_tokens": 4798492.0, - "step": 527 - }, - { - "epoch": 0.4012158054711246, - "grad_norm": 2.1447622776031494, - "learning_rate": 4.905640004110216e-06, - "loss": 0.5628632307052612, - "mean_token_accuracy": 0.8085395097732544, - "num_tokens": 4805737.0, - "step": 528 - }, - { - "epoch": 0.40197568389057753, - "grad_norm": 1.6754741668701172, - "learning_rate": 4.905069178965215e-06, - "loss": 0.5046736598014832, - "mean_token_accuracy": 0.8247535228729248, - "num_tokens": 4816912.0, - "step": 529 - }, - { - "epoch": 0.4027355623100304, - "grad_norm": 2.271230459213257, - "learning_rate": 4.904496665850083e-06, - "loss": 0.6086187958717346, - "mean_token_accuracy": 0.7935276627540588, - "num_tokens": 4824577.0, - "step": 530 - }, - { - "epoch": 0.4034954407294833, - "grad_norm": 2.107595205307007, - "learning_rate": 4.903922465166633e-06, - "loss": 0.5431341528892517, - "mean_token_accuracy": 0.8129537105560303, - "num_tokens": 4831772.0, - "step": 531 - }, - { - "epoch": 0.40425531914893614, - "grad_norm": 1.3860732316970825, - "learning_rate": 4.903346577317859e-06, - "loss": 0.45816320180892944, - "mean_token_accuracy": 0.8328287601470947, - "num_tokens": 4850302.0, - "step": 532 - }, - { - "epoch": 0.40501519756838905, - "grad_norm": 1.9186837673187256, - "learning_rate": 4.902769002707942e-06, - "loss": 0.3294633626937866, - "mean_token_accuracy": 0.8853933811187744, - "num_tokens": 4856624.0, - "step": 533 - }, - { - "epoch": 0.40577507598784196, - "grad_norm": 1.516194462776184, - "learning_rate": 4.902189741742247e-06, - "loss": 0.45482105016708374, - "mean_token_accuracy": 0.8370342254638672, - "num_tokens": 4870395.0, - "step": 534 - }, - { - "epoch": 0.4065349544072948, - "grad_norm": 2.3235628604888916, - "learning_rate": 4.901608794827321e-06, - "loss": 0.40688639879226685, - "mean_token_accuracy": 0.8643521666526794, - "num_tokens": 4875645.0, - "step": 535 - }, - { - "epoch": 0.4072948328267477, - "grad_norm": 2.29286527633667, - "learning_rate": 4.9010261623708945e-06, - "loss": 0.45482826232910156, - "mean_token_accuracy": 0.8429383039474487, - "num_tokens": 4881772.0, - "step": 536 - }, - { - "epoch": 0.40805471124620063, - "grad_norm": 1.5907070636749268, - "learning_rate": 4.900441844781882e-06, - "loss": 0.5266948342323303, - "mean_token_accuracy": 0.8348641395568848, - "num_tokens": 4894289.0, - "step": 537 - }, - { - "epoch": 0.4088145896656535, - "grad_norm": 2.1816294193267822, - "learning_rate": 4.89985584247038e-06, - "loss": 0.4797617793083191, - "mean_token_accuracy": 0.8549500703811646, - "num_tokens": 4901106.0, - "step": 538 - }, - { - "epoch": 0.4095744680851064, - "grad_norm": 1.7347146272659302, - "learning_rate": 4.899268155847667e-06, - "loss": 0.4754739999771118, - "mean_token_accuracy": 0.8278418183326721, - "num_tokens": 4912131.0, - "step": 539 - }, - { - "epoch": 0.41033434650455924, - "grad_norm": 2.0694527626037598, - "learning_rate": 4.898678785326205e-06, - "loss": 0.5071008801460266, - "mean_token_accuracy": 0.8157946467399597, - "num_tokens": 4921141.0, - "step": 540 - }, - { - "epoch": 0.41109422492401215, - "grad_norm": 2.570047616958618, - "learning_rate": 4.898087731319637e-06, - "loss": 0.43639278411865234, - "mean_token_accuracy": 0.8682913780212402, - "num_tokens": 4926182.0, - "step": 541 - }, - { - "epoch": 0.41185410334346506, - "grad_norm": 4.064006805419922, - "learning_rate": 4.8974949942427854e-06, - "loss": 0.539260745048523, - "mean_token_accuracy": 0.8225528001785278, - "num_tokens": 4929449.0, - "step": 542 - }, - { - "epoch": 0.4126139817629179, - "grad_norm": 1.7644332647323608, - "learning_rate": 4.896900574511657e-06, - "loss": 0.472618043422699, - "mean_token_accuracy": 0.8332902193069458, - "num_tokens": 4939443.0, - "step": 543 - }, - { - "epoch": 0.4133738601823708, - "grad_norm": 2.879918336868286, - "learning_rate": 4.89630447254344e-06, - "loss": 0.6360667943954468, - "mean_token_accuracy": 0.8215296268463135, - "num_tokens": 4950838.0, - "step": 544 - }, - { - "epoch": 0.41413373860182373, - "grad_norm": 1.4575570821762085, - "learning_rate": 4.8957066887565005e-06, - "loss": 0.45617997646331787, - "mean_token_accuracy": 0.8373187184333801, - "num_tokens": 4965222.0, - "step": 545 - }, - { - "epoch": 0.4148936170212766, - "grad_norm": 2.4829535484313965, - "learning_rate": 4.895107223570386e-06, - "loss": 0.42285341024398804, - "mean_token_accuracy": 0.8686380386352539, - "num_tokens": 4970724.0, - "step": 546 - }, - { - "epoch": 0.4156534954407295, - "grad_norm": 2.639474630355835, - "learning_rate": 4.894506077405824e-06, - "loss": 0.5906289219856262, - "mean_token_accuracy": 0.8174435496330261, - "num_tokens": 4976766.0, - "step": 547 - }, - { - "epoch": 0.41641337386018235, - "grad_norm": 2.7960562705993652, - "learning_rate": 4.893903250684723e-06, - "loss": 0.4518949091434479, - "mean_token_accuracy": 0.8387585282325745, - "num_tokens": 4980991.0, - "step": 548 - }, - { - "epoch": 0.41717325227963525, - "grad_norm": 2.184176206588745, - "learning_rate": 4.893298743830168e-06, - "loss": 0.5223842859268188, - "mean_token_accuracy": 0.8170937299728394, - "num_tokens": 4987781.0, - "step": 549 - }, - { - "epoch": 0.41793313069908816, - "grad_norm": 2.2393438816070557, - "learning_rate": 4.892692557266429e-06, - "loss": 0.5238431692123413, - "mean_token_accuracy": 0.8217905759811401, - "num_tokens": 4994321.0, - "step": 550 - }, - { - "epoch": 0.418693009118541, - "grad_norm": 3.579047441482544, - "learning_rate": 4.8920846914189465e-06, - "loss": 0.5367584228515625, - "mean_token_accuracy": 0.8312011361122131, - "num_tokens": 4997951.0, - "step": 551 - }, - { - "epoch": 0.4194528875379939, - "grad_norm": 1.6330240964889526, - "learning_rate": 4.891475146714348e-06, - "loss": 0.6054705381393433, - "mean_token_accuracy": 0.7938206791877747, - "num_tokens": 5012726.0, - "step": 552 - }, - { - "epoch": 0.42021276595744683, - "grad_norm": 1.5775716304779053, - "learning_rate": 4.8908639235804324e-06, - "loss": 0.4774656891822815, - "mean_token_accuracy": 0.828762948513031, - "num_tokens": 5026751.0, - "step": 553 - }, - { - "epoch": 0.4209726443768997, - "grad_norm": 1.5719101428985596, - "learning_rate": 4.890251022446181e-06, - "loss": 0.549429178237915, - "mean_token_accuracy": 0.8110791444778442, - "num_tokens": 5041861.0, - "step": 554 - }, - { - "epoch": 0.4217325227963526, - "grad_norm": 1.8585275411605835, - "learning_rate": 4.889636443741752e-06, - "loss": 0.4448118805885315, - "mean_token_accuracy": 0.8462690711021423, - "num_tokens": 5052690.0, - "step": 555 - }, - { - "epoch": 0.42249240121580545, - "grad_norm": 2.189202070236206, - "learning_rate": 4.88902018789848e-06, - "loss": 0.4296762943267822, - "mean_token_accuracy": 0.8488791584968567, - "num_tokens": 5058964.0, - "step": 556 - }, - { - "epoch": 0.42325227963525835, - "grad_norm": 1.9328460693359375, - "learning_rate": 4.888402255348877e-06, - "loss": 0.5369474291801453, - "mean_token_accuracy": 0.8184729814529419, - "num_tokens": 5068465.0, - "step": 557 - }, - { - "epoch": 0.42401215805471126, - "grad_norm": 1.6233323812484741, - "learning_rate": 4.887782646526631e-06, - "loss": 0.5284391641616821, - "mean_token_accuracy": 0.8276044726371765, - "num_tokens": 5081052.0, - "step": 558 - }, - { - "epoch": 0.4247720364741641, - "grad_norm": 2.222813844680786, - "learning_rate": 4.887161361866608e-06, - "loss": 0.5679137706756592, - "mean_token_accuracy": 0.8012375831604004, - "num_tokens": 5090001.0, - "step": 559 - }, - { - "epoch": 0.425531914893617, - "grad_norm": 2.1062207221984863, - "learning_rate": 4.8865384018048494e-06, - "loss": 0.5554201602935791, - "mean_token_accuracy": 0.8128066062927246, - "num_tokens": 5097644.0, - "step": 560 - }, - { - "epoch": 0.42629179331306993, - "grad_norm": 1.5380984544754028, - "learning_rate": 4.8859137667785735e-06, - "loss": 0.4948265850543976, - "mean_token_accuracy": 0.8258291482925415, - "num_tokens": 5110069.0, - "step": 561 - }, - { - "epoch": 0.4270516717325228, - "grad_norm": 2.0290257930755615, - "learning_rate": 4.8852874572261715e-06, - "loss": 0.4969530403614044, - "mean_token_accuracy": 0.8297134637832642, - "num_tokens": 5117452.0, - "step": 562 - }, - { - "epoch": 0.4278115501519757, - "grad_norm": 1.5651452541351318, - "learning_rate": 4.884659473587213e-06, - "loss": 0.5353102087974548, - "mean_token_accuracy": 0.8161719441413879, - "num_tokens": 5133756.0, - "step": 563 - }, - { - "epoch": 0.42857142857142855, - "grad_norm": 2.2470998764038086, - "learning_rate": 4.884029816302441e-06, - "loss": 0.5104288458824158, - "mean_token_accuracy": 0.8081635236740112, - "num_tokens": 5140278.0, - "step": 564 - }, - { - "epoch": 0.42933130699088146, - "grad_norm": 1.726891279220581, - "learning_rate": 4.883398485813772e-06, - "loss": 0.4508771002292633, - "mean_token_accuracy": 0.8548800349235535, - "num_tokens": 5150115.0, - "step": 565 - }, - { - "epoch": 0.43009118541033436, - "grad_norm": 1.4779289960861206, - "learning_rate": 4.8827654825642984e-06, - "loss": 0.46861088275909424, - "mean_token_accuracy": 0.8209476470947266, - "num_tokens": 5163225.0, - "step": 566 - }, - { - "epoch": 0.4308510638297872, - "grad_norm": 1.2361034154891968, - "learning_rate": 4.882130806998287e-06, - "loss": 0.4591076672077179, - "mean_token_accuracy": 0.803041934967041, - "num_tokens": 5180342.0, - "step": 567 - }, - { - "epoch": 0.4316109422492401, - "grad_norm": 1.882467269897461, - "learning_rate": 4.881494459561177e-06, - "loss": 0.579258143901825, - "mean_token_accuracy": 0.8007112741470337, - "num_tokens": 5189595.0, - "step": 568 - }, - { - "epoch": 0.43237082066869303, - "grad_norm": 1.095462441444397, - "learning_rate": 4.880856440699582e-06, - "loss": 0.3806574046611786, - "mean_token_accuracy": 0.8650111556053162, - "num_tokens": 5211642.0, - "step": 569 - }, - { - "epoch": 0.4331306990881459, - "grad_norm": 1.6469846963882446, - "learning_rate": 4.880216750861288e-06, - "loss": 0.544589638710022, - "mean_token_accuracy": 0.8060122728347778, - "num_tokens": 5224137.0, - "step": 570 - }, - { - "epoch": 0.4338905775075988, - "grad_norm": 1.8561251163482666, - "learning_rate": 4.879575390495254e-06, - "loss": 0.4094924330711365, - "mean_token_accuracy": 0.8591406345367432, - "num_tokens": 5231588.0, - "step": 571 - }, - { - "epoch": 0.43465045592705165, - "grad_norm": 3.01326847076416, - "learning_rate": 4.878932360051611e-06, - "loss": 0.6139192581176758, - "mean_token_accuracy": 0.8108739852905273, - "num_tokens": 5236853.0, - "step": 572 - }, - { - "epoch": 0.43541033434650456, - "grad_norm": 2.1753034591674805, - "learning_rate": 4.878287659981663e-06, - "loss": 0.49082931876182556, - "mean_token_accuracy": 0.862828254699707, - "num_tokens": 5243264.0, - "step": 573 - }, - { - "epoch": 0.43617021276595747, - "grad_norm": 1.4437755346298218, - "learning_rate": 4.8776412907378845e-06, - "loss": 0.5608728528022766, - "mean_token_accuracy": 0.8271626234054565, - "num_tokens": 5261757.0, - "step": 574 - }, - { - "epoch": 0.4369300911854103, - "grad_norm": 1.786683440208435, - "learning_rate": 4.876993252773923e-06, - "loss": 0.4377627968788147, - "mean_token_accuracy": 0.844936192035675, - "num_tokens": 5271038.0, - "step": 575 - }, - { - "epoch": 0.4376899696048632, - "grad_norm": 1.3425915241241455, - "learning_rate": 4.876343546544596e-06, - "loss": 0.44762521982192993, - "mean_token_accuracy": 0.8397793769836426, - "num_tokens": 5285555.0, - "step": 576 - }, - { - "epoch": 0.43844984802431614, - "grad_norm": 2.1549675464630127, - "learning_rate": 4.8756921725058935e-06, - "loss": 0.5332942008972168, - "mean_token_accuracy": 0.820149302482605, - "num_tokens": 5294595.0, - "step": 577 - }, - { - "epoch": 0.439209726443769, - "grad_norm": 1.5254042148590088, - "learning_rate": 4.875039131114975e-06, - "loss": 0.3646543622016907, - "mean_token_accuracy": 0.8442583084106445, - "num_tokens": 5304955.0, - "step": 578 - }, - { - "epoch": 0.4399696048632219, - "grad_norm": 1.5751557350158691, - "learning_rate": 4.8743844228301676e-06, - "loss": 0.4854734539985657, - "mean_token_accuracy": 0.8317523002624512, - "num_tokens": 5317351.0, - "step": 579 - }, - { - "epoch": 0.44072948328267475, - "grad_norm": 1.6950466632843018, - "learning_rate": 4.873728048110973e-06, - "loss": 0.5907570719718933, - "mean_token_accuracy": 0.7946986556053162, - "num_tokens": 5332542.0, - "step": 580 - }, - { - "epoch": 0.44148936170212766, - "grad_norm": 2.1180708408355713, - "learning_rate": 4.873070007418059e-06, - "loss": 0.5220296382904053, - "mean_token_accuracy": 0.8037363290786743, - "num_tokens": 5341722.0, - "step": 581 - }, - { - "epoch": 0.44224924012158057, - "grad_norm": 1.3643816709518433, - "learning_rate": 4.872410301213265e-06, - "loss": 0.4865502417087555, - "mean_token_accuracy": 0.8377852439880371, - "num_tokens": 5359359.0, - "step": 582 - }, - { - "epoch": 0.4430091185410334, - "grad_norm": 1.483280897140503, - "learning_rate": 4.871748929959598e-06, - "loss": 0.36856764554977417, - "mean_token_accuracy": 0.8709549903869629, - "num_tokens": 5369749.0, - "step": 583 - }, - { - "epoch": 0.44376899696048633, - "grad_norm": 1.6891541481018066, - "learning_rate": 4.871085894121234e-06, - "loss": 0.5768930912017822, - "mean_token_accuracy": 0.8030461668968201, - "num_tokens": 5383912.0, - "step": 584 - }, - { - "epoch": 0.44452887537993924, - "grad_norm": 2.1318740844726562, - "learning_rate": 4.870421194163515e-06, - "loss": 0.4337100386619568, - "mean_token_accuracy": 0.8562518358230591, - "num_tokens": 5389412.0, - "step": 585 - }, - { - "epoch": 0.4452887537993921, - "grad_norm": 2.540255546569824, - "learning_rate": 4.869754830552956e-06, - "loss": 0.4708256125450134, - "mean_token_accuracy": 0.8446552753448486, - "num_tokens": 5394762.0, - "step": 586 - }, - { - "epoch": 0.446048632218845, - "grad_norm": 2.048015594482422, - "learning_rate": 4.869086803757235e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8181137442588806, - "num_tokens": 5402379.0, - "step": 587 - }, - { - "epoch": 0.44680851063829785, - "grad_norm": 2.9821012020111084, - "learning_rate": 4.868417114245199e-06, - "loss": 0.6299797296524048, - "mean_token_accuracy": 0.8237329125404358, - "num_tokens": 5408229.0, - "step": 588 - }, - { - "epoch": 0.44756838905775076, - "grad_norm": 1.7807202339172363, - "learning_rate": 4.867745762486862e-06, - "loss": 0.5176759958267212, - "mean_token_accuracy": 0.8184244632720947, - "num_tokens": 5418383.0, - "step": 589 - }, - { - "epoch": 0.44832826747720367, - "grad_norm": 1.5466399192810059, - "learning_rate": 4.8670727489534035e-06, - "loss": 0.5137228965759277, - "mean_token_accuracy": 0.8365053534507751, - "num_tokens": 5432127.0, - "step": 590 - }, - { - "epoch": 0.4490881458966565, - "grad_norm": 2.9521141052246094, - "learning_rate": 4.866398074117173e-06, - "loss": 0.4056887924671173, - "mean_token_accuracy": 0.8561501502990723, - "num_tokens": 5436062.0, - "step": 591 - }, - { - "epoch": 0.44984802431610943, - "grad_norm": 2.058743953704834, - "learning_rate": 4.86572173845168e-06, - "loss": 0.6124799251556396, - "mean_token_accuracy": 0.8007957339286804, - "num_tokens": 5444989.0, - "step": 592 - }, - { - "epoch": 0.4506079027355623, - "grad_norm": 2.1243767738342285, - "learning_rate": 4.865043742431605e-06, - "loss": 0.5659694671630859, - "mean_token_accuracy": 0.8084750175476074, - "num_tokens": 5453865.0, - "step": 593 - }, - { - "epoch": 0.4513677811550152, - "grad_norm": 1.6732314825057983, - "learning_rate": 4.864364086532792e-06, - "loss": 0.47879064083099365, - "mean_token_accuracy": 0.8346436023712158, - "num_tokens": 5466398.0, - "step": 594 - }, - { - "epoch": 0.4521276595744681, - "grad_norm": 1.3793858289718628, - "learning_rate": 4.863682771232249e-06, - "loss": 0.45989373326301575, - "mean_token_accuracy": 0.8254791498184204, - "num_tokens": 5482121.0, - "step": 595 - }, - { - "epoch": 0.45288753799392095, - "grad_norm": 1.9812315702438354, - "learning_rate": 4.862999797008149e-06, - "loss": 0.5778874754905701, - "mean_token_accuracy": 0.8041508197784424, - "num_tokens": 5493000.0, - "step": 596 - }, - { - "epoch": 0.45364741641337386, - "grad_norm": 3.3065083026885986, - "learning_rate": 4.862315164339829e-06, - "loss": 0.4623975157737732, - "mean_token_accuracy": 0.8426318168640137, - "num_tokens": 5496723.0, - "step": 597 - }, - { - "epoch": 0.45440729483282677, - "grad_norm": 3.167119026184082, - "learning_rate": 4.861628873707792e-06, - "loss": 0.6984533667564392, - "mean_token_accuracy": 0.772136926651001, - "num_tokens": 5501161.0, - "step": 598 - }, - { - "epoch": 0.4551671732522796, - "grad_norm": 2.2130985260009766, - "learning_rate": 4.860940925593703e-06, - "loss": 0.4823192059993744, - "mean_token_accuracy": 0.8462972640991211, - "num_tokens": 5509544.0, - "step": 599 - }, - { - "epoch": 0.45592705167173253, - "grad_norm": 3.029191732406616, - "learning_rate": 4.86025132048039e-06, - "loss": 0.523664116859436, - "mean_token_accuracy": 0.8229140043258667, - "num_tokens": 5514586.0, - "step": 600 - }, - { - "epoch": 0.4566869300911854, - "grad_norm": 1.6983962059020996, - "learning_rate": 4.859560058851844e-06, - "loss": 0.4832698106765747, - "mean_token_accuracy": 0.8403248190879822, - "num_tokens": 5525773.0, - "step": 601 - }, - { - "epoch": 0.4574468085106383, - "grad_norm": 3.0504038333892822, - "learning_rate": 4.8588671411932195e-06, - "loss": 0.5158926248550415, - "mean_token_accuracy": 0.8098392486572266, - "num_tokens": 5529739.0, - "step": 602 - }, - { - "epoch": 0.4582066869300912, - "grad_norm": 2.584836483001709, - "learning_rate": 4.858172567990832e-06, - "loss": 0.5724587440490723, - "mean_token_accuracy": 0.8128519058227539, - "num_tokens": 5535763.0, - "step": 603 - }, - { - "epoch": 0.45896656534954405, - "grad_norm": 2.0514042377471924, - "learning_rate": 4.857476339732162e-06, - "loss": 0.4337679445743561, - "mean_token_accuracy": 0.8405929207801819, - "num_tokens": 5543075.0, - "step": 604 - }, - { - "epoch": 0.45972644376899696, - "grad_norm": 2.2949347496032715, - "learning_rate": 4.856778456905846e-06, - "loss": 0.46532145142555237, - "mean_token_accuracy": 0.8345137238502502, - "num_tokens": 5549035.0, - "step": 605 - }, - { - "epoch": 0.46048632218844987, - "grad_norm": 2.2067551612854004, - "learning_rate": 4.856078920001689e-06, - "loss": 0.5855136513710022, - "mean_token_accuracy": 0.8043795228004456, - "num_tokens": 5555545.0, - "step": 606 - }, - { - "epoch": 0.4612462006079027, - "grad_norm": 2.101945161819458, - "learning_rate": 4.855377729510648e-06, - "loss": 0.6071814298629761, - "mean_token_accuracy": 0.7973253130912781, - "num_tokens": 5563615.0, - "step": 607 - }, - { - "epoch": 0.46200607902735563, - "grad_norm": 2.5958821773529053, - "learning_rate": 4.8546748859248504e-06, - "loss": 0.6278061866760254, - "mean_token_accuracy": 0.7864972352981567, - "num_tokens": 5570078.0, - "step": 608 - }, - { - "epoch": 0.4627659574468085, - "grad_norm": 2.778101921081543, - "learning_rate": 4.853970389737576e-06, - "loss": 0.35521194338798523, - "mean_token_accuracy": 0.8752605319023132, - "num_tokens": 5573995.0, - "step": 609 - }, - { - "epoch": 0.4635258358662614, - "grad_norm": 2.600534677505493, - "learning_rate": 4.8532642414432675e-06, - "loss": 0.6541563868522644, - "mean_token_accuracy": 0.7843613028526306, - "num_tokens": 5580333.0, - "step": 610 - }, - { - "epoch": 0.4642857142857143, - "grad_norm": 1.778337836265564, - "learning_rate": 4.852556441537528e-06, - "loss": 0.3561405837535858, - "mean_token_accuracy": 0.8579353094100952, - "num_tokens": 5588430.0, - "step": 611 - }, - { - "epoch": 0.46504559270516715, - "grad_norm": 1.5653862953186035, - "learning_rate": 4.851846990517118e-06, - "loss": 0.6067906618118286, - "mean_token_accuracy": 0.7919317483901978, - "num_tokens": 5601700.0, - "step": 612 - }, - { - "epoch": 0.46580547112462006, - "grad_norm": 1.6097723245620728, - "learning_rate": 4.851135888879958e-06, - "loss": 0.446664422750473, - "mean_token_accuracy": 0.8441969156265259, - "num_tokens": 5612063.0, - "step": 613 - }, - { - "epoch": 0.46656534954407297, - "grad_norm": 1.961207389831543, - "learning_rate": 4.850423137125126e-06, - "loss": 0.5508605241775513, - "mean_token_accuracy": 0.8240450024604797, - "num_tokens": 5620245.0, - "step": 614 - }, - { - "epoch": 0.4673252279635258, - "grad_norm": 2.2189085483551025, - "learning_rate": 4.8497087357528585e-06, - "loss": 0.6805076599121094, - "mean_token_accuracy": 0.771978497505188, - "num_tokens": 5629590.0, - "step": 615 - }, - { - "epoch": 0.46808510638297873, - "grad_norm": 2.5176279544830322, - "learning_rate": 4.8489926852645505e-06, - "loss": 0.4512156844139099, - "mean_token_accuracy": 0.836459755897522, - "num_tokens": 5635259.0, - "step": 616 - }, - { - "epoch": 0.4688449848024316, - "grad_norm": 1.5327287912368774, - "learning_rate": 4.848274986162754e-06, - "loss": 0.4884302616119385, - "mean_token_accuracy": 0.8194037079811096, - "num_tokens": 5649993.0, - "step": 617 - }, - { - "epoch": 0.4696048632218845, - "grad_norm": 2.184554100036621, - "learning_rate": 4.847555638951177e-06, - "loss": 0.5141451358795166, - "mean_token_accuracy": 0.8245922327041626, - "num_tokens": 5657375.0, - "step": 618 - }, - { - "epoch": 0.4703647416413374, - "grad_norm": 1.6143407821655273, - "learning_rate": 4.846834644134686e-06, - "loss": 0.4276641607284546, - "mean_token_accuracy": 0.8481845855712891, - "num_tokens": 5667941.0, - "step": 619 - }, - { - "epoch": 0.47112462006079026, - "grad_norm": 2.3747270107269287, - "learning_rate": 4.846112002219301e-06, - "loss": 0.5608246922492981, - "mean_token_accuracy": 0.8073011040687561, - "num_tokens": 5675042.0, - "step": 620 - }, - { - "epoch": 0.47188449848024316, - "grad_norm": 2.390404224395752, - "learning_rate": 4.845387713712203e-06, - "loss": 0.46616724133491516, - "mean_token_accuracy": 0.8468319177627563, - "num_tokens": 5680207.0, - "step": 621 - }, - { - "epoch": 0.4726443768996961, - "grad_norm": 1.7245099544525146, - "learning_rate": 4.844661779121723e-06, - "loss": 0.5652435421943665, - "mean_token_accuracy": 0.8010749816894531, - "num_tokens": 5693759.0, - "step": 622 - }, - { - "epoch": 0.4734042553191489, - "grad_norm": 2.6923108100891113, - "learning_rate": 4.843934198957351e-06, - "loss": 0.6254661679267883, - "mean_token_accuracy": 0.8236024975776672, - "num_tokens": 5699916.0, - "step": 623 - }, - { - "epoch": 0.47416413373860183, - "grad_norm": 2.516901969909668, - "learning_rate": 4.84320497372973e-06, - "loss": 0.6334252953529358, - "mean_token_accuracy": 0.7803834676742554, - "num_tokens": 5706554.0, - "step": 624 - }, - { - "epoch": 0.4749240121580547, - "grad_norm": 2.3744447231292725, - "learning_rate": 4.842474103950658e-06, - "loss": 0.4221811890602112, - "mean_token_accuracy": 0.8639545440673828, - "num_tokens": 5711756.0, - "step": 625 - }, - { - "epoch": 0.4756838905775076, - "grad_norm": 3.2373476028442383, - "learning_rate": 4.841741590133089e-06, - "loss": 0.6637828946113586, - "mean_token_accuracy": 0.7968347072601318, - "num_tokens": 5716458.0, - "step": 626 - }, - { - "epoch": 0.4764437689969605, - "grad_norm": 2.153888463973999, - "learning_rate": 4.841007432791129e-06, - "loss": 0.4877486228942871, - "mean_token_accuracy": 0.8345249891281128, - "num_tokens": 5723155.0, - "step": 627 - }, - { - "epoch": 0.47720364741641336, - "grad_norm": 2.120497703552246, - "learning_rate": 4.8402716324400375e-06, - "loss": 0.37323033809661865, - "mean_token_accuracy": 0.8734050393104553, - "num_tokens": 5729171.0, - "step": 628 - }, - { - "epoch": 0.47796352583586627, - "grad_norm": 1.5294172763824463, - "learning_rate": 4.839534189596228e-06, - "loss": 0.4057067334651947, - "mean_token_accuracy": 0.8523319959640503, - "num_tokens": 5740112.0, - "step": 629 - }, - { - "epoch": 0.4787234042553192, - "grad_norm": 2.1913886070251465, - "learning_rate": 4.8387951047772656e-06, - "loss": 0.4835960865020752, - "mean_token_accuracy": 0.8438145518302917, - "num_tokens": 5746838.0, - "step": 630 - }, - { - "epoch": 0.479483282674772, - "grad_norm": 1.482897162437439, - "learning_rate": 4.838054378501868e-06, - "loss": 0.46967992186546326, - "mean_token_accuracy": 0.8315759897232056, - "num_tokens": 5760428.0, - "step": 631 - }, - { - "epoch": 0.48024316109422494, - "grad_norm": 1.38850998878479, - "learning_rate": 4.837312011289907e-06, - "loss": 0.41845446825027466, - "mean_token_accuracy": 0.8557186126708984, - "num_tokens": 5773437.0, - "step": 632 - }, - { - "epoch": 0.4810030395136778, - "grad_norm": 3.8337457180023193, - "learning_rate": 4.836568003662403e-06, - "loss": 0.5102912187576294, - "mean_token_accuracy": 0.830644965171814, - "num_tokens": 5776367.0, - "step": 633 - }, - { - "epoch": 0.4817629179331307, - "grad_norm": 1.2084007263183594, - "learning_rate": 4.8358223561415304e-06, - "loss": 0.3835333585739136, - "mean_token_accuracy": 0.8639016151428223, - "num_tokens": 5792246.0, - "step": 634 - }, - { - "epoch": 0.4825227963525836, - "grad_norm": 1.939408540725708, - "learning_rate": 4.835075069250613e-06, - "loss": 0.4044850468635559, - "mean_token_accuracy": 0.8488376140594482, - "num_tokens": 5799853.0, - "step": 635 - }, - { - "epoch": 0.48328267477203646, - "grad_norm": 1.345870852470398, - "learning_rate": 4.8343261435141245e-06, - "loss": 0.46660199761390686, - "mean_token_accuracy": 0.8371681571006775, - "num_tokens": 5817478.0, - "step": 636 - }, - { - "epoch": 0.48404255319148937, - "grad_norm": 1.6531339883804321, - "learning_rate": 4.833575579457691e-06, - "loss": 0.3886989951133728, - "mean_token_accuracy": 0.8763507008552551, - "num_tokens": 5825739.0, - "step": 637 - }, - { - "epoch": 0.4848024316109423, - "grad_norm": 1.6443969011306763, - "learning_rate": 4.832823377608088e-06, - "loss": 0.4070289731025696, - "mean_token_accuracy": 0.8586630821228027, - "num_tokens": 5837917.0, - "step": 638 - }, - { - "epoch": 0.48556231003039513, - "grad_norm": 2.005136013031006, - "learning_rate": 4.832069538493237e-06, - "loss": 0.40616685152053833, - "mean_token_accuracy": 0.8571510314941406, - "num_tokens": 5845250.0, - "step": 639 - }, - { - "epoch": 0.48632218844984804, - "grad_norm": 1.5244266986846924, - "learning_rate": 4.831314062642213e-06, - "loss": 0.49530288577079773, - "mean_token_accuracy": 0.8328841924667358, - "num_tokens": 5857407.0, - "step": 640 - }, - { - "epoch": 0.4870820668693009, - "grad_norm": 1.9876971244812012, - "learning_rate": 4.830556950585239e-06, - "loss": 0.4583776593208313, - "mean_token_accuracy": 0.8427221179008484, - "num_tokens": 5865391.0, - "step": 641 - }, - { - "epoch": 0.4878419452887538, - "grad_norm": 3.023336172103882, - "learning_rate": 4.829798202853683e-06, - "loss": 0.6134771108627319, - "mean_token_accuracy": 0.7981935739517212, - "num_tokens": 5870729.0, - "step": 642 - }, - { - "epoch": 0.4886018237082067, - "grad_norm": 1.8889515399932861, - "learning_rate": 4.829037819980065e-06, - "loss": 0.4420135021209717, - "mean_token_accuracy": 0.8480775356292725, - "num_tokens": 5878982.0, - "step": 643 - }, - { - "epoch": 0.48936170212765956, - "grad_norm": 2.2408435344696045, - "learning_rate": 4.828275802498051e-06, - "loss": 0.525706946849823, - "mean_token_accuracy": 0.8271557092666626, - "num_tokens": 5885097.0, - "step": 644 - }, - { - "epoch": 0.49012158054711247, - "grad_norm": 1.9734224081039429, - "learning_rate": 4.827512150942454e-06, - "loss": 0.44246578216552734, - "mean_token_accuracy": 0.8456668257713318, - "num_tokens": 5893941.0, - "step": 645 - }, - { - "epoch": 0.4908814589665654, - "grad_norm": 1.9618173837661743, - "learning_rate": 4.8267468658492335e-06, - "loss": 0.5119768381118774, - "mean_token_accuracy": 0.8355510830879211, - "num_tokens": 5902829.0, - "step": 646 - }, - { - "epoch": 0.49164133738601823, - "grad_norm": 1.7181587219238281, - "learning_rate": 4.825979947755496e-06, - "loss": 0.5666520595550537, - "mean_token_accuracy": 0.7951971888542175, - "num_tokens": 5915212.0, - "step": 647 - }, - { - "epoch": 0.49240121580547114, - "grad_norm": 3.0121164321899414, - "learning_rate": 4.8252113971994955e-06, - "loss": 0.628632128238678, - "mean_token_accuracy": 0.8041050434112549, - "num_tokens": 5921410.0, - "step": 648 - }, - { - "epoch": 0.493161094224924, - "grad_norm": 2.9980475902557373, - "learning_rate": 4.824441214720629e-06, - "loss": 0.4507424831390381, - "mean_token_accuracy": 0.8636263608932495, - "num_tokens": 5925179.0, - "step": 649 - }, - { - "epoch": 0.4939209726443769, - "grad_norm": 2.0096445083618164, - "learning_rate": 4.823669400859441e-06, - "loss": 0.602759838104248, - "mean_token_accuracy": 0.8104915618896484, - "num_tokens": 5934160.0, - "step": 650 - }, - { - "epoch": 0.4946808510638298, - "grad_norm": 1.1186442375183105, - "learning_rate": 4.8228959561576195e-06, - "loss": 0.41168469190597534, - "mean_token_accuracy": 0.8461419939994812, - "num_tokens": 5954163.0, - "step": 651 - }, - { - "epoch": 0.49544072948328266, - "grad_norm": 1.855465054512024, - "learning_rate": 4.822120881157998e-06, - "loss": 0.5049735307693481, - "mean_token_accuracy": 0.8225747346878052, - "num_tokens": 5963840.0, - "step": 652 - }, - { - "epoch": 0.49620060790273557, - "grad_norm": 3.550563335418701, - "learning_rate": 4.821344176404554e-06, - "loss": 0.49025264382362366, - "mean_token_accuracy": 0.8265978693962097, - "num_tokens": 5967358.0, - "step": 653 - }, - { - "epoch": 0.4969604863221885, - "grad_norm": 3.063910484313965, - "learning_rate": 4.820565842442408e-06, - "loss": 0.5652767419815063, - "mean_token_accuracy": 0.811700701713562, - "num_tokens": 5971858.0, - "step": 654 - }, - { - "epoch": 0.49772036474164133, - "grad_norm": 2.4613308906555176, - "learning_rate": 4.819785879817827e-06, - "loss": 0.5296125411987305, - "mean_token_accuracy": 0.8336488008499146, - "num_tokens": 5977442.0, - "step": 655 - }, - { - "epoch": 0.49848024316109424, - "grad_norm": 2.342519760131836, - "learning_rate": 4.819004289078217e-06, - "loss": 0.5753380060195923, - "mean_token_accuracy": 0.7922406792640686, - "num_tokens": 5984531.0, - "step": 656 - }, - { - "epoch": 0.4992401215805471, - "grad_norm": 2.0410680770874023, - "learning_rate": 4.818221070772129e-06, - "loss": 0.5433275699615479, - "mean_token_accuracy": 0.8043830990791321, - "num_tokens": 5992642.0, - "step": 657 - }, - { - "epoch": 0.5, - "grad_norm": 1.4999698400497437, - "learning_rate": 4.8174362254492555e-06, - "loss": 0.5248899459838867, - "mean_token_accuracy": 0.8107168674468994, - "num_tokens": 6005543.0, - "step": 658 - }, - { - "epoch": 0.5007598784194529, - "grad_norm": 1.9494401216506958, - "learning_rate": 4.816649753660431e-06, - "loss": 0.41291385889053345, - "mean_token_accuracy": 0.8650569915771484, - "num_tokens": 6012185.0, - "step": 659 - }, - { - "epoch": 0.5015197568389058, - "grad_norm": 2.7514095306396484, - "learning_rate": 4.815861655957632e-06, - "loss": 0.4244142770767212, - "mean_token_accuracy": 0.8485112190246582, - "num_tokens": 6016809.0, - "step": 660 - }, - { - "epoch": 0.5022796352583586, - "grad_norm": 1.4354928731918335, - "learning_rate": 4.815071932893976e-06, - "loss": 0.4332060217857361, - "mean_token_accuracy": 0.8386815786361694, - "num_tokens": 6034795.0, - "step": 661 - }, - { - "epoch": 0.5030395136778115, - "grad_norm": 1.3113417625427246, - "learning_rate": 4.81428058502372e-06, - "loss": 0.5415540933609009, - "mean_token_accuracy": 0.8115285038948059, - "num_tokens": 6053624.0, - "step": 662 - }, - { - "epoch": 0.5037993920972644, - "grad_norm": 1.820868730545044, - "learning_rate": 4.813487612902265e-06, - "loss": 0.5360245108604431, - "mean_token_accuracy": 0.8313555717468262, - "num_tokens": 6063399.0, - "step": 663 - }, - { - "epoch": 0.5045592705167173, - "grad_norm": 2.347001552581787, - "learning_rate": 4.812693017086145e-06, - "loss": 0.4926982820034027, - "mean_token_accuracy": 0.8137006759643555, - "num_tokens": 6070111.0, - "step": 664 - }, - { - "epoch": 0.5053191489361702, - "grad_norm": 1.8830888271331787, - "learning_rate": 4.811896798133042e-06, - "loss": 0.5419014692306519, - "mean_token_accuracy": 0.8027454614639282, - "num_tokens": 6081090.0, - "step": 665 - }, - { - "epoch": 0.506079027355623, - "grad_norm": 2.3258056640625, - "learning_rate": 4.811098956601772e-06, - "loss": 0.4629337787628174, - "mean_token_accuracy": 0.8416580557823181, - "num_tokens": 6087921.0, - "step": 666 - }, - { - "epoch": 0.506838905775076, - "grad_norm": 1.9578291177749634, - "learning_rate": 4.810299493052289e-06, - "loss": 0.40305402874946594, - "mean_token_accuracy": 0.8529061079025269, - "num_tokens": 6100034.0, - "step": 667 - }, - { - "epoch": 0.5075987841945289, - "grad_norm": 2.800635576248169, - "learning_rate": 4.809498408045691e-06, - "loss": 0.5087342262268066, - "mean_token_accuracy": 0.8214689493179321, - "num_tokens": 6104742.0, - "step": 668 - }, - { - "epoch": 0.5083586626139818, - "grad_norm": 1.5318149328231812, - "learning_rate": 4.808695702144206e-06, - "loss": 0.4733222723007202, - "mean_token_accuracy": 0.837577223777771, - "num_tokens": 6117242.0, - "step": 669 - }, - { - "epoch": 0.5091185410334347, - "grad_norm": 1.2368661165237427, - "learning_rate": 4.807891375911207e-06, - "loss": 0.3929097056388855, - "mean_token_accuracy": 0.8331400752067566, - "num_tokens": 6133509.0, - "step": 670 - }, - { - "epoch": 0.5098784194528876, - "grad_norm": 2.4711415767669678, - "learning_rate": 4.8070854299112e-06, - "loss": 0.6294851303100586, - "mean_token_accuracy": 0.7956781983375549, - "num_tokens": 6140294.0, - "step": 671 - }, - { - "epoch": 0.5106382978723404, - "grad_norm": 2.590961217880249, - "learning_rate": 4.806277864709828e-06, - "loss": 0.580160915851593, - "mean_token_accuracy": 0.809589684009552, - "num_tokens": 6145803.0, - "step": 672 - }, - { - "epoch": 0.5113981762917933, - "grad_norm": 2.4653842449188232, - "learning_rate": 4.805468680873874e-06, - "loss": 0.5262120366096497, - "mean_token_accuracy": 0.822458803653717, - "num_tokens": 6151236.0, - "step": 673 - }, - { - "epoch": 0.5121580547112462, - "grad_norm": 2.860720157623291, - "learning_rate": 4.804657878971252e-06, - "loss": 0.4007391035556793, - "mean_token_accuracy": 0.8637382984161377, - "num_tokens": 6155310.0, - "step": 674 - }, - { - "epoch": 0.5129179331306991, - "grad_norm": 2.520282030105591, - "learning_rate": 4.803845459571014e-06, - "loss": 0.45798182487487793, - "mean_token_accuracy": 0.8270114660263062, - "num_tokens": 6160326.0, - "step": 675 - }, - { - "epoch": 0.513677811550152, - "grad_norm": 2.7290921211242676, - "learning_rate": 4.803031423243349e-06, - "loss": 0.5745848417282104, - "mean_token_accuracy": 0.8401234745979309, - "num_tokens": 6165709.0, - "step": 676 - }, - { - "epoch": 0.5144376899696048, - "grad_norm": 1.6678650379180908, - "learning_rate": 4.802215770559578e-06, - "loss": 0.5257721543312073, - "mean_token_accuracy": 0.8241991996765137, - "num_tokens": 6177875.0, - "step": 677 - }, - { - "epoch": 0.5151975683890577, - "grad_norm": 2.1720468997955322, - "learning_rate": 4.801398502092156e-06, - "loss": 0.45342206954956055, - "mean_token_accuracy": 0.8463799953460693, - "num_tokens": 6185415.0, - "step": 678 - }, - { - "epoch": 0.5159574468085106, - "grad_norm": 2.282259702682495, - "learning_rate": 4.800579618414677e-06, - "loss": 0.4864169955253601, - "mean_token_accuracy": 0.8300632238388062, - "num_tokens": 6191832.0, - "step": 679 - }, - { - "epoch": 0.5167173252279635, - "grad_norm": 2.0092248916625977, - "learning_rate": 4.799759120101861e-06, - "loss": 0.5781463980674744, - "mean_token_accuracy": 0.8267031908035278, - "num_tokens": 6199440.0, - "step": 680 - }, - { - "epoch": 0.5174772036474165, - "grad_norm": 1.396580696105957, - "learning_rate": 4.798937007729568e-06, - "loss": 0.49689239263534546, - "mean_token_accuracy": 0.8257499933242798, - "num_tokens": 6213840.0, - "step": 681 - }, - { - "epoch": 0.5182370820668692, - "grad_norm": 1.9060769081115723, - "learning_rate": 4.798113281874788e-06, - "loss": 0.48969539999961853, - "mean_token_accuracy": 0.8171790838241577, - "num_tokens": 6223006.0, - "step": 682 - }, - { - "epoch": 0.5189969604863222, - "grad_norm": 1.6255282163619995, - "learning_rate": 4.797287943115642e-06, - "loss": 0.5532330870628357, - "mean_token_accuracy": 0.8173393607139587, - "num_tokens": 6234857.0, - "step": 683 - }, - { - "epoch": 0.5197568389057751, - "grad_norm": 1.6923905611038208, - "learning_rate": 4.796460992031386e-06, - "loss": 0.4880887269973755, - "mean_token_accuracy": 0.834983229637146, - "num_tokens": 6245252.0, - "step": 684 - }, - { - "epoch": 0.520516717325228, - "grad_norm": 2.13161301612854, - "learning_rate": 4.7956324292024045e-06, - "loss": 0.5687593817710876, - "mean_token_accuracy": 0.7996571063995361, - "num_tokens": 6253726.0, - "step": 685 - }, - { - "epoch": 0.5212765957446809, - "grad_norm": 2.509375810623169, - "learning_rate": 4.794802255210217e-06, - "loss": 0.5396929979324341, - "mean_token_accuracy": 0.8007107973098755, - "num_tokens": 6259238.0, - "step": 686 - }, - { - "epoch": 0.5220364741641338, - "grad_norm": 2.393710136413574, - "learning_rate": 4.793970470637469e-06, - "loss": 0.6165191531181335, - "mean_token_accuracy": 0.7891418933868408, - "num_tokens": 6266325.0, - "step": 687 - }, - { - "epoch": 0.5227963525835866, - "grad_norm": 1.511647343635559, - "learning_rate": 4.7931370760679415e-06, - "loss": 0.4773876965045929, - "mean_token_accuracy": 0.8381044864654541, - "num_tokens": 6277447.0, - "step": 688 - }, - { - "epoch": 0.5235562310030395, - "grad_norm": 2.206587314605713, - "learning_rate": 4.792302072086542e-06, - "loss": 0.5482058525085449, - "mean_token_accuracy": 0.8239108920097351, - "num_tokens": 6285163.0, - "step": 689 - }, - { - "epoch": 0.5243161094224924, - "grad_norm": 3.018146514892578, - "learning_rate": 4.7914654592793065e-06, - "loss": 0.4880615472793579, - "mean_token_accuracy": 0.8361308574676514, - "num_tokens": 6289386.0, - "step": 690 - }, - { - "epoch": 0.5250759878419453, - "grad_norm": 1.6469231843948364, - "learning_rate": 4.790627238233405e-06, - "loss": 0.4164774715900421, - "mean_token_accuracy": 0.8496290445327759, - "num_tokens": 6298915.0, - "step": 691 - }, - { - "epoch": 0.5258358662613982, - "grad_norm": 2.352505922317505, - "learning_rate": 4.789787409537131e-06, - "loss": 0.5366303324699402, - "mean_token_accuracy": 0.8350417613983154, - "num_tokens": 6306130.0, - "step": 692 - }, - { - "epoch": 0.526595744680851, - "grad_norm": 1.7463021278381348, - "learning_rate": 4.7889459737799105e-06, - "loss": 0.4389137923717499, - "mean_token_accuracy": 0.8463300466537476, - "num_tokens": 6315503.0, - "step": 693 - }, - { - "epoch": 0.5273556231003039, - "grad_norm": 2.257706642150879, - "learning_rate": 4.788102931552294e-06, - "loss": 0.5309344530105591, - "mean_token_accuracy": 0.8164352178573608, - "num_tokens": 6321852.0, - "step": 694 - }, - { - "epoch": 0.5281155015197568, - "grad_norm": 2.392732620239258, - "learning_rate": 4.787258283445962e-06, - "loss": 0.3956204056739807, - "mean_token_accuracy": 0.8671456575393677, - "num_tokens": 6327380.0, - "step": 695 - }, - { - "epoch": 0.5288753799392097, - "grad_norm": 2.210514545440674, - "learning_rate": 4.786412030053721e-06, - "loss": 0.4842875003814697, - "mean_token_accuracy": 0.8508446216583252, - "num_tokens": 6334898.0, - "step": 696 - }, - { - "epoch": 0.5296352583586627, - "grad_norm": 1.8678946495056152, - "learning_rate": 4.785564171969503e-06, - "loss": 0.47399595379829407, - "mean_token_accuracy": 0.8514996767044067, - "num_tokens": 6346374.0, - "step": 697 - }, - { - "epoch": 0.5303951367781155, - "grad_norm": 2.604079484939575, - "learning_rate": 4.784714709788368e-06, - "loss": 0.5950228571891785, - "mean_token_accuracy": 0.7983481884002686, - "num_tokens": 6351648.0, - "step": 698 - }, - { - "epoch": 0.5311550151975684, - "grad_norm": 1.662381649017334, - "learning_rate": 4.783863644106502e-06, - "loss": 0.41616758704185486, - "mean_token_accuracy": 0.8554803133010864, - "num_tokens": 6360506.0, - "step": 699 - }, - { - "epoch": 0.5319148936170213, - "grad_norm": 1.6300342082977295, - "learning_rate": 4.783010975521216e-06, - "loss": 0.43029269576072693, - "mean_token_accuracy": 0.8443028926849365, - "num_tokens": 6370675.0, - "step": 700 - }, - { - "epoch": 0.5326747720364742, - "grad_norm": 1.731873869895935, - "learning_rate": 4.782156704630944e-06, - "loss": 0.4383814334869385, - "mean_token_accuracy": 0.8443183898925781, - "num_tokens": 6381803.0, - "step": 701 - }, - { - "epoch": 0.5334346504559271, - "grad_norm": 3.1788413524627686, - "learning_rate": 4.7813008320352475e-06, - "loss": 0.32194480299949646, - "mean_token_accuracy": 0.8870962858200073, - "num_tokens": 6389263.0, - "step": 702 - }, - { - "epoch": 0.53419452887538, - "grad_norm": 2.099513530731201, - "learning_rate": 4.78044335833481e-06, - "loss": 0.36962923407554626, - "mean_token_accuracy": 0.8661133646965027, - "num_tokens": 6395589.0, - "step": 703 - }, - { - "epoch": 0.5349544072948328, - "grad_norm": 1.4859435558319092, - "learning_rate": 4.77958428413144e-06, - "loss": 0.4619954824447632, - "mean_token_accuracy": 0.8438555002212524, - "num_tokens": 6407470.0, - "step": 704 - }, - { - "epoch": 0.5357142857142857, - "grad_norm": 1.2561073303222656, - "learning_rate": 4.7787236100280685e-06, - "loss": 0.3770977258682251, - "mean_token_accuracy": 0.8515733480453491, - "num_tokens": 6422888.0, - "step": 705 - }, - { - "epoch": 0.5364741641337386, - "grad_norm": 1.4455817937850952, - "learning_rate": 4.777861336628751e-06, - "loss": 0.46481069922447205, - "mean_token_accuracy": 0.8502002954483032, - "num_tokens": 6441266.0, - "step": 706 - }, - { - "epoch": 0.5372340425531915, - "grad_norm": 1.1387295722961426, - "learning_rate": 4.7769974645386616e-06, - "loss": 0.36964765191078186, - "mean_token_accuracy": 0.8719524145126343, - "num_tokens": 6463686.0, - "step": 707 - }, - { - "epoch": 0.5379939209726444, - "grad_norm": 1.7179663181304932, - "learning_rate": 4.776131994364102e-06, - "loss": 0.4231719970703125, - "mean_token_accuracy": 0.8416585922241211, - "num_tokens": 6472956.0, - "step": 708 - }, - { - "epoch": 0.5387537993920972, - "grad_norm": 1.6328502893447876, - "learning_rate": 4.775264926712489e-06, - "loss": 0.5836569666862488, - "mean_token_accuracy": 0.8039724230766296, - "num_tokens": 6485773.0, - "step": 709 - }, - { - "epoch": 0.5395136778115501, - "grad_norm": 1.8515360355377197, - "learning_rate": 4.774396262192368e-06, - "loss": 0.5477553009986877, - "mean_token_accuracy": 0.8136521577835083, - "num_tokens": 6496379.0, - "step": 710 - }, - { - "epoch": 0.540273556231003, - "grad_norm": 1.741858959197998, - "learning_rate": 4.7735260014133986e-06, - "loss": 0.4663267731666565, - "mean_token_accuracy": 0.8473691940307617, - "num_tokens": 6507652.0, - "step": 711 - }, - { - "epoch": 0.541033434650456, - "grad_norm": 1.7516659498214722, - "learning_rate": 4.772654144986364e-06, - "loss": 0.374914288520813, - "mean_token_accuracy": 0.8600220680236816, - "num_tokens": 6519030.0, - "step": 712 - }, - { - "epoch": 0.5417933130699089, - "grad_norm": 2.662343978881836, - "learning_rate": 4.7717806935231665e-06, - "loss": 0.4206875264644623, - "mean_token_accuracy": 0.8544126749038696, - "num_tokens": 6523669.0, - "step": 713 - }, - { - "epoch": 0.5425531914893617, - "grad_norm": 1.4088834524154663, - "learning_rate": 4.770905647636828e-06, - "loss": 0.5824331045150757, - "mean_token_accuracy": 0.7857901453971863, - "num_tokens": 6540560.0, - "step": 714 - }, - { - "epoch": 0.5433130699088146, - "grad_norm": 2.173656940460205, - "learning_rate": 4.77002900794149e-06, - "loss": 0.555023729801178, - "mean_token_accuracy": 0.8067290782928467, - "num_tokens": 6548946.0, - "step": 715 - }, - { - "epoch": 0.5440729483282675, - "grad_norm": 2.121018648147583, - "learning_rate": 4.769150775052411e-06, - "loss": 0.559730052947998, - "mean_token_accuracy": 0.8166372776031494, - "num_tokens": 6556065.0, - "step": 716 - }, - { - "epoch": 0.5448328267477204, - "grad_norm": 3.335866928100586, - "learning_rate": 4.768270949585968e-06, - "loss": 0.6442267894744873, - "mean_token_accuracy": 0.7858607769012451, - "num_tokens": 6560615.0, - "step": 717 - }, - { - "epoch": 0.5455927051671733, - "grad_norm": 2.3813695907592773, - "learning_rate": 4.767389532159659e-06, - "loss": 0.4027421474456787, - "mean_token_accuracy": 0.8635619282722473, - "num_tokens": 6565841.0, - "step": 718 - }, - { - "epoch": 0.5463525835866262, - "grad_norm": 2.0657708644866943, - "learning_rate": 4.766506523392095e-06, - "loss": 0.38899827003479004, - "mean_token_accuracy": 0.8660480380058289, - "num_tokens": 6572362.0, - "step": 719 - }, - { - "epoch": 0.547112462006079, - "grad_norm": 1.093705415725708, - "learning_rate": 4.765621923903005e-06, - "loss": 0.45967352390289307, - "mean_token_accuracy": 0.8338102102279663, - "num_tokens": 6595998.0, - "step": 720 - }, - { - "epoch": 0.5478723404255319, - "grad_norm": 2.942065954208374, - "learning_rate": 4.764735734313236e-06, - "loss": 0.42910510301589966, - "mean_token_accuracy": 0.8406122922897339, - "num_tokens": 6601075.0, - "step": 721 - }, - { - "epoch": 0.5486322188449848, - "grad_norm": 2.049011707305908, - "learning_rate": 4.763847955244749e-06, - "loss": 0.5584231615066528, - "mean_token_accuracy": 0.8171684741973877, - "num_tokens": 6609310.0, - "step": 722 - }, - { - "epoch": 0.5493920972644377, - "grad_norm": 2.485543966293335, - "learning_rate": 4.762958587320623e-06, - "loss": 0.5396170020103455, - "mean_token_accuracy": 0.8158525824546814, - "num_tokens": 6616185.0, - "step": 723 - }, - { - "epoch": 0.5501519756838906, - "grad_norm": 1.87015962600708, - "learning_rate": 4.762067631165049e-06, - "loss": 0.49739527702331543, - "mean_token_accuracy": 0.8303765654563904, - "num_tokens": 6625629.0, - "step": 724 - }, - { - "epoch": 0.5509118541033434, - "grad_norm": 4.239654541015625, - "learning_rate": 4.761175087403336e-06, - "loss": 0.6029239296913147, - "mean_token_accuracy": 0.8123486042022705, - "num_tokens": 6629194.0, - "step": 725 - }, - { - "epoch": 0.5516717325227963, - "grad_norm": 2.0134730339050293, - "learning_rate": 4.760280956661904e-06, - "loss": 0.4777873754501343, - "mean_token_accuracy": 0.8283513784408569, - "num_tokens": 6636929.0, - "step": 726 - }, - { - "epoch": 0.5524316109422492, - "grad_norm": 1.991780400276184, - "learning_rate": 4.75938523956829e-06, - "loss": 0.4631248116493225, - "mean_token_accuracy": 0.8275107741355896, - "num_tokens": 6645135.0, - "step": 727 - }, - { - "epoch": 0.5531914893617021, - "grad_norm": 1.423792839050293, - "learning_rate": 4.75848793675114e-06, - "loss": 0.49630722403526306, - "mean_token_accuracy": 0.8388000130653381, - "num_tokens": 6662690.0, - "step": 728 - }, - { - "epoch": 0.5539513677811551, - "grad_norm": 2.345294952392578, - "learning_rate": 4.757589048840219e-06, - "loss": 0.37830638885498047, - "mean_token_accuracy": 0.8782080411911011, - "num_tokens": 6667285.0, - "step": 729 - }, - { - "epoch": 0.5547112462006079, - "grad_norm": 2.7452144622802734, - "learning_rate": 4.756688576466398e-06, - "loss": 0.51595538854599, - "mean_token_accuracy": 0.8441770672798157, - "num_tokens": 6672324.0, - "step": 730 - }, - { - "epoch": 0.5554711246200608, - "grad_norm": 1.5247859954833984, - "learning_rate": 4.755786520261666e-06, - "loss": 0.48365193605422974, - "mean_token_accuracy": 0.8276445269584656, - "num_tokens": 6685296.0, - "step": 731 - }, - { - "epoch": 0.5562310030395137, - "grad_norm": 1.4018276929855347, - "learning_rate": 4.75488288085912e-06, - "loss": 0.3876481354236603, - "mean_token_accuracy": 0.8612343072891235, - "num_tokens": 6697515.0, - "step": 732 - }, - { - "epoch": 0.5569908814589666, - "grad_norm": 2.9570324420928955, - "learning_rate": 4.753977658892967e-06, - "loss": 0.5468149185180664, - "mean_token_accuracy": 0.8054271340370178, - "num_tokens": 6702194.0, - "step": 733 - }, - { - "epoch": 0.5577507598784195, - "grad_norm": 1.9282715320587158, - "learning_rate": 4.753070854998529e-06, - "loss": 0.4758574962615967, - "mean_token_accuracy": 0.8379775285720825, - "num_tokens": 6709938.0, - "step": 734 - }, - { - "epoch": 0.5585106382978723, - "grad_norm": 1.981264591217041, - "learning_rate": 4.752162469812234e-06, - "loss": 0.48461222648620605, - "mean_token_accuracy": 0.833509087562561, - "num_tokens": 6718125.0, - "step": 735 - }, - { - "epoch": 0.5592705167173252, - "grad_norm": 1.1643427610397339, - "learning_rate": 4.751252503971624e-06, - "loss": 0.410121887922287, - "mean_token_accuracy": 0.8221402764320374, - "num_tokens": 6735125.0, - "step": 736 - }, - { - "epoch": 0.5600303951367781, - "grad_norm": 1.786566972732544, - "learning_rate": 4.750340958115346e-06, - "loss": 0.5964341163635254, - "mean_token_accuracy": 0.8038164377212524, - "num_tokens": 6747369.0, - "step": 737 - }, - { - "epoch": 0.560790273556231, - "grad_norm": 1.7256991863250732, - "learning_rate": 4.749427832883158e-06, - "loss": 0.48737066984176636, - "mean_token_accuracy": 0.830894947052002, - "num_tokens": 6758115.0, - "step": 738 - }, - { - "epoch": 0.5615501519756839, - "grad_norm": 1.997747540473938, - "learning_rate": 4.748513128915928e-06, - "loss": 0.5238886475563049, - "mean_token_accuracy": 0.8066858053207397, - "num_tokens": 6766111.0, - "step": 739 - }, - { - "epoch": 0.5623100303951368, - "grad_norm": 2.127016305923462, - "learning_rate": 4.747596846855629e-06, - "loss": 0.5045586228370667, - "mean_token_accuracy": 0.821424126625061, - "num_tokens": 6772893.0, - "step": 740 - }, - { - "epoch": 0.5630699088145896, - "grad_norm": 1.7664796113967896, - "learning_rate": 4.7466789873453446e-06, - "loss": 0.42954835295677185, - "mean_token_accuracy": 0.8533384799957275, - "num_tokens": 6785133.0, - "step": 741 - }, - { - "epoch": 0.5638297872340425, - "grad_norm": 1.4987404346466064, - "learning_rate": 4.7457595510292615e-06, - "loss": 0.5378558039665222, - "mean_token_accuracy": 0.8184819221496582, - "num_tokens": 6799563.0, - "step": 742 - }, - { - "epoch": 0.5645896656534954, - "grad_norm": 1.4444655179977417, - "learning_rate": 4.744838538552678e-06, - "loss": 0.42193782329559326, - "mean_token_accuracy": 0.837514340877533, - "num_tokens": 6812470.0, - "step": 743 - }, - { - "epoch": 0.5653495440729484, - "grad_norm": 3.867751121520996, - "learning_rate": 4.7439159505619946e-06, - "loss": 0.4457814693450928, - "mean_token_accuracy": 0.8630104660987854, - "num_tokens": 6815652.0, - "step": 744 - }, - { - "epoch": 0.5661094224924013, - "grad_norm": 2.1250710487365723, - "learning_rate": 4.74299178770472e-06, - "loss": 0.5638922452926636, - "mean_token_accuracy": 0.7969781160354614, - "num_tokens": 6824566.0, - "step": 745 - }, - { - "epoch": 0.5668693009118541, - "grad_norm": 2.547072410583496, - "learning_rate": 4.742066050629465e-06, - "loss": 0.5516207814216614, - "mean_token_accuracy": 0.8160669803619385, - "num_tokens": 6830589.0, - "step": 746 - }, - { - "epoch": 0.567629179331307, - "grad_norm": 1.2975233793258667, - "learning_rate": 4.741138739985951e-06, - "loss": 0.3823344111442566, - "mean_token_accuracy": 0.8668368458747864, - "num_tokens": 6842707.0, - "step": 747 - }, - { - "epoch": 0.5683890577507599, - "grad_norm": 1.3410450220108032, - "learning_rate": 4.740209856424998e-06, - "loss": 0.5148671269416809, - "mean_token_accuracy": 0.8188045024871826, - "num_tokens": 6857624.0, - "step": 748 - }, - { - "epoch": 0.5691489361702128, - "grad_norm": 1.219467282295227, - "learning_rate": 4.7392794005985324e-06, - "loss": 0.3998957872390747, - "mean_token_accuracy": 0.855175256729126, - "num_tokens": 6875064.0, - "step": 749 - }, - { - "epoch": 0.5699088145896657, - "grad_norm": 1.3530343770980835, - "learning_rate": 4.738347373159585e-06, - "loss": 0.5359633564949036, - "mean_token_accuracy": 0.8178457021713257, - "num_tokens": 6890911.0, - "step": 750 - }, - { - "epoch": 0.5706686930091185, - "grad_norm": 2.146988868713379, - "learning_rate": 4.737413774762287e-06, - "loss": 0.4460008144378662, - "mean_token_accuracy": 0.8172903060913086, - "num_tokens": 6896959.0, - "step": 751 - }, - { - "epoch": 0.5714285714285714, - "grad_norm": 1.456023097038269, - "learning_rate": 4.736478606061876e-06, - "loss": 0.43616920709609985, - "mean_token_accuracy": 0.8465108871459961, - "num_tokens": 6908904.0, - "step": 752 - }, - { - "epoch": 0.5721884498480243, - "grad_norm": 2.9696967601776123, - "learning_rate": 4.735541867714687e-06, - "loss": 0.43464532494544983, - "mean_token_accuracy": 0.8608652353286743, - "num_tokens": 6913026.0, - "step": 753 - }, - { - "epoch": 0.5729483282674772, - "grad_norm": 2.2990667819976807, - "learning_rate": 4.73460356037816e-06, - "loss": 0.6619116067886353, - "mean_token_accuracy": 0.7821142673492432, - "num_tokens": 6920588.0, - "step": 754 - }, - { - "epoch": 0.5737082066869301, - "grad_norm": 2.054746389389038, - "learning_rate": 4.733663684710835e-06, - "loss": 0.5304250717163086, - "mean_token_accuracy": 0.8265531063079834, - "num_tokens": 6928910.0, - "step": 755 - }, - { - "epoch": 0.574468085106383, - "grad_norm": 2.0050594806671143, - "learning_rate": 4.732722241372354e-06, - "loss": 0.6393026113510132, - "mean_token_accuracy": 0.796819806098938, - "num_tokens": 6940217.0, - "step": 756 - }, - { - "epoch": 0.5752279635258358, - "grad_norm": 1.4285320043563843, - "learning_rate": 4.731779231023456e-06, - "loss": 0.5432837009429932, - "mean_token_accuracy": 0.8104778528213501, - "num_tokens": 6959101.0, - "step": 757 - }, - { - "epoch": 0.5759878419452887, - "grad_norm": 2.3941943645477295, - "learning_rate": 4.730834654325984e-06, - "loss": 0.46550673246383667, - "mean_token_accuracy": 0.8444503545761108, - "num_tokens": 6965036.0, - "step": 758 - }, - { - "epoch": 0.5767477203647416, - "grad_norm": 2.3850574493408203, - "learning_rate": 4.729888511942877e-06, - "loss": 0.4916389584541321, - "mean_token_accuracy": 0.8228527307510376, - "num_tokens": 6971184.0, - "step": 759 - }, - { - "epoch": 0.5775075987841946, - "grad_norm": 1.627480149269104, - "learning_rate": 4.728940804538176e-06, - "loss": 0.5863215923309326, - "mean_token_accuracy": 0.7995302677154541, - "num_tokens": 6982569.0, - "step": 760 - }, - { - "epoch": 0.5782674772036475, - "grad_norm": 1.1723195314407349, - "learning_rate": 4.727991532777016e-06, - "loss": 0.36908864974975586, - "mean_token_accuracy": 0.8355655670166016, - "num_tokens": 6998659.0, - "step": 761 - }, - { - "epoch": 0.5790273556231003, - "grad_norm": 1.5324925184249878, - "learning_rate": 4.727040697325634e-06, - "loss": 0.557658851146698, - "mean_token_accuracy": 0.8141458034515381, - "num_tokens": 7012969.0, - "step": 762 - }, - { - "epoch": 0.5797872340425532, - "grad_norm": 2.4106390476226807, - "learning_rate": 4.726088298851362e-06, - "loss": 0.5004243850708008, - "mean_token_accuracy": 0.8376860618591309, - "num_tokens": 7018301.0, - "step": 763 - }, - { - "epoch": 0.5805471124620061, - "grad_norm": 2.2594921588897705, - "learning_rate": 4.725134338022631e-06, - "loss": 0.6067016124725342, - "mean_token_accuracy": 0.8100241422653198, - "num_tokens": 7025201.0, - "step": 764 - }, - { - "epoch": 0.581306990881459, - "grad_norm": 1.4649826288223267, - "learning_rate": 4.724178815508967e-06, - "loss": 0.36200693249702454, - "mean_token_accuracy": 0.8621826171875, - "num_tokens": 7035112.0, - "step": 765 - }, - { - "epoch": 0.5820668693009119, - "grad_norm": 2.3634560108184814, - "learning_rate": 4.723221731980993e-06, - "loss": 0.41862213611602783, - "mean_token_accuracy": 0.8541463613510132, - "num_tokens": 7040339.0, - "step": 766 - }, - { - "epoch": 0.5828267477203647, - "grad_norm": 2.7798104286193848, - "learning_rate": 4.722263088110426e-06, - "loss": 0.4647108018398285, - "mean_token_accuracy": 0.8505672216415405, - "num_tokens": 7044880.0, - "step": 767 - }, - { - "epoch": 0.5835866261398176, - "grad_norm": 2.070528507232666, - "learning_rate": 4.721302884570079e-06, - "loss": 0.5147565007209778, - "mean_token_accuracy": 0.8113877773284912, - "num_tokens": 7052433.0, - "step": 768 - }, - { - "epoch": 0.5843465045592705, - "grad_norm": 2.1953284740448, - "learning_rate": 4.720341122033862e-06, - "loss": 0.5075466632843018, - "mean_token_accuracy": 0.8474211096763611, - "num_tokens": 7058686.0, - "step": 769 - }, - { - "epoch": 0.5851063829787234, - "grad_norm": 1.9287755489349365, - "learning_rate": 4.719377801176774e-06, - "loss": 0.5382202863693237, - "mean_token_accuracy": 0.8148090243339539, - "num_tokens": 7067538.0, - "step": 770 - }, - { - "epoch": 0.5858662613981763, - "grad_norm": 1.5574456453323364, - "learning_rate": 4.718412922674913e-06, - "loss": 0.43406790494918823, - "mean_token_accuracy": 0.8477081060409546, - "num_tokens": 7077853.0, - "step": 771 - }, - { - "epoch": 0.5866261398176292, - "grad_norm": 1.5490336418151855, - "learning_rate": 4.717446487205466e-06, - "loss": 0.43164271116256714, - "mean_token_accuracy": 0.8504570126533508, - "num_tokens": 7091728.0, - "step": 772 - }, - { - "epoch": 0.587386018237082, - "grad_norm": 1.6945984363555908, - "learning_rate": 4.716478495446717e-06, - "loss": 0.5153743624687195, - "mean_token_accuracy": 0.8213579058647156, - "num_tokens": 7108680.0, - "step": 773 - }, - { - "epoch": 0.5881458966565349, - "grad_norm": 2.2633883953094482, - "learning_rate": 4.715508948078037e-06, - "loss": 0.45254790782928467, - "mean_token_accuracy": 0.8392219543457031, - "num_tokens": 7115546.0, - "step": 774 - }, - { - "epoch": 0.5889057750759878, - "grad_norm": 1.5731090307235718, - "learning_rate": 4.714537845779894e-06, - "loss": 0.38678881525993347, - "mean_token_accuracy": 0.8800252676010132, - "num_tokens": 7126360.0, - "step": 775 - }, - { - "epoch": 0.5896656534954408, - "grad_norm": 2.4873392581939697, - "learning_rate": 4.7135651892338445e-06, - "loss": 0.5190927386283875, - "mean_token_accuracy": 0.8145407438278198, - "num_tokens": 7135705.0, - "step": 776 - }, - { - "epoch": 0.5904255319148937, - "grad_norm": 1.2931004762649536, - "learning_rate": 4.712590979122534e-06, - "loss": 0.3686544895172119, - "mean_token_accuracy": 0.8720537424087524, - "num_tokens": 7150688.0, - "step": 777 - }, - { - "epoch": 0.5911854103343465, - "grad_norm": 1.6353671550750732, - "learning_rate": 4.7116152161297045e-06, - "loss": 0.49065062403678894, - "mean_token_accuracy": 0.8203760385513306, - "num_tokens": 7161040.0, - "step": 778 - }, - { - "epoch": 0.5919452887537994, - "grad_norm": 1.2345483303070068, - "learning_rate": 4.710637900940181e-06, - "loss": 0.4004976451396942, - "mean_token_accuracy": 0.8302007913589478, - "num_tokens": 7178074.0, - "step": 779 - }, - { - "epoch": 0.5927051671732523, - "grad_norm": 2.2506837844848633, - "learning_rate": 4.7096590342398825e-06, - "loss": 0.45142874121665955, - "mean_token_accuracy": 0.8481036424636841, - "num_tokens": 7184153.0, - "step": 780 - }, - { - "epoch": 0.5934650455927052, - "grad_norm": 1.420479416847229, - "learning_rate": 4.708678616715815e-06, - "loss": 0.4802100360393524, - "mean_token_accuracy": 0.8586992025375366, - "num_tokens": 7202810.0, - "step": 781 - }, - { - "epoch": 0.5942249240121581, - "grad_norm": 3.457632303237915, - "learning_rate": 4.707696649056073e-06, - "loss": 0.5265094041824341, - "mean_token_accuracy": 0.8260114192962646, - "num_tokens": 7206396.0, - "step": 782 - }, - { - "epoch": 0.5949848024316109, - "grad_norm": 1.1592093706130981, - "learning_rate": 4.706713131949839e-06, - "loss": 0.3708173632621765, - "mean_token_accuracy": 0.8476542234420776, - "num_tokens": 7225034.0, - "step": 783 - }, - { - "epoch": 0.5957446808510638, - "grad_norm": 1.6761400699615479, - "learning_rate": 4.705728066087384e-06, - "loss": 0.4137252867221832, - "mean_token_accuracy": 0.8462049961090088, - "num_tokens": 7237101.0, - "step": 784 - }, - { - "epoch": 0.5965045592705167, - "grad_norm": 2.320185422897339, - "learning_rate": 4.704741452160064e-06, - "loss": 0.5157154202461243, - "mean_token_accuracy": 0.8391785621643066, - "num_tokens": 7243826.0, - "step": 785 - }, - { - "epoch": 0.5972644376899696, - "grad_norm": 2.079423427581787, - "learning_rate": 4.703753290860323e-06, - "loss": 0.4734993278980255, - "mean_token_accuracy": 0.8353281021118164, - "num_tokens": 7250175.0, - "step": 786 - }, - { - "epoch": 0.5980243161094225, - "grad_norm": 1.8215159177780151, - "learning_rate": 4.702763582881692e-06, - "loss": 0.520193338394165, - "mean_token_accuracy": 0.844062864780426, - "num_tokens": 7258868.0, - "step": 787 - }, - { - "epoch": 0.5987841945288754, - "grad_norm": 1.3823071718215942, - "learning_rate": 4.701772328918784e-06, - "loss": 0.4177844822406769, - "mean_token_accuracy": 0.8363165259361267, - "num_tokens": 7271744.0, - "step": 788 - }, - { - "epoch": 0.5995440729483282, - "grad_norm": 2.4749298095703125, - "learning_rate": 4.700779529667301e-06, - "loss": 0.5115069150924683, - "mean_token_accuracy": 0.8473520278930664, - "num_tokens": 7277040.0, - "step": 789 - }, - { - "epoch": 0.6003039513677811, - "grad_norm": 1.7072296142578125, - "learning_rate": 4.699785185824026e-06, - "loss": 0.5265800952911377, - "mean_token_accuracy": 0.8161447048187256, - "num_tokens": 7288288.0, - "step": 790 - }, - { - "epoch": 0.601063829787234, - "grad_norm": 1.6479384899139404, - "learning_rate": 4.69878929808683e-06, - "loss": 0.4445168972015381, - "mean_token_accuracy": 0.8381255865097046, - "num_tokens": 7298640.0, - "step": 791 - }, - { - "epoch": 0.601823708206687, - "grad_norm": 1.9095896482467651, - "learning_rate": 4.6977918671546635e-06, - "loss": 0.5841238498687744, - "mean_token_accuracy": 0.7971454858779907, - "num_tokens": 7307220.0, - "step": 792 - }, - { - "epoch": 0.6025835866261399, - "grad_norm": 1.9614146947860718, - "learning_rate": 4.696792893727562e-06, - "loss": 0.34684082865715027, - "mean_token_accuracy": 0.8739526271820068, - "num_tokens": 7313875.0, - "step": 793 - }, - { - "epoch": 0.6033434650455927, - "grad_norm": 2.015570640563965, - "learning_rate": 4.695792378506645e-06, - "loss": 0.42779117822647095, - "mean_token_accuracy": 0.8625012636184692, - "num_tokens": 7321439.0, - "step": 794 - }, - { - "epoch": 0.6041033434650456, - "grad_norm": 2.8581228256225586, - "learning_rate": 4.694790322194111e-06, - "loss": 0.6519991159439087, - "mean_token_accuracy": 0.7629562616348267, - "num_tokens": 7326916.0, - "step": 795 - }, - { - "epoch": 0.6048632218844985, - "grad_norm": 2.482715368270874, - "learning_rate": 4.693786725493242e-06, - "loss": 0.532963216304779, - "mean_token_accuracy": 0.832184910774231, - "num_tokens": 7333311.0, - "step": 796 - }, - { - "epoch": 0.6056231003039514, - "grad_norm": 1.6076741218566895, - "learning_rate": 4.692781589108402e-06, - "loss": 0.43381205201148987, - "mean_token_accuracy": 0.8402494192123413, - "num_tokens": 7343731.0, - "step": 797 - }, - { - "epoch": 0.6063829787234043, - "grad_norm": 2.2133216857910156, - "learning_rate": 4.691774913745033e-06, - "loss": 0.4380851089954376, - "mean_token_accuracy": 0.8600908517837524, - "num_tokens": 7350224.0, - "step": 798 - }, - { - "epoch": 0.6071428571428571, - "grad_norm": 2.046280860900879, - "learning_rate": 4.690766700109659e-06, - "loss": 0.3821919560432434, - "mean_token_accuracy": 0.8691814541816711, - "num_tokens": 7356717.0, - "step": 799 - }, - { - "epoch": 0.60790273556231, - "grad_norm": 1.8482693433761597, - "learning_rate": 4.689756948909884e-06, - "loss": 0.5217651128768921, - "mean_token_accuracy": 0.803473711013794, - "num_tokens": 7365806.0, - "step": 800 - }, - { - "epoch": 0.6086626139817629, - "grad_norm": 2.192134141921997, - "learning_rate": 4.688745660854388e-06, - "loss": 0.573980987071991, - "mean_token_accuracy": 0.8198676109313965, - "num_tokens": 7380281.0, - "step": 801 - }, - { - "epoch": 0.6094224924012158, - "grad_norm": 2.363626718521118, - "learning_rate": 4.687732836652935e-06, - "loss": 0.5204599499702454, - "mean_token_accuracy": 0.8373252153396606, - "num_tokens": 7386938.0, - "step": 802 - }, - { - "epoch": 0.6101823708206687, - "grad_norm": 1.9320523738861084, - "learning_rate": 4.686718477016361e-06, - "loss": 0.47316622734069824, - "mean_token_accuracy": 0.830596923828125, - "num_tokens": 7395069.0, - "step": 803 - }, - { - "epoch": 0.6109422492401215, - "grad_norm": 2.6573057174682617, - "learning_rate": 4.6857025826565845e-06, - "loss": 0.5495861768722534, - "mean_token_accuracy": 0.8187421560287476, - "num_tokens": 7400563.0, - "step": 804 - }, - { - "epoch": 0.6117021276595744, - "grad_norm": 2.0893123149871826, - "learning_rate": 4.684685154286599e-06, - "loss": 0.5362675786018372, - "mean_token_accuracy": 0.8394701480865479, - "num_tokens": 7406973.0, - "step": 805 - }, - { - "epoch": 0.6124620060790273, - "grad_norm": 2.455130100250244, - "learning_rate": 4.683666192620474e-06, - "loss": 0.5405995845794678, - "mean_token_accuracy": 0.8079100847244263, - "num_tokens": 7412931.0, - "step": 806 - }, - { - "epoch": 0.6132218844984803, - "grad_norm": 2.311915636062622, - "learning_rate": 4.682645698373357e-06, - "loss": 0.5395106077194214, - "mean_token_accuracy": 0.8156260251998901, - "num_tokens": 7419699.0, - "step": 807 - }, - { - "epoch": 0.6139817629179332, - "grad_norm": 1.686838984489441, - "learning_rate": 4.6816236722614694e-06, - "loss": 0.6034521460533142, - "mean_token_accuracy": 0.7855954170227051, - "num_tokens": 7431899.0, - "step": 808 - }, - { - "epoch": 0.6147416413373861, - "grad_norm": 1.682759165763855, - "learning_rate": 4.680600115002109e-06, - "loss": 0.48593831062316895, - "mean_token_accuracy": 0.8229435682296753, - "num_tokens": 7443187.0, - "step": 809 - }, - { - "epoch": 0.6155015197568389, - "grad_norm": 2.064589738845825, - "learning_rate": 4.679575027313649e-06, - "loss": 0.5098468661308289, - "mean_token_accuracy": 0.8234638571739197, - "num_tokens": 7450868.0, - "step": 810 - }, - { - "epoch": 0.6162613981762918, - "grad_norm": 2.2063486576080322, - "learning_rate": 4.6785484099155324e-06, - "loss": 0.5138497352600098, - "mean_token_accuracy": 0.8152111172676086, - "num_tokens": 7457176.0, - "step": 811 - }, - { - "epoch": 0.6170212765957447, - "grad_norm": 1.6258726119995117, - "learning_rate": 4.67752026352828e-06, - "loss": 0.4064181447029114, - "mean_token_accuracy": 0.8720619678497314, - "num_tokens": 7466557.0, - "step": 812 - }, - { - "epoch": 0.6177811550151976, - "grad_norm": 2.3309383392333984, - "learning_rate": 4.676490588873486e-06, - "loss": 0.5180112719535828, - "mean_token_accuracy": 0.8233879804611206, - "num_tokens": 7472650.0, - "step": 813 - }, - { - "epoch": 0.6185410334346505, - "grad_norm": 1.4545246362686157, - "learning_rate": 4.675459386673815e-06, - "loss": 0.37917959690093994, - "mean_token_accuracy": 0.8598103523254395, - "num_tokens": 7485171.0, - "step": 814 - }, - { - "epoch": 0.6193009118541033, - "grad_norm": 2.654231071472168, - "learning_rate": 4.674426657653003e-06, - "loss": 0.554074227809906, - "mean_token_accuracy": 0.8026446104049683, - "num_tokens": 7490787.0, - "step": 815 - }, - { - "epoch": 0.6200607902735562, - "grad_norm": 1.5543994903564453, - "learning_rate": 4.67339240253586e-06, - "loss": 0.6335440278053284, - "mean_token_accuracy": 0.783241868019104, - "num_tokens": 7505975.0, - "step": 816 - }, - { - "epoch": 0.6208206686930091, - "grad_norm": 2.079998016357422, - "learning_rate": 4.672356622048266e-06, - "loss": 0.5169394016265869, - "mean_token_accuracy": 0.8088761568069458, - "num_tokens": 7513470.0, - "step": 817 - }, - { - "epoch": 0.621580547112462, - "grad_norm": 1.5971896648406982, - "learning_rate": 4.671319316917172e-06, - "loss": 0.44588586688041687, - "mean_token_accuracy": 0.8518649339675903, - "num_tokens": 7524352.0, - "step": 818 - }, - { - "epoch": 0.6223404255319149, - "grad_norm": 2.477579116821289, - "learning_rate": 4.670280487870599e-06, - "loss": 0.5713893175125122, - "mean_token_accuracy": 0.8116940259933472, - "num_tokens": 7530359.0, - "step": 819 - }, - { - "epoch": 0.6231003039513677, - "grad_norm": 2.066211700439453, - "learning_rate": 4.669240135637635e-06, - "loss": 0.5295331478118896, - "mean_token_accuracy": 0.819536566734314, - "num_tokens": 7536963.0, - "step": 820 - }, - { - "epoch": 0.6238601823708206, - "grad_norm": 2.1217997074127197, - "learning_rate": 4.668198260948442e-06, - "loss": 0.6146406531333923, - "mean_token_accuracy": 0.7932635545730591, - "num_tokens": 7545800.0, - "step": 821 - }, - { - "epoch": 0.6246200607902735, - "grad_norm": 2.0173542499542236, - "learning_rate": 4.667154864534245e-06, - "loss": 0.6240535974502563, - "mean_token_accuracy": 0.7883644104003906, - "num_tokens": 7556165.0, - "step": 822 - }, - { - "epoch": 0.6253799392097265, - "grad_norm": 2.014526128768921, - "learning_rate": 4.666109947127343e-06, - "loss": 0.40367332100868225, - "mean_token_accuracy": 0.8653522729873657, - "num_tokens": 7562665.0, - "step": 823 - }, - { - "epoch": 0.6261398176291794, - "grad_norm": 2.5078861713409424, - "learning_rate": 4.665063509461098e-06, - "loss": 0.5903617739677429, - "mean_token_accuracy": 0.7902897596359253, - "num_tokens": 7568922.0, - "step": 824 - }, - { - "epoch": 0.6268996960486323, - "grad_norm": 2.454622745513916, - "learning_rate": 4.664015552269938e-06, - "loss": 0.5238361358642578, - "mean_token_accuracy": 0.838546872138977, - "num_tokens": 7575965.0, - "step": 825 - }, - { - "epoch": 0.6276595744680851, - "grad_norm": 2.920919418334961, - "learning_rate": 4.662966076289363e-06, - "loss": 0.5028782486915588, - "mean_token_accuracy": 0.8311152458190918, - "num_tokens": 7580193.0, - "step": 826 - }, - { - "epoch": 0.628419452887538, - "grad_norm": 1.545382022857666, - "learning_rate": 4.661915082255932e-06, - "loss": 0.4817378520965576, - "mean_token_accuracy": 0.8373227119445801, - "num_tokens": 7593024.0, - "step": 827 - }, - { - "epoch": 0.6291793313069909, - "grad_norm": 1.5152469873428345, - "learning_rate": 4.6608625709072766e-06, - "loss": 0.4693033695220947, - "mean_token_accuracy": 0.8150848150253296, - "num_tokens": 7606459.0, - "step": 828 - }, - { - "epoch": 0.6299392097264438, - "grad_norm": 2.1310224533081055, - "learning_rate": 4.659808542982089e-06, - "loss": 0.4653395414352417, - "mean_token_accuracy": 0.8286294341087341, - "num_tokens": 7613036.0, - "step": 829 - }, - { - "epoch": 0.6306990881458967, - "grad_norm": 2.1949679851531982, - "learning_rate": 4.658752999220125e-06, - "loss": 0.3698633909225464, - "mean_token_accuracy": 0.871590793132782, - "num_tokens": 7618527.0, - "step": 830 - }, - { - "epoch": 0.6314589665653495, - "grad_norm": 2.2770416736602783, - "learning_rate": 4.657695940362207e-06, - "loss": 0.5202419757843018, - "mean_token_accuracy": 0.817577600479126, - "num_tokens": 7624459.0, - "step": 831 - }, - { - "epoch": 0.6322188449848024, - "grad_norm": 1.402042269706726, - "learning_rate": 4.65663736715022e-06, - "loss": 0.51531583070755, - "mean_token_accuracy": 0.8228116631507874, - "num_tokens": 7639371.0, - "step": 832 - }, - { - "epoch": 0.6329787234042553, - "grad_norm": 3.3554883003234863, - "learning_rate": 4.65557728032711e-06, - "loss": 0.6771188378334045, - "mean_token_accuracy": 0.7880028486251831, - "num_tokens": 7643924.0, - "step": 833 - }, - { - "epoch": 0.6337386018237082, - "grad_norm": 2.081040143966675, - "learning_rate": 4.654515680636888e-06, - "loss": 0.5712796449661255, - "mean_token_accuracy": 0.8177868127822876, - "num_tokens": 7651881.0, - "step": 834 - }, - { - "epoch": 0.6344984802431611, - "grad_norm": 0.9128716588020325, - "learning_rate": 4.653452568824625e-06, - "loss": 0.3423936069011688, - "mean_token_accuracy": 0.8782886266708374, - "num_tokens": 7677829.0, - "step": 835 - }, - { - "epoch": 0.6352583586626139, - "grad_norm": 3.49015736579895, - "learning_rate": 4.652387945636454e-06, - "loss": 0.34657734632492065, - "mean_token_accuracy": 0.8770567178726196, - "num_tokens": 7680796.0, - "step": 836 - }, - { - "epoch": 0.6360182370820668, - "grad_norm": 2.026247501373291, - "learning_rate": 4.651321811819568e-06, - "loss": 0.5098431706428528, - "mean_token_accuracy": 0.8216961622238159, - "num_tokens": 7688746.0, - "step": 837 - }, - { - "epoch": 0.6367781155015197, - "grad_norm": 2.444343090057373, - "learning_rate": 4.650254168122222e-06, - "loss": 0.5490090250968933, - "mean_token_accuracy": 0.8092857599258423, - "num_tokens": 7695220.0, - "step": 838 - }, - { - "epoch": 0.6375379939209727, - "grad_norm": 2.0171122550964355, - "learning_rate": 4.649185015293728e-06, - "loss": 0.47221142053604126, - "mean_token_accuracy": 0.8514408469200134, - "num_tokens": 7702759.0, - "step": 839 - }, - { - "epoch": 0.6382978723404256, - "grad_norm": 1.9800984859466553, - "learning_rate": 4.64811435408446e-06, - "loss": 0.5238803625106812, - "mean_token_accuracy": 0.8479194641113281, - "num_tokens": 7714017.0, - "step": 840 - }, - { - "epoch": 0.6390577507598785, - "grad_norm": 3.0674357414245605, - "learning_rate": 4.647042185245848e-06, - "loss": 0.4668245315551758, - "mean_token_accuracy": 0.8381714820861816, - "num_tokens": 7717801.0, - "step": 841 - }, - { - "epoch": 0.6398176291793313, - "grad_norm": 1.5672820806503296, - "learning_rate": 4.645968509530381e-06, - "loss": 0.4428741931915283, - "mean_token_accuracy": 0.8416479825973511, - "num_tokens": 7728342.0, - "step": 842 - }, - { - "epoch": 0.6405775075987842, - "grad_norm": 2.3042354583740234, - "learning_rate": 4.644893327691608e-06, - "loss": 0.49937760829925537, - "mean_token_accuracy": 0.827070951461792, - "num_tokens": 7734576.0, - "step": 843 - }, - { - "epoch": 0.6413373860182371, - "grad_norm": 2.057772159576416, - "learning_rate": 4.6438166404841316e-06, - "loss": 0.5912986993789673, - "mean_token_accuracy": 0.805509090423584, - "num_tokens": 7742481.0, - "step": 844 - }, - { - "epoch": 0.64209726443769, - "grad_norm": 1.9688186645507812, - "learning_rate": 4.6427384486636115e-06, - "loss": 0.482401967048645, - "mean_token_accuracy": 0.8358086347579956, - "num_tokens": 7750002.0, - "step": 845 - }, - { - "epoch": 0.6428571428571429, - "grad_norm": 2.6852948665618896, - "learning_rate": 4.6416587529867665e-06, - "loss": 0.5479315519332886, - "mean_token_accuracy": 0.8091106414794922, - "num_tokens": 7755578.0, - "step": 846 - }, - { - "epoch": 0.6436170212765957, - "grad_norm": 2.0547337532043457, - "learning_rate": 4.640577554211366e-06, - "loss": 0.5327274203300476, - "mean_token_accuracy": 0.8280376195907593, - "num_tokens": 7763513.0, - "step": 847 - }, - { - "epoch": 0.6443768996960486, - "grad_norm": 2.0328633785247803, - "learning_rate": 4.63949485309624e-06, - "loss": 0.4814409613609314, - "mean_token_accuracy": 0.8527672290802002, - "num_tokens": 7771131.0, - "step": 848 - }, - { - "epoch": 0.6451367781155015, - "grad_norm": 1.5892863273620605, - "learning_rate": 4.638410650401267e-06, - "loss": 0.4492785334587097, - "mean_token_accuracy": 0.846997857093811, - "num_tokens": 7781572.0, - "step": 849 - }, - { - "epoch": 0.6458966565349544, - "grad_norm": 1.8295910358428955, - "learning_rate": 4.637324946887384e-06, - "loss": 0.37088239192962646, - "mean_token_accuracy": 0.8616628646850586, - "num_tokens": 7788604.0, - "step": 850 - }, - { - "epoch": 0.6466565349544073, - "grad_norm": 3.380040168762207, - "learning_rate": 4.636237743316578e-06, - "loss": 0.4737280607223511, - "mean_token_accuracy": 0.855940580368042, - "num_tokens": 7792504.0, - "step": 851 - }, - { - "epoch": 0.6474164133738601, - "grad_norm": 2.8790009021759033, - "learning_rate": 4.635149040451891e-06, - "loss": 0.39790448546409607, - "mean_token_accuracy": 0.8710698485374451, - "num_tokens": 7796333.0, - "step": 852 - }, - { - "epoch": 0.648176291793313, - "grad_norm": 1.914914608001709, - "learning_rate": 4.634058839057417e-06, - "loss": 0.2954312562942505, - "mean_token_accuracy": 0.8880234956741333, - "num_tokens": 7802456.0, - "step": 853 - }, - { - "epoch": 0.648936170212766, - "grad_norm": 1.3709120750427246, - "learning_rate": 4.632967139898301e-06, - "loss": 0.43224576115608215, - "mean_token_accuracy": 0.8446190357208252, - "num_tokens": 7816770.0, - "step": 854 - }, - { - "epoch": 0.6496960486322189, - "grad_norm": 1.6579312086105347, - "learning_rate": 4.63187394374074e-06, - "loss": 0.3535553514957428, - "mean_token_accuracy": 0.8738704919815063, - "num_tokens": 7824963.0, - "step": 855 - }, - { - "epoch": 0.6504559270516718, - "grad_norm": 2.4055678844451904, - "learning_rate": 4.63077925135198e-06, - "loss": 0.5078744292259216, - "mean_token_accuracy": 0.8430874347686768, - "num_tokens": 7830962.0, - "step": 856 - }, - { - "epoch": 0.6512158054711246, - "grad_norm": 2.5171499252319336, - "learning_rate": 4.629683063500319e-06, - "loss": 0.5172419548034668, - "mean_token_accuracy": 0.8087141513824463, - "num_tokens": 7836638.0, - "step": 857 - }, - { - "epoch": 0.6519756838905775, - "grad_norm": 1.7588486671447754, - "learning_rate": 4.628585380955104e-06, - "loss": 0.5759496092796326, - "mean_token_accuracy": 0.8043236136436462, - "num_tokens": 7844654.0, - "step": 858 - }, - { - "epoch": 0.6527355623100304, - "grad_norm": 1.5887070894241333, - "learning_rate": 4.62748620448673e-06, - "loss": 0.41849038004875183, - "mean_token_accuracy": 0.8556643724441528, - "num_tokens": 7855642.0, - "step": 859 - }, - { - "epoch": 0.6534954407294833, - "grad_norm": 3.227942705154419, - "learning_rate": 4.626385534866642e-06, - "loss": 0.5279449224472046, - "mean_token_accuracy": 0.8250958323478699, - "num_tokens": 7859890.0, - "step": 860 - }, - { - "epoch": 0.6542553191489362, - "grad_norm": 2.440467119216919, - "learning_rate": 4.625283372867333e-06, - "loss": 0.5294933319091797, - "mean_token_accuracy": 0.8235013484954834, - "num_tokens": 7866766.0, - "step": 861 - }, - { - "epoch": 0.6550151975683891, - "grad_norm": 2.4106903076171875, - "learning_rate": 4.624179719262342e-06, - "loss": 0.5662813186645508, - "mean_token_accuracy": 0.8061668872833252, - "num_tokens": 7872809.0, - "step": 862 - }, - { - "epoch": 0.6557750759878419, - "grad_norm": 3.5151145458221436, - "learning_rate": 4.623074574826254e-06, - "loss": 0.5471097230911255, - "mean_token_accuracy": 0.8220691084861755, - "num_tokens": 7876136.0, - "step": 863 - }, - { - "epoch": 0.6565349544072948, - "grad_norm": 1.5319840908050537, - "learning_rate": 4.621967940334705e-06, - "loss": 0.4178982377052307, - "mean_token_accuracy": 0.8517135977745056, - "num_tokens": 7886113.0, - "step": 864 - }, - { - "epoch": 0.6572948328267477, - "grad_norm": 1.63701331615448, - "learning_rate": 4.620859816564371e-06, - "loss": 0.4666512608528137, - "mean_token_accuracy": 0.8223508596420288, - "num_tokens": 7897982.0, - "step": 865 - }, - { - "epoch": 0.6580547112462006, - "grad_norm": 2.1515414714813232, - "learning_rate": 4.619750204292978e-06, - "loss": 0.5359305143356323, - "mean_token_accuracy": 0.8192868232727051, - "num_tokens": 7904947.0, - "step": 866 - }, - { - "epoch": 0.6588145896656535, - "grad_norm": 2.2140955924987793, - "learning_rate": 4.618639104299294e-06, - "loss": 0.5275633931159973, - "mean_token_accuracy": 0.8120715618133545, - "num_tokens": 7913913.0, - "step": 867 - }, - { - "epoch": 0.6595744680851063, - "grad_norm": 1.3956893682479858, - "learning_rate": 4.6175265173631304e-06, - "loss": 0.4378768503665924, - "mean_token_accuracy": 0.8479125499725342, - "num_tokens": 7927979.0, - "step": 868 - }, - { - "epoch": 0.6603343465045592, - "grad_norm": 2.98103928565979, - "learning_rate": 4.616412444265344e-06, - "loss": 0.42614591121673584, - "mean_token_accuracy": 0.8595094680786133, - "num_tokens": 7934293.0, - "step": 869 - }, - { - "epoch": 0.6610942249240122, - "grad_norm": 2.554845094680786, - "learning_rate": 4.6152968857878365e-06, - "loss": 0.3698030412197113, - "mean_token_accuracy": 0.8717041015625, - "num_tokens": 7938547.0, - "step": 870 - }, - { - "epoch": 0.6618541033434651, - "grad_norm": 3.0901825428009033, - "learning_rate": 4.6141798427135475e-06, - "loss": 0.5037497282028198, - "mean_token_accuracy": 0.8354041576385498, - "num_tokens": 7942829.0, - "step": 871 - }, - { - "epoch": 0.662613981762918, - "grad_norm": 2.8692073822021484, - "learning_rate": 4.6130613158264605e-06, - "loss": 0.5418164134025574, - "mean_token_accuracy": 0.8298909664154053, - "num_tokens": 7949303.0, - "step": 872 - }, - { - "epoch": 0.6633738601823708, - "grad_norm": 3.960404396057129, - "learning_rate": 4.611941305911602e-06, - "loss": 0.6284480094909668, - "mean_token_accuracy": 0.837495744228363, - "num_tokens": 7952486.0, - "step": 873 - }, - { - "epoch": 0.6641337386018237, - "grad_norm": 2.6690115928649902, - "learning_rate": 4.610819813755038e-06, - "loss": 0.5214360952377319, - "mean_token_accuracy": 0.8213508129119873, - "num_tokens": 7957559.0, - "step": 874 - }, - { - "epoch": 0.6648936170212766, - "grad_norm": 2.3376171588897705, - "learning_rate": 4.609696840143875e-06, - "loss": 0.46887528896331787, - "mean_token_accuracy": 0.8438819646835327, - "num_tokens": 7962826.0, - "step": 875 - }, - { - "epoch": 0.6656534954407295, - "grad_norm": 2.2222683429718018, - "learning_rate": 4.6085723858662575e-06, - "loss": 0.5607719421386719, - "mean_token_accuracy": 0.8128405809402466, - "num_tokens": 7970131.0, - "step": 876 - }, - { - "epoch": 0.6664133738601824, - "grad_norm": 2.069091558456421, - "learning_rate": 4.607446451711372e-06, - "loss": 0.506301760673523, - "mean_token_accuracy": 0.8256827592849731, - "num_tokens": 7977524.0, - "step": 877 - }, - { - "epoch": 0.6671732522796353, - "grad_norm": 1.3724967241287231, - "learning_rate": 4.606319038469443e-06, - "loss": 0.43285101652145386, - "mean_token_accuracy": 0.8525032997131348, - "num_tokens": 7989174.0, - "step": 878 - }, - { - "epoch": 0.6679331306990881, - "grad_norm": 2.278205156326294, - "learning_rate": 4.605190146931731e-06, - "loss": 0.4845905303955078, - "mean_token_accuracy": 0.8284652829170227, - "num_tokens": 7998524.0, - "step": 879 - }, - { - "epoch": 0.668693009118541, - "grad_norm": 1.3871766328811646, - "learning_rate": 4.604059777890537e-06, - "loss": 0.5736679434776306, - "mean_token_accuracy": 0.8223285675048828, - "num_tokens": 8015776.0, - "step": 880 - }, - { - "epoch": 0.6694528875379939, - "grad_norm": 1.926164984703064, - "learning_rate": 4.602927932139197e-06, - "loss": 0.4133230447769165, - "mean_token_accuracy": 0.8653768301010132, - "num_tokens": 8022979.0, - "step": 881 - }, - { - "epoch": 0.6702127659574468, - "grad_norm": 2.109272003173828, - "learning_rate": 4.601794610472083e-06, - "loss": 0.7005600929260254, - "mean_token_accuracy": 0.7777010202407837, - "num_tokens": 8032618.0, - "step": 882 - }, - { - "epoch": 0.6709726443768997, - "grad_norm": 2.077977418899536, - "learning_rate": 4.6006598136846056e-06, - "loss": 0.5278208255767822, - "mean_token_accuracy": 0.8230358958244324, - "num_tokens": 8040534.0, - "step": 883 - }, - { - "epoch": 0.6717325227963525, - "grad_norm": 1.678581714630127, - "learning_rate": 4.599523542573207e-06, - "loss": 0.4955351650714874, - "mean_token_accuracy": 0.8270003795623779, - "num_tokens": 8052249.0, - "step": 884 - }, - { - "epoch": 0.6724924012158054, - "grad_norm": 2.0751662254333496, - "learning_rate": 4.598385797935368e-06, - "loss": 0.5266247987747192, - "mean_token_accuracy": 0.8263581991195679, - "num_tokens": 8060600.0, - "step": 885 - }, - { - "epoch": 0.6732522796352584, - "grad_norm": 2.418405771255493, - "learning_rate": 4.5972465805696e-06, - "loss": 0.4481425881385803, - "mean_token_accuracy": 0.846164345741272, - "num_tokens": 8066025.0, - "step": 886 - }, - { - "epoch": 0.6740121580547113, - "grad_norm": 2.3936474323272705, - "learning_rate": 4.596105891275449e-06, - "loss": 0.4553404450416565, - "mean_token_accuracy": 0.8412896394729614, - "num_tokens": 8071544.0, - "step": 887 - }, - { - "epoch": 0.6747720364741642, - "grad_norm": 2.2024407386779785, - "learning_rate": 4.594963730853497e-06, - "loss": 0.6218541860580444, - "mean_token_accuracy": 0.7890232801437378, - "num_tokens": 8079061.0, - "step": 888 - }, - { - "epoch": 0.675531914893617, - "grad_norm": 2.51015567779541, - "learning_rate": 4.593820100105355e-06, - "loss": 0.5149124264717102, - "mean_token_accuracy": 0.8241918087005615, - "num_tokens": 8084293.0, - "step": 889 - }, - { - "epoch": 0.6762917933130699, - "grad_norm": 1.8748939037322998, - "learning_rate": 4.5926749998336665e-06, - "loss": 0.50836181640625, - "mean_token_accuracy": 0.8067223429679871, - "num_tokens": 8092511.0, - "step": 890 - }, - { - "epoch": 0.6770516717325228, - "grad_norm": 1.801193118095398, - "learning_rate": 4.5915284308421075e-06, - "loss": 0.4372861683368683, - "mean_token_accuracy": 0.8510604500770569, - "num_tokens": 8101174.0, - "step": 891 - }, - { - "epoch": 0.6778115501519757, - "grad_norm": 2.6476457118988037, - "learning_rate": 4.590380393935383e-06, - "loss": 0.38700711727142334, - "mean_token_accuracy": 0.8659796714782715, - "num_tokens": 8105398.0, - "step": 892 - }, - { - "epoch": 0.6785714285714286, - "grad_norm": 1.1147183179855347, - "learning_rate": 4.589230889919232e-06, - "loss": 0.38546115159988403, - "mean_token_accuracy": 0.8570581674575806, - "num_tokens": 8127394.0, - "step": 893 - }, - { - "epoch": 0.6793313069908815, - "grad_norm": 2.908905506134033, - "learning_rate": 4.588079919600419e-06, - "loss": 0.5108504295349121, - "mean_token_accuracy": 0.8121406435966492, - "num_tokens": 8131801.0, - "step": 894 - }, - { - "epoch": 0.6800911854103343, - "grad_norm": 3.1522326469421387, - "learning_rate": 4.586927483786739e-06, - "loss": 0.44059112668037415, - "mean_token_accuracy": 0.8448011875152588, - "num_tokens": 8154416.0, - "step": 895 - }, - { - "epoch": 0.6808510638297872, - "grad_norm": 1.5142440795898438, - "learning_rate": 4.585773583287017e-06, - "loss": 0.513217568397522, - "mean_token_accuracy": 0.8386049270629883, - "num_tokens": 8171156.0, - "step": 896 - }, - { - "epoch": 0.6816109422492401, - "grad_norm": 2.597881317138672, - "learning_rate": 4.584618218911104e-06, - "loss": 0.4937712550163269, - "mean_token_accuracy": 0.8223681449890137, - "num_tokens": 8176124.0, - "step": 897 - }, - { - "epoch": 0.682370820668693, - "grad_norm": 1.8185619115829468, - "learning_rate": 4.583461391469879e-06, - "loss": 0.519811749458313, - "mean_token_accuracy": 0.8169777393341064, - "num_tokens": 8185136.0, - "step": 898 - }, - { - "epoch": 0.6831306990881459, - "grad_norm": 3.2061994075775146, - "learning_rate": 4.582303101775249e-06, - "loss": 0.4655115008354187, - "mean_token_accuracy": 0.8425977230072021, - "num_tokens": 8188864.0, - "step": 899 - }, - { - "epoch": 0.6838905775075987, - "grad_norm": 1.3485229015350342, - "learning_rate": 4.581143350640146e-06, - "loss": 0.5014470815658569, - "mean_token_accuracy": 0.8273109197616577, - "num_tokens": 8203460.0, - "step": 900 - }, - { - "epoch": 0.6846504559270516, - "grad_norm": 1.3264713287353516, - "learning_rate": 4.579982138878527e-06, - "loss": 0.5073703527450562, - "mean_token_accuracy": 0.8259357213973999, - "num_tokens": 8219348.0, - "step": 901 - }, - { - "epoch": 0.6854103343465046, - "grad_norm": 2.4436347484588623, - "learning_rate": 4.578819467305375e-06, - "loss": 0.47020310163497925, - "mean_token_accuracy": 0.8567265272140503, - "num_tokens": 8224427.0, - "step": 902 - }, - { - "epoch": 0.6861702127659575, - "grad_norm": 1.921749234199524, - "learning_rate": 4.5776553367367e-06, - "loss": 0.622514009475708, - "mean_token_accuracy": 0.7863982319831848, - "num_tokens": 8233151.0, - "step": 903 - }, - { - "epoch": 0.6869300911854104, - "grad_norm": 1.8815616369247437, - "learning_rate": 4.576489747989532e-06, - "loss": 0.4910545349121094, - "mean_token_accuracy": 0.8147122859954834, - "num_tokens": 8240762.0, - "step": 904 - }, - { - "epoch": 0.6876899696048632, - "grad_norm": 1.2366989850997925, - "learning_rate": 4.575322701881926e-06, - "loss": 0.3947566747665405, - "mean_token_accuracy": 0.873993992805481, - "num_tokens": 8259381.0, - "step": 905 - }, - { - "epoch": 0.6884498480243161, - "grad_norm": 1.5767735242843628, - "learning_rate": 4.57415419923296e-06, - "loss": 0.57136070728302, - "mean_token_accuracy": 0.8028088808059692, - "num_tokens": 8273296.0, - "step": 906 - }, - { - "epoch": 0.689209726443769, - "grad_norm": 2.378675699234009, - "learning_rate": 4.572984240862733e-06, - "loss": 0.5894849896430969, - "mean_token_accuracy": 0.7977708578109741, - "num_tokens": 8280083.0, - "step": 907 - }, - { - "epoch": 0.6899696048632219, - "grad_norm": 2.0401132106781006, - "learning_rate": 4.57181282759237e-06, - "loss": 0.5524613261222839, - "mean_token_accuracy": 0.8138598203659058, - "num_tokens": 8288236.0, - "step": 908 - }, - { - "epoch": 0.6907294832826748, - "grad_norm": 2.293701648712158, - "learning_rate": 4.570639960244011e-06, - "loss": 0.5154546499252319, - "mean_token_accuracy": 0.8234660625457764, - "num_tokens": 8294493.0, - "step": 909 - }, - { - "epoch": 0.6914893617021277, - "grad_norm": 1.9286527633666992, - "learning_rate": 4.56946563964082e-06, - "loss": 0.5364264845848083, - "mean_token_accuracy": 0.8147368431091309, - "num_tokens": 8303441.0, - "step": 910 - }, - { - "epoch": 0.6922492401215805, - "grad_norm": 1.2571251392364502, - "learning_rate": 4.5682898666069815e-06, - "loss": 0.43535223603248596, - "mean_token_accuracy": 0.859239935874939, - "num_tokens": 8321548.0, - "step": 911 - }, - { - "epoch": 0.6930091185410334, - "grad_norm": 1.2224860191345215, - "learning_rate": 4.567112641967697e-06, - "loss": 0.40205076336860657, - "mean_token_accuracy": 0.8724711537361145, - "num_tokens": 8335205.0, - "step": 912 - }, - { - "epoch": 0.6937689969604863, - "grad_norm": 1.2064491510391235, - "learning_rate": 4.5659339665491894e-06, - "loss": 0.37790587544441223, - "mean_token_accuracy": 0.8464339971542358, - "num_tokens": 8350926.0, - "step": 913 - }, - { - "epoch": 0.6945288753799392, - "grad_norm": 2.1755270957946777, - "learning_rate": 4.5647538411786965e-06, - "loss": 0.42034298181533813, - "mean_token_accuracy": 0.84148108959198, - "num_tokens": 8356739.0, - "step": 914 - }, - { - "epoch": 0.6952887537993921, - "grad_norm": 1.234864592552185, - "learning_rate": 4.563572266684478e-06, - "loss": 0.5062938332557678, - "mean_token_accuracy": 0.8132052421569824, - "num_tokens": 8373660.0, - "step": 915 - }, - { - "epoch": 0.6960486322188449, - "grad_norm": 2.4250621795654297, - "learning_rate": 4.562389243895807e-06, - "loss": 0.4907791018486023, - "mean_token_accuracy": 0.8337979912757874, - "num_tokens": 8378661.0, - "step": 916 - }, - { - "epoch": 0.6968085106382979, - "grad_norm": 1.5018314123153687, - "learning_rate": 4.561204773642974e-06, - "loss": 0.41041281819343567, - "mean_token_accuracy": 0.8569784164428711, - "num_tokens": 8390322.0, - "step": 917 - }, - { - "epoch": 0.6975683890577508, - "grad_norm": 2.797269344329834, - "learning_rate": 4.5600188567572874e-06, - "loss": 0.3146931529045105, - "mean_token_accuracy": 0.8913302421569824, - "num_tokens": 8393567.0, - "step": 918 - }, - { - "epoch": 0.6983282674772037, - "grad_norm": 1.4002827405929565, - "learning_rate": 4.558831494071069e-06, - "loss": 0.4275597333908081, - "mean_token_accuracy": 0.8504893779754639, - "num_tokens": 8407119.0, - "step": 919 - }, - { - "epoch": 0.6990881458966566, - "grad_norm": 1.7045831680297852, - "learning_rate": 4.557642686417654e-06, - "loss": 0.49593430757522583, - "mean_token_accuracy": 0.8185091018676758, - "num_tokens": 8417408.0, - "step": 920 - }, - { - "epoch": 0.6998480243161094, - "grad_norm": 2.8818066120147705, - "learning_rate": 4.556452434631396e-06, - "loss": 0.637908935546875, - "mean_token_accuracy": 0.7883946895599365, - "num_tokens": 8422319.0, - "step": 921 - }, - { - "epoch": 0.7006079027355623, - "grad_norm": 2.3587265014648438, - "learning_rate": 4.555260739547657e-06, - "loss": 0.38749319314956665, - "mean_token_accuracy": 0.8774704933166504, - "num_tokens": 8427315.0, - "step": 922 - }, - { - "epoch": 0.7013677811550152, - "grad_norm": 1.6648749113082886, - "learning_rate": 4.554067602002815e-06, - "loss": 0.4044865369796753, - "mean_token_accuracy": 0.8524141311645508, - "num_tokens": 8438662.0, - "step": 923 - }, - { - "epoch": 0.7021276595744681, - "grad_norm": 3.467787742614746, - "learning_rate": 4.55287302283426e-06, - "loss": 0.591016411781311, - "mean_token_accuracy": 0.81184983253479, - "num_tokens": 8442237.0, - "step": 924 - }, - { - "epoch": 0.702887537993921, - "grad_norm": 2.1458635330200195, - "learning_rate": 4.551677002880395e-06, - "loss": 0.5017476677894592, - "mean_token_accuracy": 0.822914183139801, - "num_tokens": 8449494.0, - "step": 925 - }, - { - "epoch": 0.7036474164133738, - "grad_norm": 2.521714448928833, - "learning_rate": 4.550479542980632e-06, - "loss": 0.531912088394165, - "mean_token_accuracy": 0.8225687742233276, - "num_tokens": 8454983.0, - "step": 926 - }, - { - "epoch": 0.7044072948328267, - "grad_norm": 3.5248100757598877, - "learning_rate": 4.549280643975394e-06, - "loss": 0.4631815254688263, - "mean_token_accuracy": 0.8443771600723267, - "num_tokens": 8458504.0, - "step": 927 - }, - { - "epoch": 0.7051671732522796, - "grad_norm": 2.5105819702148438, - "learning_rate": 4.548080306706114e-06, - "loss": 0.30487123131752014, - "mean_token_accuracy": 0.9018767476081848, - "num_tokens": 8462589.0, - "step": 928 - }, - { - "epoch": 0.7059270516717325, - "grad_norm": 1.3367713689804077, - "learning_rate": 4.5468785320152365e-06, - "loss": 0.4355026185512543, - "mean_token_accuracy": 0.8323584794998169, - "num_tokens": 8478450.0, - "step": 929 - }, - { - "epoch": 0.7066869300911854, - "grad_norm": 2.2506282329559326, - "learning_rate": 4.545675320746212e-06, - "loss": 0.5082957744598389, - "mean_token_accuracy": 0.823430597782135, - "num_tokens": 8485991.0, - "step": 930 - }, - { - "epoch": 0.7074468085106383, - "grad_norm": 1.7164632081985474, - "learning_rate": 4.544470673743502e-06, - "loss": 0.3960164785385132, - "mean_token_accuracy": 0.8592486381530762, - "num_tokens": 8495217.0, - "step": 931 - }, - { - "epoch": 0.7082066869300911, - "grad_norm": 1.5864969491958618, - "learning_rate": 4.543264591852572e-06, - "loss": 0.49114471673965454, - "mean_token_accuracy": 0.8330780267715454, - "num_tokens": 8508904.0, - "step": 932 - }, - { - "epoch": 0.708966565349544, - "grad_norm": 2.1707003116607666, - "learning_rate": 4.542057075919898e-06, - "loss": 0.49895772337913513, - "mean_token_accuracy": 0.8327431082725525, - "num_tokens": 8515792.0, - "step": 933 - }, - { - "epoch": 0.709726443768997, - "grad_norm": 1.9002083539962769, - "learning_rate": 4.54084812679296e-06, - "loss": 0.4548531472682953, - "mean_token_accuracy": 0.834532618522644, - "num_tokens": 8524006.0, - "step": 934 - }, - { - "epoch": 0.7104863221884499, - "grad_norm": 1.8505141735076904, - "learning_rate": 4.539637745320247e-06, - "loss": 0.35716521739959717, - "mean_token_accuracy": 0.872222900390625, - "num_tokens": 8533647.0, - "step": 935 - }, - { - "epoch": 0.7112462006079028, - "grad_norm": 2.092620849609375, - "learning_rate": 4.53842593235125e-06, - "loss": 0.4673694372177124, - "mean_token_accuracy": 0.8460999131202698, - "num_tokens": 8540734.0, - "step": 936 - }, - { - "epoch": 0.7120060790273556, - "grad_norm": 2.689514636993408, - "learning_rate": 4.537212688736466e-06, - "loss": 0.45461273193359375, - "mean_token_accuracy": 0.8450704216957092, - "num_tokens": 8544948.0, - "step": 937 - }, - { - "epoch": 0.7127659574468085, - "grad_norm": 2.4507734775543213, - "learning_rate": 4.535998015327396e-06, - "loss": 0.4571906626224518, - "mean_token_accuracy": 0.8429360389709473, - "num_tokens": 8550445.0, - "step": 938 - }, - { - "epoch": 0.7135258358662614, - "grad_norm": 1.8960013389587402, - "learning_rate": 4.534781912976546e-06, - "loss": 0.4461391568183899, - "mean_token_accuracy": 0.8487973213195801, - "num_tokens": 8557630.0, - "step": 939 - }, - { - "epoch": 0.7142857142857143, - "grad_norm": 1.602611780166626, - "learning_rate": 4.533564382537421e-06, - "loss": 0.5277102589607239, - "mean_token_accuracy": 0.8330916166305542, - "num_tokens": 8570397.0, - "step": 940 - }, - { - "epoch": 0.7150455927051672, - "grad_norm": 1.8936395645141602, - "learning_rate": 4.532345424864533e-06, - "loss": 0.38619571924209595, - "mean_token_accuracy": 0.8514572381973267, - "num_tokens": 8582673.0, - "step": 941 - }, - { - "epoch": 0.71580547112462, - "grad_norm": 1.3898619413375854, - "learning_rate": 4.531125040813392e-06, - "loss": 0.4825032949447632, - "mean_token_accuracy": 0.833012580871582, - "num_tokens": 8597239.0, - "step": 942 - }, - { - "epoch": 0.7165653495440729, - "grad_norm": 2.128230571746826, - "learning_rate": 4.529903231240511e-06, - "loss": 0.4862118065357208, - "mean_token_accuracy": 0.8210917711257935, - "num_tokens": 8605877.0, - "step": 943 - }, - { - "epoch": 0.7173252279635258, - "grad_norm": 1.6552259922027588, - "learning_rate": 4.528679997003403e-06, - "loss": 0.5092059373855591, - "mean_token_accuracy": 0.8247389793395996, - "num_tokens": 8617060.0, - "step": 944 - }, - { - "epoch": 0.7180851063829787, - "grad_norm": 2.1174771785736084, - "learning_rate": 4.52745533896058e-06, - "loss": 0.39110174775123596, - "mean_token_accuracy": 0.8672944903373718, - "num_tokens": 8623306.0, - "step": 945 - }, - { - "epoch": 0.7188449848024316, - "grad_norm": 2.8648383617401123, - "learning_rate": 4.526229257971556e-06, - "loss": 0.49864327907562256, - "mean_token_accuracy": 0.8305130004882812, - "num_tokens": 8627466.0, - "step": 946 - }, - { - "epoch": 0.7196048632218845, - "grad_norm": 2.155514717102051, - "learning_rate": 4.52500175489684e-06, - "loss": 0.5070191025733948, - "mean_token_accuracy": 0.8311188817024231, - "num_tokens": 8634759.0, - "step": 947 - }, - { - "epoch": 0.7203647416413373, - "grad_norm": 1.8432683944702148, - "learning_rate": 4.523772830597942e-06, - "loss": 0.5569252371788025, - "mean_token_accuracy": 0.8070821762084961, - "num_tokens": 8644160.0, - "step": 948 - }, - { - "epoch": 0.7211246200607903, - "grad_norm": 2.8912241458892822, - "learning_rate": 4.522542485937369e-06, - "loss": 0.4799427390098572, - "mean_token_accuracy": 0.8443552851676941, - "num_tokens": 8648377.0, - "step": 949 - }, - { - "epoch": 0.7218844984802432, - "grad_norm": 3.3449625968933105, - "learning_rate": 4.521310721778622e-06, - "loss": 0.44043463468551636, - "mean_token_accuracy": 0.8521315455436707, - "num_tokens": 8651846.0, - "step": 950 - }, - { - "epoch": 0.7226443768996961, - "grad_norm": 1.4127917289733887, - "learning_rate": 4.520077538986203e-06, - "loss": 0.4700999855995178, - "mean_token_accuracy": 0.8377952575683594, - "num_tokens": 8665199.0, - "step": 951 - }, - { - "epoch": 0.723404255319149, - "grad_norm": 2.1607301235198975, - "learning_rate": 4.518842938425606e-06, - "loss": 0.4374256730079651, - "mean_token_accuracy": 0.8448896408081055, - "num_tokens": 8672158.0, - "step": 952 - }, - { - "epoch": 0.7241641337386018, - "grad_norm": 1.3442779779434204, - "learning_rate": 4.51760692096332e-06, - "loss": 0.38948923349380493, - "mean_token_accuracy": 0.8598923683166504, - "num_tokens": 8684532.0, - "step": 953 - }, - { - "epoch": 0.7249240121580547, - "grad_norm": 2.0003178119659424, - "learning_rate": 4.516369487466832e-06, - "loss": 0.3797217011451721, - "mean_token_accuracy": 0.8652102947235107, - "num_tokens": 8691460.0, - "step": 954 - }, - { - "epoch": 0.7256838905775076, - "grad_norm": 1.8196535110473633, - "learning_rate": 4.5151306388046175e-06, - "loss": 0.5676811933517456, - "mean_token_accuracy": 0.818500816822052, - "num_tokens": 8701624.0, - "step": 955 - }, - { - "epoch": 0.7264437689969605, - "grad_norm": 2.1962296962738037, - "learning_rate": 4.513890375846152e-06, - "loss": 0.45399484038352966, - "mean_token_accuracy": 0.8463879227638245, - "num_tokens": 8707410.0, - "step": 956 - }, - { - "epoch": 0.7272036474164134, - "grad_norm": 1.8798872232437134, - "learning_rate": 4.512648699461897e-06, - "loss": 0.5679811239242554, - "mean_token_accuracy": 0.8089900016784668, - "num_tokens": 8715630.0, - "step": 957 - }, - { - "epoch": 0.7279635258358662, - "grad_norm": 2.3540258407592773, - "learning_rate": 4.511405610523309e-06, - "loss": 0.5282865762710571, - "mean_token_accuracy": 0.8196114301681519, - "num_tokens": 8721934.0, - "step": 958 - }, - { - "epoch": 0.7287234042553191, - "grad_norm": 2.5630908012390137, - "learning_rate": 4.510161109902837e-06, - "loss": 0.39442378282546997, - "mean_token_accuracy": 0.8400980830192566, - "num_tokens": 8726511.0, - "step": 959 - }, - { - "epoch": 0.729483282674772, - "grad_norm": 1.9829226732254028, - "learning_rate": 4.508915198473919e-06, - "loss": 0.4611976742744446, - "mean_token_accuracy": 0.8439624309539795, - "num_tokens": 8733460.0, - "step": 960 - }, - { - "epoch": 0.7302431610942249, - "grad_norm": 3.0291950702667236, - "learning_rate": 4.507667877110982e-06, - "loss": 0.5158340930938721, - "mean_token_accuracy": 0.8300060033798218, - "num_tokens": 8737629.0, - "step": 961 - }, - { - "epoch": 0.7310030395136778, - "grad_norm": 1.9208252429962158, - "learning_rate": 4.506419146689445e-06, - "loss": 0.3807099163532257, - "mean_token_accuracy": 0.871469259262085, - "num_tokens": 8744615.0, - "step": 962 - }, - { - "epoch": 0.7317629179331308, - "grad_norm": 3.051565408706665, - "learning_rate": 4.505169008085717e-06, - "loss": 0.38461726903915405, - "mean_token_accuracy": 0.874465823173523, - "num_tokens": 8748154.0, - "step": 963 - }, - { - "epoch": 0.7325227963525835, - "grad_norm": 1.375466227531433, - "learning_rate": 4.503917462177192e-06, - "loss": 0.42490679025650024, - "mean_token_accuracy": 0.8457326889038086, - "num_tokens": 8760965.0, - "step": 964 - }, - { - "epoch": 0.7332826747720365, - "grad_norm": 2.216681957244873, - "learning_rate": 4.5026645098422515e-06, - "loss": 0.43149900436401367, - "mean_token_accuracy": 0.8527278900146484, - "num_tokens": 8766996.0, - "step": 965 - }, - { - "epoch": 0.7340425531914894, - "grad_norm": 1.9422595500946045, - "learning_rate": 4.5014101519602684e-06, - "loss": 0.4964504539966583, - "mean_token_accuracy": 0.8137556314468384, - "num_tokens": 8774411.0, - "step": 966 - }, - { - "epoch": 0.7348024316109423, - "grad_norm": 2.058887004852295, - "learning_rate": 4.500154389411598e-06, - "loss": 0.4977570176124573, - "mean_token_accuracy": 0.8254626989364624, - "num_tokens": 8782220.0, - "step": 967 - }, - { - "epoch": 0.7355623100303952, - "grad_norm": 2.9977786540985107, - "learning_rate": 4.498897223077582e-06, - "loss": 0.4061415195465088, - "mean_token_accuracy": 0.8752427101135254, - "num_tokens": 8786120.0, - "step": 968 - }, - { - "epoch": 0.736322188449848, - "grad_norm": 2.2636303901672363, - "learning_rate": 4.49763865384055e-06, - "loss": 0.5062161087989807, - "mean_token_accuracy": 0.8171653747558594, - "num_tokens": 8792459.0, - "step": 969 - }, - { - "epoch": 0.7370820668693009, - "grad_norm": 1.8850842714309692, - "learning_rate": 4.496378682583813e-06, - "loss": 0.5014280676841736, - "mean_token_accuracy": 0.8547511100769043, - "num_tokens": 8800675.0, - "step": 970 - }, - { - "epoch": 0.7378419452887538, - "grad_norm": 1.191985011100769, - "learning_rate": 4.495117310191667e-06, - "loss": 0.4713883101940155, - "mean_token_accuracy": 0.8213596343994141, - "num_tokens": 8820740.0, - "step": 971 - }, - { - "epoch": 0.7386018237082067, - "grad_norm": 1.823000192642212, - "learning_rate": 4.493854537549393e-06, - "loss": 0.46332645416259766, - "mean_token_accuracy": 0.8359860777854919, - "num_tokens": 8828884.0, - "step": 972 - }, - { - "epoch": 0.7393617021276596, - "grad_norm": 2.590446949005127, - "learning_rate": 4.492590365543253e-06, - "loss": 0.49074703454971313, - "mean_token_accuracy": 0.8433758020401001, - "num_tokens": 8833859.0, - "step": 973 - }, - { - "epoch": 0.7401215805471124, - "grad_norm": 2.2762670516967773, - "learning_rate": 4.491324795060491e-06, - "loss": 0.39465656876564026, - "mean_token_accuracy": 0.8734766244888306, - "num_tokens": 8839350.0, - "step": 974 - }, - { - "epoch": 0.7408814589665653, - "grad_norm": 2.698725461959839, - "learning_rate": 4.490057826989333e-06, - "loss": 0.5552085041999817, - "mean_token_accuracy": 0.8132266998291016, - "num_tokens": 8844373.0, - "step": 975 - }, - { - "epoch": 0.7416413373860182, - "grad_norm": 2.704606294631958, - "learning_rate": 4.488789462218988e-06, - "loss": 0.3447791635990143, - "mean_token_accuracy": 0.8736170530319214, - "num_tokens": 8848236.0, - "step": 976 - }, - { - "epoch": 0.7424012158054711, - "grad_norm": 3.1260716915130615, - "learning_rate": 4.487519701639641e-06, - "loss": 0.5945233702659607, - "mean_token_accuracy": 0.7997599840164185, - "num_tokens": 8852935.0, - "step": 977 - }, - { - "epoch": 0.743161094224924, - "grad_norm": 1.6895452737808228, - "learning_rate": 4.486248546142459e-06, - "loss": 0.4823892116546631, - "mean_token_accuracy": 0.8279662132263184, - "num_tokens": 8861743.0, - "step": 978 - }, - { - "epoch": 0.743920972644377, - "grad_norm": 1.9161452054977417, - "learning_rate": 4.4849759966195885e-06, - "loss": 0.5266581773757935, - "mean_token_accuracy": 0.8218623399734497, - "num_tokens": 8870601.0, - "step": 979 - }, - { - "epoch": 0.7446808510638298, - "grad_norm": 1.6894301176071167, - "learning_rate": 4.483702053964154e-06, - "loss": 0.4186219573020935, - "mean_token_accuracy": 0.8471781015396118, - "num_tokens": 8885617.0, - "step": 980 - }, - { - "epoch": 0.7454407294832827, - "grad_norm": 1.6319992542266846, - "learning_rate": 4.482426719070258e-06, - "loss": 0.541317880153656, - "mean_token_accuracy": 0.8216162323951721, - "num_tokens": 8897595.0, - "step": 981 - }, - { - "epoch": 0.7462006079027356, - "grad_norm": 5.102413177490234, - "learning_rate": 4.4811499928329775e-06, - "loss": 0.3928517699241638, - "mean_token_accuracy": 0.858033299446106, - "num_tokens": 8901682.0, - "step": 982 - }, - { - "epoch": 0.7469604863221885, - "grad_norm": 2.213860273361206, - "learning_rate": 4.479871876148368e-06, - "loss": 0.4276347756385803, - "mean_token_accuracy": 0.8529798984527588, - "num_tokens": 8908088.0, - "step": 983 - }, - { - "epoch": 0.7477203647416414, - "grad_norm": 1.2180038690567017, - "learning_rate": 4.478592369913464e-06, - "loss": 0.3941590189933777, - "mean_token_accuracy": 0.8608149290084839, - "num_tokens": 8925876.0, - "step": 984 - }, - { - "epoch": 0.7484802431610942, - "grad_norm": 2.849802255630493, - "learning_rate": 4.477311475026271e-06, - "loss": 0.42190325260162354, - "mean_token_accuracy": 0.860505223274231, - "num_tokens": 8930190.0, - "step": 985 - }, - { - "epoch": 0.7492401215805471, - "grad_norm": 1.704128384590149, - "learning_rate": 4.476029192385769e-06, - "loss": 0.4786282777786255, - "mean_token_accuracy": 0.8302322626113892, - "num_tokens": 8938340.0, - "step": 986 - }, - { - "epoch": 0.75, - "grad_norm": 2.06322979927063, - "learning_rate": 4.474745522891915e-06, - "loss": 0.4648786187171936, - "mean_token_accuracy": 0.8366481065750122, - "num_tokens": 8944633.0, - "step": 987 - }, - { - "epoch": 0.7507598784194529, - "grad_norm": 2.0745396614074707, - "learning_rate": 4.473460467445637e-06, - "loss": 0.5744885206222534, - "mean_token_accuracy": 0.8357284069061279, - "num_tokens": 8954457.0, - "step": 988 - }, - { - "epoch": 0.7515197568389058, - "grad_norm": 1.9281407594680786, - "learning_rate": 4.472174026948836e-06, - "loss": 0.528974175453186, - "mean_token_accuracy": 0.8083580732345581, - "num_tokens": 8962701.0, - "step": 989 - }, - { - "epoch": 0.7522796352583586, - "grad_norm": 3.012381076812744, - "learning_rate": 4.470886202304385e-06, - "loss": 0.48754751682281494, - "mean_token_accuracy": 0.8368391990661621, - "num_tokens": 8967272.0, - "step": 990 - }, - { - "epoch": 0.7530395136778115, - "grad_norm": 1.691826581954956, - "learning_rate": 4.469596994416131e-06, - "loss": 0.484740674495697, - "mean_token_accuracy": 0.8500643968582153, - "num_tokens": 8976615.0, - "step": 991 - }, - { - "epoch": 0.7537993920972644, - "grad_norm": 2.4961965084075928, - "learning_rate": 4.468306404188887e-06, - "loss": 0.50777268409729, - "mean_token_accuracy": 0.8168395757675171, - "num_tokens": 8983235.0, - "step": 992 - }, - { - "epoch": 0.7545592705167173, - "grad_norm": 1.512007713317871, - "learning_rate": 4.467014432528441e-06, - "loss": 0.4583340287208557, - "mean_token_accuracy": 0.8465162515640259, - "num_tokens": 8993815.0, - "step": 993 - }, - { - "epoch": 0.7553191489361702, - "grad_norm": 1.9362257719039917, - "learning_rate": 4.465721080341547e-06, - "loss": 0.6027892827987671, - "mean_token_accuracy": 0.8052380084991455, - "num_tokens": 9002697.0, - "step": 994 - }, - { - "epoch": 0.756079027355623, - "grad_norm": 2.473632335662842, - "learning_rate": 4.4644263485359316e-06, - "loss": 0.5394320487976074, - "mean_token_accuracy": 0.834665834903717, - "num_tokens": 9007428.0, - "step": 995 - }, - { - "epoch": 0.756838905775076, - "grad_norm": 2.2527434825897217, - "learning_rate": 4.463130238020284e-06, - "loss": 0.5485198497772217, - "mean_token_accuracy": 0.8090173006057739, - "num_tokens": 9013570.0, - "step": 996 - }, - { - "epoch": 0.7575987841945289, - "grad_norm": 1.4130940437316895, - "learning_rate": 4.4618327497042676e-06, - "loss": 0.37994423508644104, - "mean_token_accuracy": 0.8625167012214661, - "num_tokens": 9025485.0, - "step": 997 - }, - { - "epoch": 0.7583586626139818, - "grad_norm": 2.685115098953247, - "learning_rate": 4.460533884498509e-06, - "loss": 0.447973370552063, - "mean_token_accuracy": 0.8564165234565735, - "num_tokens": 9030355.0, - "step": 998 - }, - { - "epoch": 0.7591185410334347, - "grad_norm": 3.2743139266967773, - "learning_rate": 4.4592336433146e-06, - "loss": 0.45275989174842834, - "mean_token_accuracy": 0.8462578058242798, - "num_tokens": 9034406.0, - "step": 999 - }, - { - "epoch": 0.7598784194528876, - "grad_norm": 1.9383049011230469, - "learning_rate": 4.457932027065102e-06, - "loss": 0.5387729406356812, - "mean_token_accuracy": 0.8357330560684204, - "num_tokens": 9041502.0, - "step": 1000 - }, - { - "epoch": 0.7606382978723404, - "grad_norm": 2.7348275184631348, - "learning_rate": 4.456629036663537e-06, - "loss": 0.4448447823524475, - "mean_token_accuracy": 0.8453642129898071, - "num_tokens": 9046088.0, - "step": 1001 - }, - { - "epoch": 0.7613981762917933, - "grad_norm": 1.8477401733398438, - "learning_rate": 4.455324673024396e-06, - "loss": 0.5766505002975464, - "mean_token_accuracy": 0.8074213862419128, - "num_tokens": 9055678.0, - "step": 1002 - }, - { - "epoch": 0.7621580547112462, - "grad_norm": 3.134481430053711, - "learning_rate": 4.4540189370631315e-06, - "loss": 0.5690872669219971, - "mean_token_accuracy": 0.8414670825004578, - "num_tokens": 9062006.0, - "step": 1003 - }, - { - "epoch": 0.7629179331306991, - "grad_norm": 1.7933398485183716, - "learning_rate": 4.452711829696158e-06, - "loss": 0.4898291826248169, - "mean_token_accuracy": 0.8259007930755615, - "num_tokens": 9070754.0, - "step": 1004 - }, - { - "epoch": 0.763677811550152, - "grad_norm": 1.2552275657653809, - "learning_rate": 4.451403351840855e-06, - "loss": 0.4280198812484741, - "mean_token_accuracy": 0.8409112691879272, - "num_tokens": 9085306.0, - "step": 1005 - }, - { - "epoch": 0.7644376899696048, - "grad_norm": 1.6749331951141357, - "learning_rate": 4.450093504415562e-06, - "loss": 0.3723178505897522, - "mean_token_accuracy": 0.8545734882354736, - "num_tokens": 9102453.0, - "step": 1006 - }, - { - "epoch": 0.7651975683890577, - "grad_norm": 2.7514500617980957, - "learning_rate": 4.44878228833958e-06, - "loss": 0.5463190674781799, - "mean_token_accuracy": 0.8121639490127563, - "num_tokens": 9108342.0, - "step": 1007 - }, - { - "epoch": 0.7659574468085106, - "grad_norm": 1.3322733640670776, - "learning_rate": 4.447469704533172e-06, - "loss": 0.573723316192627, - "mean_token_accuracy": 0.8065711259841919, - "num_tokens": 9123712.0, - "step": 1008 - }, - { - "epoch": 0.7667173252279635, - "grad_norm": 2.6893765926361084, - "learning_rate": 4.446155753917559e-06, - "loss": 0.6856257915496826, - "mean_token_accuracy": 0.7718256711959839, - "num_tokens": 9130728.0, - "step": 1009 - }, - { - "epoch": 0.7674772036474165, - "grad_norm": 1.792765498161316, - "learning_rate": 4.444840437414923e-06, - "loss": 0.48203110694885254, - "mean_token_accuracy": 0.8419194221496582, - "num_tokens": 9137983.0, - "step": 1010 - }, - { - "epoch": 0.7682370820668692, - "grad_norm": 1.4957399368286133, - "learning_rate": 4.443523755948401e-06, - "loss": 0.4372181296348572, - "mean_token_accuracy": 0.8491764664649963, - "num_tokens": 9148081.0, - "step": 1011 - }, - { - "epoch": 0.7689969604863222, - "grad_norm": 1.7294867038726807, - "learning_rate": 4.442205710442095e-06, - "loss": 0.54277503490448, - "mean_token_accuracy": 0.8196806907653809, - "num_tokens": 9158407.0, - "step": 1012 - }, - { - "epoch": 0.7697568389057751, - "grad_norm": 2.2091221809387207, - "learning_rate": 4.4408863018210564e-06, - "loss": 0.4888187646865845, - "mean_token_accuracy": 0.8384175300598145, - "num_tokens": 9164754.0, - "step": 1013 - }, - { - "epoch": 0.770516717325228, - "grad_norm": 1.7615830898284912, - "learning_rate": 4.439565531011299e-06, - "loss": 0.4640008211135864, - "mean_token_accuracy": 0.8424701690673828, - "num_tokens": 9172715.0, - "step": 1014 - }, - { - "epoch": 0.7712765957446809, - "grad_norm": 1.6796128749847412, - "learning_rate": 4.43824339893979e-06, - "loss": 0.5227609276771545, - "mean_token_accuracy": 0.8135923743247986, - "num_tokens": 9183214.0, - "step": 1015 - }, - { - "epoch": 0.7720364741641338, - "grad_norm": 2.1485698223114014, - "learning_rate": 4.436919906534452e-06, - "loss": 0.4857056140899658, - "mean_token_accuracy": 0.8323013782501221, - "num_tokens": 9190360.0, - "step": 1016 - }, - { - "epoch": 0.7727963525835866, - "grad_norm": 2.7842206954956055, - "learning_rate": 4.4355950547241645e-06, - "loss": 0.46406883001327515, - "mean_token_accuracy": 0.859869122505188, - "num_tokens": 9194523.0, - "step": 1017 - }, - { - "epoch": 0.7735562310030395, - "grad_norm": 2.3774640560150146, - "learning_rate": 4.434268844438758e-06, - "loss": 0.5625549554824829, - "mean_token_accuracy": 0.8188897371292114, - "num_tokens": 9201155.0, - "step": 1018 - }, - { - "epoch": 0.7743161094224924, - "grad_norm": 2.004427909851074, - "learning_rate": 4.432941276609018e-06, - "loss": 0.5164387226104736, - "mean_token_accuracy": 0.829569935798645, - "num_tokens": 9209269.0, - "step": 1019 - }, - { - "epoch": 0.7750759878419453, - "grad_norm": 1.7218989133834839, - "learning_rate": 4.431612352166684e-06, - "loss": 0.481005996465683, - "mean_token_accuracy": 0.8359906673431396, - "num_tokens": 9220860.0, - "step": 1020 - }, - { - "epoch": 0.7758358662613982, - "grad_norm": 2.197108507156372, - "learning_rate": 4.4302820720444454e-06, - "loss": 0.440413236618042, - "mean_token_accuracy": 0.8412867784500122, - "num_tokens": 9226414.0, - "step": 1021 - }, - { - "epoch": 0.776595744680851, - "grad_norm": 2.6995162963867188, - "learning_rate": 4.428950437175944e-06, - "loss": 0.3884299397468567, - "mean_token_accuracy": 0.8696021437644958, - "num_tokens": 9230898.0, - "step": 1022 - }, - { - "epoch": 0.7773556231003039, - "grad_norm": 2.1671667098999023, - "learning_rate": 4.427617448495772e-06, - "loss": 0.5747478008270264, - "mean_token_accuracy": 0.7842930555343628, - "num_tokens": 9238479.0, - "step": 1023 - }, - { - "epoch": 0.7781155015197568, - "grad_norm": 1.6299028396606445, - "learning_rate": 4.426283106939474e-06, - "loss": 0.39478403329849243, - "mean_token_accuracy": 0.8685503602027893, - "num_tokens": 9248263.0, - "step": 1024 - }, - { - "epoch": 0.7788753799392097, - "grad_norm": 2.2621798515319824, - "learning_rate": 4.424947413443539e-06, - "loss": 0.4582178592681885, - "mean_token_accuracy": 0.8312377333641052, - "num_tokens": 9254168.0, - "step": 1025 - }, - { - "epoch": 0.7796352583586627, - "grad_norm": 2.121091365814209, - "learning_rate": 4.423610368945411e-06, - "loss": 0.5315121412277222, - "mean_token_accuracy": 0.8121483325958252, - "num_tokens": 9261808.0, - "step": 1026 - }, - { - "epoch": 0.7803951367781155, - "grad_norm": 1.8558297157287598, - "learning_rate": 4.422271974383479e-06, - "loss": 0.4299176037311554, - "mean_token_accuracy": 0.8452648520469666, - "num_tokens": 9269264.0, - "step": 1027 - }, - { - "epoch": 0.7811550151975684, - "grad_norm": 1.9089949131011963, - "learning_rate": 4.420932230697079e-06, - "loss": 0.43876272439956665, - "mean_token_accuracy": 0.8434094190597534, - "num_tokens": 9277381.0, - "step": 1028 - }, - { - "epoch": 0.7819148936170213, - "grad_norm": 1.8619649410247803, - "learning_rate": 4.419591138826495e-06, - "loss": 0.48798668384552, - "mean_token_accuracy": 0.8281317353248596, - "num_tokens": 9285413.0, - "step": 1029 - }, - { - "epoch": 0.7826747720364742, - "grad_norm": 1.3273087739944458, - "learning_rate": 4.418248699712955e-06, - "loss": 0.4611460864543915, - "mean_token_accuracy": 0.8233213424682617, - "num_tokens": 9300805.0, - "step": 1030 - }, - { - "epoch": 0.7834346504559271, - "grad_norm": 1.0473746061325073, - "learning_rate": 4.416904914298637e-06, - "loss": 0.36537665128707886, - "mean_token_accuracy": 0.8671857118606567, - "num_tokens": 9320035.0, - "step": 1031 - }, - { - "epoch": 0.78419452887538, - "grad_norm": 1.9130918979644775, - "learning_rate": 4.415559783526661e-06, - "loss": 0.4916655123233795, - "mean_token_accuracy": 0.8266351222991943, - "num_tokens": 9326795.0, - "step": 1032 - }, - { - "epoch": 0.7849544072948328, - "grad_norm": 2.0001816749572754, - "learning_rate": 4.414213308341092e-06, - "loss": 0.5711008310317993, - "mean_token_accuracy": 0.8093076348304749, - "num_tokens": 9335625.0, - "step": 1033 - }, - { - "epoch": 0.7857142857142857, - "grad_norm": 3.933542251586914, - "learning_rate": 4.412865489686936e-06, - "loss": 0.621616542339325, - "mean_token_accuracy": 0.7938898801803589, - "num_tokens": 9339080.0, - "step": 1034 - }, - { - "epoch": 0.7864741641337386, - "grad_norm": 2.061558961868286, - "learning_rate": 4.411516328510145e-06, - "loss": 0.583686113357544, - "mean_token_accuracy": 0.8216883540153503, - "num_tokens": 9348581.0, - "step": 1035 - }, - { - "epoch": 0.7872340425531915, - "grad_norm": 1.9401264190673828, - "learning_rate": 4.410165825757613e-06, - "loss": 0.4905240535736084, - "mean_token_accuracy": 0.8229951858520508, - "num_tokens": 9356032.0, - "step": 1036 - }, - { - "epoch": 0.7879939209726444, - "grad_norm": 3.620547294616699, - "learning_rate": 4.408813982377175e-06, - "loss": 0.4269888997077942, - "mean_token_accuracy": 0.8713940978050232, - "num_tokens": 9359061.0, - "step": 1037 - }, - { - "epoch": 0.7887537993920972, - "grad_norm": 1.2027851343154907, - "learning_rate": 4.407460799317605e-06, - "loss": 0.39972418546676636, - "mean_token_accuracy": 0.8610097765922546, - "num_tokens": 9377068.0, - "step": 1038 - }, - { - "epoch": 0.7895136778115501, - "grad_norm": 2.566753387451172, - "learning_rate": 4.40610627752862e-06, - "loss": 0.45267152786254883, - "mean_token_accuracy": 0.83243328332901, - "num_tokens": 9383604.0, - "step": 1039 - }, - { - "epoch": 0.790273556231003, - "grad_norm": 2.940094470977783, - "learning_rate": 4.404750417960876e-06, - "loss": 0.42862242460250854, - "mean_token_accuracy": 0.8582849502563477, - "num_tokens": 9387541.0, - "step": 1040 - }, - { - "epoch": 0.791033434650456, - "grad_norm": 2.0223944187164307, - "learning_rate": 4.403393221565966e-06, - "loss": 0.4349963665008545, - "mean_token_accuracy": 0.8453047871589661, - "num_tokens": 9394382.0, - "step": 1041 - }, - { - "epoch": 0.7917933130699089, - "grad_norm": 2.9399030208587646, - "learning_rate": 4.402034689296425e-06, - "loss": 0.32197174429893494, - "mean_token_accuracy": 0.8953392505645752, - "num_tokens": 9397741.0, - "step": 1042 - }, - { - "epoch": 0.7925531914893617, - "grad_norm": 2.819016456604004, - "learning_rate": 4.400674822105721e-06, - "loss": 0.6790289878845215, - "mean_token_accuracy": 0.8135063648223877, - "num_tokens": 9403509.0, - "step": 1043 - }, - { - "epoch": 0.7933130699088146, - "grad_norm": 1.3225977420806885, - "learning_rate": 4.399313620948262e-06, - "loss": 0.42203834652900696, - "mean_token_accuracy": 0.8399381637573242, - "num_tokens": 9418870.0, - "step": 1044 - }, - { - "epoch": 0.7940729483282675, - "grad_norm": 1.7822176218032837, - "learning_rate": 4.397951086779392e-06, - "loss": 0.4666554927825928, - "mean_token_accuracy": 0.8364764451980591, - "num_tokens": 9427640.0, - "step": 1045 - }, - { - "epoch": 0.7948328267477204, - "grad_norm": 3.186439037322998, - "learning_rate": 4.396587220555389e-06, - "loss": 0.6048363447189331, - "mean_token_accuracy": 0.7806557416915894, - "num_tokens": 9431927.0, - "step": 1046 - }, - { - "epoch": 0.7955927051671733, - "grad_norm": 3.0804805755615234, - "learning_rate": 4.395222023233467e-06, - "loss": 0.445969820022583, - "mean_token_accuracy": 0.850671112537384, - "num_tokens": 9436136.0, - "step": 1047 - }, - { - "epoch": 0.7963525835866262, - "grad_norm": 1.675968885421753, - "learning_rate": 4.393855495771774e-06, - "loss": 0.4311422109603882, - "mean_token_accuracy": 0.8449079990386963, - "num_tokens": 9445189.0, - "step": 1048 - }, - { - "epoch": 0.797112462006079, - "grad_norm": 2.342410087585449, - "learning_rate": 4.3924876391293915e-06, - "loss": 0.5733606219291687, - "mean_token_accuracy": 0.8156592845916748, - "num_tokens": 9451939.0, - "step": 1049 - }, - { - "epoch": 0.7978723404255319, - "grad_norm": 1.5967470407485962, - "learning_rate": 4.391118454266335e-06, - "loss": 0.46664729714393616, - "mean_token_accuracy": 0.8091695308685303, - "num_tokens": 9463968.0, - "step": 1050 - }, - { - "epoch": 0.7986322188449848, - "grad_norm": 1.5777863264083862, - "learning_rate": 4.389747942143549e-06, - "loss": 0.46028903126716614, - "mean_token_accuracy": 0.8347330093383789, - "num_tokens": 9475561.0, - "step": 1051 - }, - { - "epoch": 0.7993920972644377, - "grad_norm": 2.7630488872528076, - "learning_rate": 4.388376103722914e-06, - "loss": 0.5618188977241516, - "mean_token_accuracy": 0.8273467421531677, - "num_tokens": 9480661.0, - "step": 1052 - }, - { - "epoch": 0.8001519756838906, - "grad_norm": 2.093397378921509, - "learning_rate": 4.387002939967237e-06, - "loss": 0.2998353838920593, - "mean_token_accuracy": 0.8905231952667236, - "num_tokens": 9485924.0, - "step": 1053 - }, - { - "epoch": 0.8009118541033434, - "grad_norm": 1.4385871887207031, - "learning_rate": 4.38562845184026e-06, - "loss": 0.4944111704826355, - "mean_token_accuracy": 0.8403056263923645, - "num_tokens": 9500128.0, - "step": 1054 - }, - { - "epoch": 0.8016717325227963, - "grad_norm": 1.6393156051635742, - "learning_rate": 4.384252640306649e-06, - "loss": 0.5727907419204712, - "mean_token_accuracy": 0.7849414348602295, - "num_tokens": 9511569.0, - "step": 1055 - }, - { - "epoch": 0.8024316109422492, - "grad_norm": 2.3909664154052734, - "learning_rate": 4.382875506332002e-06, - "loss": 0.4760419726371765, - "mean_token_accuracy": 0.8408266305923462, - "num_tokens": 9517244.0, - "step": 1056 - }, - { - "epoch": 0.8031914893617021, - "grad_norm": 1.7288594245910645, - "learning_rate": 4.381497050882845e-06, - "loss": 0.5375926494598389, - "mean_token_accuracy": 0.8138614892959595, - "num_tokens": 9528736.0, - "step": 1057 - }, - { - "epoch": 0.8039513677811551, - "grad_norm": 2.093407392501831, - "learning_rate": 4.380117274926632e-06, - "loss": 0.46659404039382935, - "mean_token_accuracy": 0.8450702428817749, - "num_tokens": 9536200.0, - "step": 1058 - }, - { - "epoch": 0.8047112462006079, - "grad_norm": 1.6835898160934448, - "learning_rate": 4.3787361794317405e-06, - "loss": 0.43157699704170227, - "mean_token_accuracy": 0.8279973268508911, - "num_tokens": 9546314.0, - "step": 1059 - }, - { - "epoch": 0.8054711246200608, - "grad_norm": 1.983067512512207, - "learning_rate": 4.377353765367479e-06, - "loss": 0.5021739602088928, - "mean_token_accuracy": 0.8274815082550049, - "num_tokens": 9554375.0, - "step": 1060 - }, - { - "epoch": 0.8062310030395137, - "grad_norm": 2.0472030639648438, - "learning_rate": 4.375970033704078e-06, - "loss": 0.34298190474510193, - "mean_token_accuracy": 0.8900876045227051, - "num_tokens": 9560230.0, - "step": 1061 - }, - { - "epoch": 0.8069908814589666, - "grad_norm": 1.9613717794418335, - "learning_rate": 4.374584985412692e-06, - "loss": 0.3826758861541748, - "mean_token_accuracy": 0.839923620223999, - "num_tokens": 9566809.0, - "step": 1062 - }, - { - "epoch": 0.8077507598784195, - "grad_norm": 1.991289496421814, - "learning_rate": 4.373198621465405e-06, - "loss": 0.5492525100708008, - "mean_token_accuracy": 0.8153272867202759, - "num_tokens": 9576810.0, - "step": 1063 - }, - { - "epoch": 0.8085106382978723, - "grad_norm": 2.421370506286621, - "learning_rate": 4.3718109428352155e-06, - "loss": 0.5240297317504883, - "mean_token_accuracy": 0.8087242245674133, - "num_tokens": 9582906.0, - "step": 1064 - }, - { - "epoch": 0.8092705167173252, - "grad_norm": 3.697765588760376, - "learning_rate": 4.370421950496055e-06, - "loss": 0.6096476912498474, - "mean_token_accuracy": 0.787585973739624, - "num_tokens": 9586920.0, - "step": 1065 - }, - { - "epoch": 0.8100303951367781, - "grad_norm": 2.0767786502838135, - "learning_rate": 4.369031645422768e-06, - "loss": 0.41120079159736633, - "mean_token_accuracy": 0.8513731956481934, - "num_tokens": 9593902.0, - "step": 1066 - }, - { - "epoch": 0.810790273556231, - "grad_norm": 2.5968732833862305, - "learning_rate": 4.367640028591126e-06, - "loss": 0.3364982008934021, - "mean_token_accuracy": 0.8786963224411011, - "num_tokens": 9597745.0, - "step": 1067 - }, - { - "epoch": 0.8115501519756839, - "grad_norm": 2.165742874145508, - "learning_rate": 4.366247100977818e-06, - "loss": 0.406129390001297, - "mean_token_accuracy": 0.868243932723999, - "num_tokens": 9603496.0, - "step": 1068 - }, - { - "epoch": 0.8123100303951368, - "grad_norm": 2.0493404865264893, - "learning_rate": 4.364852863560456e-06, - "loss": 0.5356296300888062, - "mean_token_accuracy": 0.8191947340965271, - "num_tokens": 9610898.0, - "step": 1069 - }, - { - "epoch": 0.8130699088145896, - "grad_norm": 2.3224308490753174, - "learning_rate": 4.363457317317568e-06, - "loss": 0.41461923718452454, - "mean_token_accuracy": 0.8537945747375488, - "num_tokens": 9616626.0, - "step": 1070 - }, - { - "epoch": 0.8138297872340425, - "grad_norm": 1.7387986183166504, - "learning_rate": 4.362060463228603e-06, - "loss": 0.5134786367416382, - "mean_token_accuracy": 0.8511737585067749, - "num_tokens": 9626223.0, - "step": 1071 - }, - { - "epoch": 0.8145896656534954, - "grad_norm": 3.0270655155181885, - "learning_rate": 4.360662302273926e-06, - "loss": 0.3410695791244507, - "mean_token_accuracy": 0.8746449947357178, - "num_tokens": 9629455.0, - "step": 1072 - }, - { - "epoch": 0.8153495440729484, - "grad_norm": 1.7727062702178955, - "learning_rate": 4.35926283543482e-06, - "loss": 0.4610968828201294, - "mean_token_accuracy": 0.8444793224334717, - "num_tokens": 9638070.0, - "step": 1073 - }, - { - "epoch": 0.8161094224924013, - "grad_norm": 3.6333565711975098, - "learning_rate": 4.357862063693486e-06, - "loss": 0.3881273865699768, - "mean_token_accuracy": 0.8757344484329224, - "num_tokens": 9641028.0, - "step": 1074 - }, - { - "epoch": 0.8168693009118541, - "grad_norm": 3.024042844772339, - "learning_rate": 4.356459988033039e-06, - "loss": 0.3853808641433716, - "mean_token_accuracy": 0.8602254390716553, - "num_tokens": 9645730.0, - "step": 1075 - }, - { - "epoch": 0.817629179331307, - "grad_norm": 2.3359482288360596, - "learning_rate": 4.355056609437509e-06, - "loss": 0.4852045476436615, - "mean_token_accuracy": 0.8502728343009949, - "num_tokens": 9650975.0, - "step": 1076 - }, - { - "epoch": 0.8183890577507599, - "grad_norm": 2.2390685081481934, - "learning_rate": 4.353651928891842e-06, - "loss": 0.5287341475486755, - "mean_token_accuracy": 0.8247801065444946, - "num_tokens": 9657471.0, - "step": 1077 - }, - { - "epoch": 0.8191489361702128, - "grad_norm": 2.3809144496917725, - "learning_rate": 4.352245947381897e-06, - "loss": 0.5218510627746582, - "mean_token_accuracy": 0.8149170875549316, - "num_tokens": 9664108.0, - "step": 1078 - }, - { - "epoch": 0.8199088145896657, - "grad_norm": 1.7072309255599976, - "learning_rate": 4.3508386658944455e-06, - "loss": 0.46481168270111084, - "mean_token_accuracy": 0.834963321685791, - "num_tokens": 9673175.0, - "step": 1079 - }, - { - "epoch": 0.8206686930091185, - "grad_norm": 1.7383702993392944, - "learning_rate": 4.349430085417171e-06, - "loss": 0.4505952000617981, - "mean_token_accuracy": 0.8507769107818604, - "num_tokens": 9682800.0, - "step": 1080 - }, - { - "epoch": 0.8214285714285714, - "grad_norm": 2.4308547973632812, - "learning_rate": 4.348020206938672e-06, - "loss": 0.4832455515861511, - "mean_token_accuracy": 0.8538393974304199, - "num_tokens": 9688123.0, - "step": 1081 - }, - { - "epoch": 0.8221884498480243, - "grad_norm": 2.2686192989349365, - "learning_rate": 4.3466090314484526e-06, - "loss": 0.5112563371658325, - "mean_token_accuracy": 0.8308460712432861, - "num_tokens": 9694299.0, - "step": 1082 - }, - { - "epoch": 0.8229483282674772, - "grad_norm": 2.806093454360962, - "learning_rate": 4.345196559936931e-06, - "loss": 0.4818246364593506, - "mean_token_accuracy": 0.86617112159729, - "num_tokens": 9698471.0, - "step": 1083 - }, - { - "epoch": 0.8237082066869301, - "grad_norm": 1.7340706586837769, - "learning_rate": 4.343782793395435e-06, - "loss": 0.38246971368789673, - "mean_token_accuracy": 0.8675198554992676, - "num_tokens": 9706444.0, - "step": 1084 - }, - { - "epoch": 0.824468085106383, - "grad_norm": 1.664942741394043, - "learning_rate": 4.3423677328162e-06, - "loss": 0.498797208070755, - "mean_token_accuracy": 0.8447319865226746, - "num_tokens": 9716765.0, - "step": 1085 - }, - { - "epoch": 0.8252279635258358, - "grad_norm": 1.3608235120773315, - "learning_rate": 4.340951379192369e-06, - "loss": 0.41961491107940674, - "mean_token_accuracy": 0.8339346647262573, - "num_tokens": 9729564.0, - "step": 1086 - }, - { - "epoch": 0.8259878419452887, - "grad_norm": 1.642503261566162, - "learning_rate": 4.3395337335179945e-06, - "loss": 0.5477945804595947, - "mean_token_accuracy": 0.8117889761924744, - "num_tokens": 9741217.0, - "step": 1087 - }, - { - "epoch": 0.8267477203647416, - "grad_norm": 3.0345044136047363, - "learning_rate": 4.338114796788035e-06, - "loss": 0.5024623870849609, - "mean_token_accuracy": 0.8333141207695007, - "num_tokens": 9744941.0, - "step": 1088 - }, - { - "epoch": 0.8275075987841946, - "grad_norm": 1.3096630573272705, - "learning_rate": 4.336694569998354e-06, - "loss": 0.44169723987579346, - "mean_token_accuracy": 0.859926700592041, - "num_tokens": 9757854.0, - "step": 1089 - }, - { - "epoch": 0.8282674772036475, - "grad_norm": 2.203279495239258, - "learning_rate": 4.3352730541457215e-06, - "loss": 0.5283265113830566, - "mean_token_accuracy": 0.8053759932518005, - "num_tokens": 9764096.0, - "step": 1090 - }, - { - "epoch": 0.8290273556231003, - "grad_norm": 1.3774312734603882, - "learning_rate": 4.333850250227814e-06, - "loss": 0.4584103226661682, - "mean_token_accuracy": 0.8342611193656921, - "num_tokens": 9777768.0, - "step": 1091 - }, - { - "epoch": 0.8297872340425532, - "grad_norm": 1.822637915611267, - "learning_rate": 4.332426159243206e-06, - "loss": 0.5432791709899902, - "mean_token_accuracy": 0.8136210441589355, - "num_tokens": 9791276.0, - "step": 1092 - }, - { - "epoch": 0.8305471124620061, - "grad_norm": 3.0190067291259766, - "learning_rate": 4.331000782191384e-06, - "loss": 0.5018150806427002, - "mean_token_accuracy": 0.8234807252883911, - "num_tokens": 9794902.0, - "step": 1093 - }, - { - "epoch": 0.831306990881459, - "grad_norm": 2.09987735748291, - "learning_rate": 4.329574120072728e-06, - "loss": 0.4270891547203064, - "mean_token_accuracy": 0.8544977903366089, - "num_tokens": 9800903.0, - "step": 1094 - }, - { - "epoch": 0.8320668693009119, - "grad_norm": 1.969549536705017, - "learning_rate": 4.328146173888528e-06, - "loss": 0.45801427960395813, - "mean_token_accuracy": 0.8334714770317078, - "num_tokens": 9808719.0, - "step": 1095 - }, - { - "epoch": 0.8328267477203647, - "grad_norm": 1.4565571546554565, - "learning_rate": 4.32671694464097e-06, - "loss": 0.34864288568496704, - "mean_token_accuracy": 0.8689061999320984, - "num_tokens": 9818262.0, - "step": 1096 - }, - { - "epoch": 0.8335866261398176, - "grad_norm": 1.2163832187652588, - "learning_rate": 4.3252864333331424e-06, - "loss": 0.37953704595565796, - "mean_token_accuracy": 0.866554856300354, - "num_tokens": 9833942.0, - "step": 1097 - }, - { - "epoch": 0.8343465045592705, - "grad_norm": 1.6112010478973389, - "learning_rate": 4.323854640969033e-06, - "loss": 0.5442801713943481, - "mean_token_accuracy": 0.8190416097640991, - "num_tokens": 9844765.0, - "step": 1098 - }, - { - "epoch": 0.8351063829787234, - "grad_norm": 1.8190315961837769, - "learning_rate": 4.322421568553529e-06, - "loss": 0.48271381855010986, - "mean_token_accuracy": 0.8203652501106262, - "num_tokens": 9852625.0, - "step": 1099 - }, - { - "epoch": 0.8358662613981763, - "grad_norm": 2.7897756099700928, - "learning_rate": 4.320987217092416e-06, - "loss": 0.4086323380470276, - "mean_token_accuracy": 0.8504934310913086, - "num_tokens": 9856888.0, - "step": 1100 - }, - { - "epoch": 0.8366261398176292, - "grad_norm": 1.7035977840423584, - "learning_rate": 4.319551587592377e-06, - "loss": 0.6325064301490784, - "mean_token_accuracy": 0.788190484046936, - "num_tokens": 9869419.0, - "step": 1101 - }, - { - "epoch": 0.837386018237082, - "grad_norm": 2.609731912612915, - "learning_rate": 4.318114681060989e-06, - "loss": 0.519314706325531, - "mean_token_accuracy": 0.8469992280006409, - "num_tokens": 9874553.0, - "step": 1102 - }, - { - "epoch": 0.8381458966565349, - "grad_norm": 1.2519766092300415, - "learning_rate": 4.316676498506735e-06, - "loss": 0.3566005825996399, - "mean_token_accuracy": 0.8588439226150513, - "num_tokens": 9886498.0, - "step": 1103 - }, - { - "epoch": 0.8389057750759878, - "grad_norm": 1.430892825126648, - "learning_rate": 4.3152370409389795e-06, - "loss": 0.5250182747840881, - "mean_token_accuracy": 0.8164948225021362, - "num_tokens": 9900256.0, - "step": 1104 - }, - { - "epoch": 0.8396656534954408, - "grad_norm": 3.1245436668395996, - "learning_rate": 4.3137963093679945e-06, - "loss": 0.3173971176147461, - "mean_token_accuracy": 0.8835347890853882, - "num_tokens": 9903899.0, - "step": 1105 - }, - { - "epoch": 0.8404255319148937, - "grad_norm": 3.131812572479248, - "learning_rate": 4.3123543048049395e-06, - "loss": 0.6567763090133667, - "mean_token_accuracy": 0.8233605027198792, - "num_tokens": 9908798.0, - "step": 1106 - }, - { - "epoch": 0.8411854103343465, - "grad_norm": 1.3551725149154663, - "learning_rate": 4.310911028261867e-06, - "loss": 0.3993729054927826, - "mean_token_accuracy": 0.8529655933380127, - "num_tokens": 9922577.0, - "step": 1107 - }, - { - "epoch": 0.8419452887537994, - "grad_norm": 2.572533130645752, - "learning_rate": 4.309466480751726e-06, - "loss": 0.40906503796577454, - "mean_token_accuracy": 0.8630726933479309, - "num_tokens": 9926890.0, - "step": 1108 - }, - { - "epoch": 0.8427051671732523, - "grad_norm": 1.9146469831466675, - "learning_rate": 4.308020663288356e-06, - "loss": 0.48423194885253906, - "mean_token_accuracy": 0.8370280861854553, - "num_tokens": 9934293.0, - "step": 1109 - }, - { - "epoch": 0.8434650455927052, - "grad_norm": 1.6178001165390015, - "learning_rate": 4.306573576886485e-06, - "loss": 0.4262213408946991, - "mean_token_accuracy": 0.839401125907898, - "num_tokens": 9944513.0, - "step": 1110 - }, - { - "epoch": 0.8442249240121581, - "grad_norm": 2.4444572925567627, - "learning_rate": 4.305125222561736e-06, - "loss": 0.5199950933456421, - "mean_token_accuracy": 0.8507720232009888, - "num_tokens": 9949512.0, - "step": 1111 - }, - { - "epoch": 0.8449848024316109, - "grad_norm": 1.7983134984970093, - "learning_rate": 4.303675601330618e-06, - "loss": 0.36155956983566284, - "mean_token_accuracy": 0.8568712472915649, - "num_tokens": 9956402.0, - "step": 1112 - }, - { - "epoch": 0.8457446808510638, - "grad_norm": 2.391096353530884, - "learning_rate": 4.302224714210532e-06, - "loss": 0.5391949415206909, - "mean_token_accuracy": 0.8183057308197021, - "num_tokens": 9961606.0, - "step": 1113 - }, - { - "epoch": 0.8465045592705167, - "grad_norm": 1.8520214557647705, - "learning_rate": 4.3007725622197675e-06, - "loss": 0.5758882761001587, - "mean_token_accuracy": 0.7924330234527588, - "num_tokens": 9971473.0, - "step": 1114 - }, - { - "epoch": 0.8472644376899696, - "grad_norm": 2.436640739440918, - "learning_rate": 4.2993191463775e-06, - "loss": 0.3837985396385193, - "mean_token_accuracy": 0.8620110750198364, - "num_tokens": 9976333.0, - "step": 1115 - }, - { - "epoch": 0.8480243161094225, - "grad_norm": 1.7287120819091797, - "learning_rate": 4.29786446770379e-06, - "loss": 0.40066856145858765, - "mean_token_accuracy": 0.8618333339691162, - "num_tokens": 9985617.0, - "step": 1116 - }, - { - "epoch": 0.8487841945288754, - "grad_norm": 2.0310518741607666, - "learning_rate": 4.296408527219592e-06, - "loss": 0.5465943217277527, - "mean_token_accuracy": 0.812044620513916, - "num_tokens": 9995363.0, - "step": 1117 - }, - { - "epoch": 0.8495440729483282, - "grad_norm": 1.4858589172363281, - "learning_rate": 4.294951325946737e-06, - "loss": 0.45840176939964294, - "mean_token_accuracy": 0.8432979583740234, - "num_tokens": 10006400.0, - "step": 1118 - }, - { - "epoch": 0.8503039513677811, - "grad_norm": 1.6153514385223389, - "learning_rate": 4.293492864907947e-06, - "loss": 0.5225611925125122, - "mean_token_accuracy": 0.8180211186408997, - "num_tokens": 10018352.0, - "step": 1119 - }, - { - "epoch": 0.851063829787234, - "grad_norm": 2.1178412437438965, - "learning_rate": 4.2920331451268246e-06, - "loss": 0.5580621361732483, - "mean_token_accuracy": 0.8211709260940552, - "num_tokens": 10025614.0, - "step": 1120 - }, - { - "epoch": 0.851823708206687, - "grad_norm": 2.036839246749878, - "learning_rate": 4.2905721676278585e-06, - "loss": 0.4658433198928833, - "mean_token_accuracy": 0.8380423784255981, - "num_tokens": 10032489.0, - "step": 1121 - }, - { - "epoch": 0.8525835866261399, - "grad_norm": 2.0056262016296387, - "learning_rate": 4.28910993343642e-06, - "loss": 0.47023308277130127, - "mean_token_accuracy": 0.8340359926223755, - "num_tokens": 10040050.0, - "step": 1122 - }, - { - "epoch": 0.8533434650455927, - "grad_norm": 2.540024518966675, - "learning_rate": 4.2876464435787576e-06, - "loss": 0.502303957939148, - "mean_token_accuracy": 0.8288739919662476, - "num_tokens": 10045042.0, - "step": 1123 - }, - { - "epoch": 0.8541033434650456, - "grad_norm": 1.7894693613052368, - "learning_rate": 4.286181699082008e-06, - "loss": 0.4732973575592041, - "mean_token_accuracy": 0.8340568542480469, - "num_tokens": 10054424.0, - "step": 1124 - }, - { - "epoch": 0.8548632218844985, - "grad_norm": 1.5601223707199097, - "learning_rate": 4.284715700974186e-06, - "loss": 0.472471684217453, - "mean_token_accuracy": 0.8274722695350647, - "num_tokens": 10065523.0, - "step": 1125 - }, - { - "epoch": 0.8556231003039514, - "grad_norm": 1.7326055765151978, - "learning_rate": 4.283248450284182e-06, - "loss": 0.5924872159957886, - "mean_token_accuracy": 0.7943467497825623, - "num_tokens": 10076839.0, - "step": 1126 - }, - { - "epoch": 0.8563829787234043, - "grad_norm": 1.5165479183197021, - "learning_rate": 4.281779948041772e-06, - "loss": 0.44768425822257996, - "mean_token_accuracy": 0.8394696712493896, - "num_tokens": 10088168.0, - "step": 1127 - }, - { - "epoch": 0.8571428571428571, - "grad_norm": 1.5448920726776123, - "learning_rate": 4.280310195277606e-06, - "loss": 0.4458175003528595, - "mean_token_accuracy": 0.835773229598999, - "num_tokens": 10100306.0, - "step": 1128 - }, - { - "epoch": 0.85790273556231, - "grad_norm": 1.6311609745025635, - "learning_rate": 4.278839193023214e-06, - "loss": 0.4158072769641876, - "mean_token_accuracy": 0.8482539653778076, - "num_tokens": 10110581.0, - "step": 1129 - }, - { - "epoch": 0.8586626139817629, - "grad_norm": 1.6714754104614258, - "learning_rate": 4.277366942311001e-06, - "loss": 0.3686875104904175, - "mean_token_accuracy": 0.8681533336639404, - "num_tokens": 10118799.0, - "step": 1130 - }, - { - "epoch": 0.8594224924012158, - "grad_norm": 2.1604413986206055, - "learning_rate": 4.2758934441742494e-06, - "loss": 0.37267982959747314, - "mean_token_accuracy": 0.8520427346229553, - "num_tokens": 10124734.0, - "step": 1131 - }, - { - "epoch": 0.8601823708206687, - "grad_norm": 2.123013973236084, - "learning_rate": 4.274418699647117e-06, - "loss": 0.49963313341140747, - "mean_token_accuracy": 0.8248758912086487, - "num_tokens": 10131965.0, - "step": 1132 - }, - { - "epoch": 0.8609422492401215, - "grad_norm": 1.4308786392211914, - "learning_rate": 4.272942709764638e-06, - "loss": 0.48666873574256897, - "mean_token_accuracy": 0.8304717540740967, - "num_tokens": 10145164.0, - "step": 1133 - }, - { - "epoch": 0.8617021276595744, - "grad_norm": 1.7952618598937988, - "learning_rate": 4.271465475562716e-06, - "loss": 0.5536223649978638, - "mean_token_accuracy": 0.8093959093093872, - "num_tokens": 10154083.0, - "step": 1134 - }, - { - "epoch": 0.8624620060790273, - "grad_norm": 2.0622456073760986, - "learning_rate": 4.269986998078132e-06, - "loss": 0.5173629522323608, - "mean_token_accuracy": 0.8285619020462036, - "num_tokens": 10161889.0, - "step": 1135 - }, - { - "epoch": 0.8632218844984803, - "grad_norm": 2.0707509517669678, - "learning_rate": 4.268507278348539e-06, - "loss": 0.5871608257293701, - "mean_token_accuracy": 0.7827386856079102, - "num_tokens": 10170726.0, - "step": 1136 - }, - { - "epoch": 0.8639817629179332, - "grad_norm": 2.054368257522583, - "learning_rate": 4.2670263174124615e-06, - "loss": 0.5788969993591309, - "mean_token_accuracy": 0.7967237234115601, - "num_tokens": 10178474.0, - "step": 1137 - }, - { - "epoch": 0.8647416413373861, - "grad_norm": 1.901846170425415, - "learning_rate": 4.265544116309294e-06, - "loss": 0.5405587553977966, - "mean_token_accuracy": 0.8151819705963135, - "num_tokens": 10187013.0, - "step": 1138 - }, - { - "epoch": 0.8655015197568389, - "grad_norm": 2.901285409927368, - "learning_rate": 4.264060676079302e-06, - "loss": 0.44101861119270325, - "mean_token_accuracy": 0.8433429002761841, - "num_tokens": 10191517.0, - "step": 1139 - }, - { - "epoch": 0.8662613981762918, - "grad_norm": 2.4168388843536377, - "learning_rate": 4.262575997763622e-06, - "loss": 0.4686204195022583, - "mean_token_accuracy": 0.8505309820175171, - "num_tokens": 10196948.0, - "step": 1140 - }, - { - "epoch": 0.8670212765957447, - "grad_norm": 1.9588396549224854, - "learning_rate": 4.2610900824042575e-06, - "loss": 0.47056013345718384, - "mean_token_accuracy": 0.8280024528503418, - "num_tokens": 10204292.0, - "step": 1141 - }, - { - "epoch": 0.8677811550151976, - "grad_norm": 2.569150924682617, - "learning_rate": 4.2596029310440826e-06, - "loss": 0.573108434677124, - "mean_token_accuracy": 0.8108246326446533, - "num_tokens": 10209571.0, - "step": 1142 - }, - { - "epoch": 0.8685410334346505, - "grad_norm": 2.038032293319702, - "learning_rate": 4.258114544726835e-06, - "loss": 0.40545332431793213, - "mean_token_accuracy": 0.8611703515052795, - "num_tokens": 10215716.0, - "step": 1143 - }, - { - "epoch": 0.8693009118541033, - "grad_norm": 1.9884231090545654, - "learning_rate": 4.256624924497124e-06, - "loss": 0.40085992217063904, - "mean_token_accuracy": 0.8615031242370605, - "num_tokens": 10222775.0, - "step": 1144 - }, - { - "epoch": 0.8700607902735562, - "grad_norm": 1.912842035293579, - "learning_rate": 4.25513407140042e-06, - "loss": 0.41022324562072754, - "mean_token_accuracy": 0.8459607362747192, - "num_tokens": 10229589.0, - "step": 1145 - }, - { - "epoch": 0.8708206686930091, - "grad_norm": 1.9190576076507568, - "learning_rate": 4.253641986483063e-06, - "loss": 0.5541447401046753, - "mean_token_accuracy": 0.8256468772888184, - "num_tokens": 10240633.0, - "step": 1146 - }, - { - "epoch": 0.871580547112462, - "grad_norm": 1.3742294311523438, - "learning_rate": 4.2521486707922545e-06, - "loss": 0.3680543899536133, - "mean_token_accuracy": 0.8654477596282959, - "num_tokens": 10251252.0, - "step": 1147 - }, - { - "epoch": 0.8723404255319149, - "grad_norm": 1.4438525438308716, - "learning_rate": 4.250654125376062e-06, - "loss": 0.45830875635147095, - "mean_token_accuracy": 0.8433834314346313, - "num_tokens": 10263980.0, - "step": 1148 - }, - { - "epoch": 0.8731003039513677, - "grad_norm": 2.1273653507232666, - "learning_rate": 4.249158351283414e-06, - "loss": 0.4129376709461212, - "mean_token_accuracy": 0.861556351184845, - "num_tokens": 10270426.0, - "step": 1149 - }, - { - "epoch": 0.8738601823708206, - "grad_norm": 2.598440647125244, - "learning_rate": 4.247661349564103e-06, - "loss": 0.418030709028244, - "mean_token_accuracy": 0.86553955078125, - "num_tokens": 10275493.0, - "step": 1150 - }, - { - "epoch": 0.8746200607902735, - "grad_norm": 1.6852490901947021, - "learning_rate": 4.246163121268782e-06, - "loss": 0.6403408050537109, - "mean_token_accuracy": 0.7966094017028809, - "num_tokens": 10287989.0, - "step": 1151 - }, - { - "epoch": 0.8753799392097265, - "grad_norm": 2.5013794898986816, - "learning_rate": 4.244663667448965e-06, - "loss": 0.49922505021095276, - "mean_token_accuracy": 0.8318735361099243, - "num_tokens": 10293360.0, - "step": 1152 - }, - { - "epoch": 0.8761398176291794, - "grad_norm": 1.2022709846496582, - "learning_rate": 4.243162989157027e-06, - "loss": 0.4414965510368347, - "mean_token_accuracy": 0.8338693380355835, - "num_tokens": 10310558.0, - "step": 1153 - }, - { - "epoch": 0.8768996960486323, - "grad_norm": 1.9903281927108765, - "learning_rate": 4.241661087446202e-06, - "loss": 0.4277610778808594, - "mean_token_accuracy": 0.8560749292373657, - "num_tokens": 10316983.0, - "step": 1154 - }, - { - "epoch": 0.8776595744680851, - "grad_norm": 2.104923725128174, - "learning_rate": 4.240157963370583e-06, - "loss": 0.44431713223457336, - "mean_token_accuracy": 0.8785282969474792, - "num_tokens": 10323294.0, - "step": 1155 - }, - { - "epoch": 0.878419452887538, - "grad_norm": 2.8364813327789307, - "learning_rate": 4.2386536179851175e-06, - "loss": 0.49948397278785706, - "mean_token_accuracy": 0.8305255174636841, - "num_tokens": 10327662.0, - "step": 1156 - }, - { - "epoch": 0.8791793313069909, - "grad_norm": 1.9493682384490967, - "learning_rate": 4.2371480523456156e-06, - "loss": 0.45867404341697693, - "mean_token_accuracy": 0.8373264074325562, - "num_tokens": 10335699.0, - "step": 1157 - }, - { - "epoch": 0.8799392097264438, - "grad_norm": 2.268616199493408, - "learning_rate": 4.235641267508741e-06, - "loss": 0.4547857940196991, - "mean_token_accuracy": 0.8252766132354736, - "num_tokens": 10342464.0, - "step": 1158 - }, - { - "epoch": 0.8806990881458967, - "grad_norm": 2.1334283351898193, - "learning_rate": 4.234133264532012e-06, - "loss": 0.39503124356269836, - "mean_token_accuracy": 0.8648351430892944, - "num_tokens": 10347514.0, - "step": 1159 - }, - { - "epoch": 0.8814589665653495, - "grad_norm": 1.2775357961654663, - "learning_rate": 4.232624044473805e-06, - "loss": 0.39945733547210693, - "mean_token_accuracy": 0.8369829654693604, - "num_tokens": 10363316.0, - "step": 1160 - }, - { - "epoch": 0.8822188449848024, - "grad_norm": 2.458413600921631, - "learning_rate": 4.231113608393348e-06, - "loss": 0.5020045638084412, - "mean_token_accuracy": 0.8295938968658447, - "num_tokens": 10368401.0, - "step": 1161 - }, - { - "epoch": 0.8829787234042553, - "grad_norm": 1.7464948892593384, - "learning_rate": 4.229601957350722e-06, - "loss": 0.5335392951965332, - "mean_token_accuracy": 0.8134858012199402, - "num_tokens": 10378337.0, - "step": 1162 - }, - { - "epoch": 0.8837386018237082, - "grad_norm": 3.1152119636535645, - "learning_rate": 4.228089092406863e-06, - "loss": 0.4811682105064392, - "mean_token_accuracy": 0.8460187315940857, - "num_tokens": 10382362.0, - "step": 1163 - }, - { - "epoch": 0.8844984802431611, - "grad_norm": 2.190847158432007, - "learning_rate": 4.226575014623557e-06, - "loss": 0.4428049921989441, - "mean_token_accuracy": 0.8382467031478882, - "num_tokens": 10388211.0, - "step": 1164 - }, - { - "epoch": 0.8852583586626139, - "grad_norm": 1.860153079032898, - "learning_rate": 4.225059725063444e-06, - "loss": 0.5265918970108032, - "mean_token_accuracy": 0.8181334733963013, - "num_tokens": 10398873.0, - "step": 1165 - }, - { - "epoch": 0.8860182370820668, - "grad_norm": 1.3372713327407837, - "learning_rate": 4.22354322479001e-06, - "loss": 0.43202850222587585, - "mean_token_accuracy": 0.8432420492172241, - "num_tokens": 10413158.0, - "step": 1166 - }, - { - "epoch": 0.8867781155015197, - "grad_norm": 1.3653379678726196, - "learning_rate": 4.222025514867596e-06, - "loss": 0.43780991435050964, - "mean_token_accuracy": 0.8441485166549683, - "num_tokens": 10428137.0, - "step": 1167 - }, - { - "epoch": 0.8875379939209727, - "grad_norm": 3.0230672359466553, - "learning_rate": 4.220506596361387e-06, - "loss": 0.6039337515830994, - "mean_token_accuracy": 0.8274872303009033, - "num_tokens": 10432586.0, - "step": 1168 - }, - { - "epoch": 0.8882978723404256, - "grad_norm": 2.2180392742156982, - "learning_rate": 4.218986470337419e-06, - "loss": 0.5453792810440063, - "mean_token_accuracy": 0.8127184510231018, - "num_tokens": 10439471.0, - "step": 1169 - }, - { - "epoch": 0.8890577507598785, - "grad_norm": 1.8519103527069092, - "learning_rate": 4.217465137862575e-06, - "loss": 0.5145469903945923, - "mean_token_accuracy": 0.8178654909133911, - "num_tokens": 10450471.0, - "step": 1170 - }, - { - "epoch": 0.8898176291793313, - "grad_norm": 2.034008026123047, - "learning_rate": 4.215942600004586e-06, - "loss": 0.44061461091041565, - "mean_token_accuracy": 0.8572084307670593, - "num_tokens": 10457382.0, - "step": 1171 - }, - { - "epoch": 0.8905775075987842, - "grad_norm": 3.4304304122924805, - "learning_rate": 4.214418857832025e-06, - "loss": 0.44397830963134766, - "mean_token_accuracy": 0.842149019241333, - "num_tokens": 10460650.0, - "step": 1172 - }, - { - "epoch": 0.8913373860182371, - "grad_norm": 1.9021750688552856, - "learning_rate": 4.212893912414316e-06, - "loss": 0.3769867420196533, - "mean_token_accuracy": 0.8806171417236328, - "num_tokens": 10468214.0, - "step": 1173 - }, - { - "epoch": 0.89209726443769, - "grad_norm": 1.9704062938690186, - "learning_rate": 4.211367764821722e-06, - "loss": 0.5501819849014282, - "mean_token_accuracy": 0.8176811337471008, - "num_tokens": 10476739.0, - "step": 1174 - }, - { - "epoch": 0.8928571428571429, - "grad_norm": 1.4350415468215942, - "learning_rate": 4.209840416125353e-06, - "loss": 0.41897401213645935, - "mean_token_accuracy": 0.8498011827468872, - "num_tokens": 10491769.0, - "step": 1175 - }, - { - "epoch": 0.8936170212765957, - "grad_norm": 3.8237783908843994, - "learning_rate": 4.208311867397162e-06, - "loss": 0.5296977162361145, - "mean_token_accuracy": 0.8168715834617615, - "num_tokens": 10494958.0, - "step": 1176 - }, - { - "epoch": 0.8943768996960486, - "grad_norm": 2.04784893989563, - "learning_rate": 4.206782119709942e-06, - "loss": 0.476105272769928, - "mean_token_accuracy": 0.834011435508728, - "num_tokens": 10502077.0, - "step": 1177 - }, - { - "epoch": 0.8951367781155015, - "grad_norm": 1.8839610815048218, - "learning_rate": 4.205251174137329e-06, - "loss": 0.49628815054893494, - "mean_token_accuracy": 0.8212119936943054, - "num_tokens": 10510077.0, - "step": 1178 - }, - { - "epoch": 0.8958966565349544, - "grad_norm": 1.2100634574890137, - "learning_rate": 4.2037190317538e-06, - "loss": 0.4931519329547882, - "mean_token_accuracy": 0.8170043230056763, - "num_tokens": 10528373.0, - "step": 1179 - }, - { - "epoch": 0.8966565349544073, - "grad_norm": 1.884637713432312, - "learning_rate": 4.202185693634671e-06, - "loss": 0.4913347363471985, - "mean_token_accuracy": 0.8234949707984924, - "num_tokens": 10537108.0, - "step": 1180 - }, - { - "epoch": 0.8974164133738601, - "grad_norm": 1.5062434673309326, - "learning_rate": 4.200651160856099e-06, - "loss": 0.4160492420196533, - "mean_token_accuracy": 0.845937192440033, - "num_tokens": 10547577.0, - "step": 1181 - }, - { - "epoch": 0.898176291793313, - "grad_norm": 2.331169605255127, - "learning_rate": 4.1991154344950755e-06, - "loss": 0.6532632112503052, - "mean_token_accuracy": 0.7743191123008728, - "num_tokens": 10556328.0, - "step": 1182 - }, - { - "epoch": 0.898936170212766, - "grad_norm": 1.3538362979888916, - "learning_rate": 4.197578515629435e-06, - "loss": 0.4437566101551056, - "mean_token_accuracy": 0.8427901268005371, - "num_tokens": 10570026.0, - "step": 1183 - }, - { - "epoch": 0.8996960486322189, - "grad_norm": 2.3828957080841064, - "learning_rate": 4.196040405337846e-06, - "loss": 0.6185290217399597, - "mean_token_accuracy": 0.7969824075698853, - "num_tokens": 10576465.0, - "step": 1184 - }, - { - "epoch": 0.9004559270516718, - "grad_norm": 2.4759042263031006, - "learning_rate": 4.194501104699813e-06, - "loss": 0.46489226818084717, - "mean_token_accuracy": 0.8472316265106201, - "num_tokens": 10582034.0, - "step": 1185 - }, - { - "epoch": 0.9012158054711246, - "grad_norm": 1.9215164184570312, - "learning_rate": 4.192960614795676e-06, - "loss": 0.48001551628112793, - "mean_token_accuracy": 0.8371596336364746, - "num_tokens": 10590556.0, - "step": 1186 - }, - { - "epoch": 0.9019756838905775, - "grad_norm": 2.2717080116271973, - "learning_rate": 4.19141893670661e-06, - "loss": 0.40083563327789307, - "mean_token_accuracy": 0.8464195728302002, - "num_tokens": 10595661.0, - "step": 1187 - }, - { - "epoch": 0.9027355623100304, - "grad_norm": 2.187122344970703, - "learning_rate": 4.189876071514624e-06, - "loss": 0.4942901134490967, - "mean_token_accuracy": 0.8186990022659302, - "num_tokens": 10603366.0, - "step": 1188 - }, - { - "epoch": 0.9034954407294833, - "grad_norm": 1.542414665222168, - "learning_rate": 4.188332020302561e-06, - "loss": 0.4731982946395874, - "mean_token_accuracy": 0.8487229347229004, - "num_tokens": 10616203.0, - "step": 1189 - }, - { - "epoch": 0.9042553191489362, - "grad_norm": 0.9957579970359802, - "learning_rate": 4.186786784154096e-06, - "loss": 0.33211836218833923, - "mean_token_accuracy": 0.870644748210907, - "num_tokens": 10633294.0, - "step": 1190 - }, - { - "epoch": 0.9050151975683891, - "grad_norm": 2.593867540359497, - "learning_rate": 4.1852403641537344e-06, - "loss": 0.6825464963912964, - "mean_token_accuracy": 0.7716869115829468, - "num_tokens": 10640615.0, - "step": 1191 - }, - { - "epoch": 0.9057750759878419, - "grad_norm": 2.0424516201019287, - "learning_rate": 4.183692761386813e-06, - "loss": 0.5672709941864014, - "mean_token_accuracy": 0.7973801493644714, - "num_tokens": 10649845.0, - "step": 1192 - }, - { - "epoch": 0.9065349544072948, - "grad_norm": 1.429018259048462, - "learning_rate": 4.1821439769395e-06, - "loss": 0.5427846908569336, - "mean_token_accuracy": 0.8200292587280273, - "num_tokens": 10665898.0, - "step": 1193 - }, - { - "epoch": 0.9072948328267477, - "grad_norm": 1.9764264822006226, - "learning_rate": 4.180594011898791e-06, - "loss": 0.4784567356109619, - "mean_token_accuracy": 0.82924485206604, - "num_tokens": 10673595.0, - "step": 1194 - }, - { - "epoch": 0.9080547112462006, - "grad_norm": 1.4004309177398682, - "learning_rate": 4.1790428673525104e-06, - "loss": 0.4791432023048401, - "mean_token_accuracy": 0.8334879875183105, - "num_tokens": 10687892.0, - "step": 1195 - }, - { - "epoch": 0.9088145896656535, - "grad_norm": 2.2207727432250977, - "learning_rate": 4.177490544389313e-06, - "loss": 0.5089365243911743, - "mean_token_accuracy": 0.8270776271820068, - "num_tokens": 10694911.0, - "step": 1196 - }, - { - "epoch": 0.9095744680851063, - "grad_norm": 2.2890450954437256, - "learning_rate": 4.175937044098678e-06, - "loss": 0.5152267813682556, - "mean_token_accuracy": 0.8527299165725708, - "num_tokens": 10700512.0, - "step": 1197 - }, - { - "epoch": 0.9103343465045592, - "grad_norm": 1.7938050031661987, - "learning_rate": 4.1743823675709115e-06, - "loss": 0.3507300615310669, - "mean_token_accuracy": 0.8694599866867065, - "num_tokens": 10707953.0, - "step": 1198 - }, - { - "epoch": 0.9110942249240122, - "grad_norm": 1.4368808269500732, - "learning_rate": 4.172826515897146e-06, - "loss": 0.407418429851532, - "mean_token_accuracy": 0.8432893753051758, - "num_tokens": 10717485.0, - "step": 1199 - }, - { - "epoch": 0.9118541033434651, - "grad_norm": 1.735339879989624, - "learning_rate": 4.171269490169337e-06, - "loss": 0.46996885538101196, - "mean_token_accuracy": 0.8331948518753052, - "num_tokens": 10726160.0, - "step": 1200 - }, - { - "epoch": 0.912613981762918, - "grad_norm": 1.7859221696853638, - "learning_rate": 4.1697112914802665e-06, - "loss": 0.5325199365615845, - "mean_token_accuracy": 0.8179605007171631, - "num_tokens": 10736284.0, - "step": 1201 - }, - { - "epoch": 0.9133738601823708, - "grad_norm": 2.6394896507263184, - "learning_rate": 4.168151920923536e-06, - "loss": 0.4039744734764099, - "mean_token_accuracy": 0.8545527458190918, - "num_tokens": 10740673.0, - "step": 1202 - }, - { - "epoch": 0.9141337386018237, - "grad_norm": 1.910988211631775, - "learning_rate": 4.1665913795935755e-06, - "loss": 0.5190291404724121, - "mean_token_accuracy": 0.8203921318054199, - "num_tokens": 10751946.0, - "step": 1203 - }, - { - "epoch": 0.9148936170212766, - "grad_norm": 3.0006964206695557, - "learning_rate": 4.16502966858563e-06, - "loss": 0.5856777429580688, - "mean_token_accuracy": 0.8061224222183228, - "num_tokens": 10756795.0, - "step": 1204 - }, - { - "epoch": 0.9156534954407295, - "grad_norm": 1.7396167516708374, - "learning_rate": 4.163466788995768e-06, - "loss": 0.54935222864151, - "mean_token_accuracy": 0.8052443265914917, - "num_tokens": 10767202.0, - "step": 1205 - }, - { - "epoch": 0.9164133738601824, - "grad_norm": 2.143735885620117, - "learning_rate": 4.161902741920881e-06, - "loss": 0.5020298361778259, - "mean_token_accuracy": 0.8249630928039551, - "num_tokens": 10774329.0, - "step": 1206 - }, - { - "epoch": 0.9171732522796353, - "grad_norm": 2.8871893882751465, - "learning_rate": 4.160337528458676e-06, - "loss": 0.5154489278793335, - "mean_token_accuracy": 0.8276848793029785, - "num_tokens": 10778929.0, - "step": 1207 - }, - { - "epoch": 0.9179331306990881, - "grad_norm": 1.4642788171768188, - "learning_rate": 4.15877114970768e-06, - "loss": 0.5033774375915527, - "mean_token_accuracy": 0.8296241164207458, - "num_tokens": 10790928.0, - "step": 1208 - }, - { - "epoch": 0.918693009118541, - "grad_norm": 1.8313497304916382, - "learning_rate": 4.1572036067672386e-06, - "loss": 0.5674909353256226, - "mean_token_accuracy": 0.7975562214851379, - "num_tokens": 10801372.0, - "step": 1209 - }, - { - "epoch": 0.9194528875379939, - "grad_norm": 2.005958080291748, - "learning_rate": 4.155634900737513e-06, - "loss": 0.5557019114494324, - "mean_token_accuracy": 0.8141391277313232, - "num_tokens": 10809150.0, - "step": 1210 - }, - { - "epoch": 0.9202127659574468, - "grad_norm": 2.333519697189331, - "learning_rate": 4.154065032719482e-06, - "loss": 0.6990420818328857, - "mean_token_accuracy": 0.7565394043922424, - "num_tokens": 10816612.0, - "step": 1211 - }, - { - "epoch": 0.9209726443768997, - "grad_norm": 1.4472655057907104, - "learning_rate": 4.152494003814939e-06, - "loss": 0.541398286819458, - "mean_token_accuracy": 0.8027358055114746, - "num_tokens": 10833840.0, - "step": 1212 - }, - { - "epoch": 0.9217325227963525, - "grad_norm": 1.6183619499206543, - "learning_rate": 4.150921815126493e-06, - "loss": 0.6096762418746948, - "mean_token_accuracy": 0.7994354963302612, - "num_tokens": 10846367.0, - "step": 1213 - }, - { - "epoch": 0.9224924012158054, - "grad_norm": 2.614919900894165, - "learning_rate": 4.149348467757566e-06, - "loss": 0.41846764087677, - "mean_token_accuracy": 0.8555068969726562, - "num_tokens": 10850836.0, - "step": 1214 - }, - { - "epoch": 0.9232522796352584, - "grad_norm": 1.4419831037521362, - "learning_rate": 4.147773962812393e-06, - "loss": 0.4139535427093506, - "mean_token_accuracy": 0.845671534538269, - "num_tokens": 10864228.0, - "step": 1215 - }, - { - "epoch": 0.9240121580547113, - "grad_norm": 2.3868865966796875, - "learning_rate": 4.146198301396025e-06, - "loss": 0.3357275128364563, - "mean_token_accuracy": 0.8829520344734192, - "num_tokens": 10868920.0, - "step": 1216 - }, - { - "epoch": 0.9247720364741642, - "grad_norm": 1.7685474157333374, - "learning_rate": 4.14462148461432e-06, - "loss": 0.45333072543144226, - "mean_token_accuracy": 0.8505891561508179, - "num_tokens": 10877286.0, - "step": 1217 - }, - { - "epoch": 0.925531914893617, - "grad_norm": 1.7627625465393066, - "learning_rate": 4.143043513573949e-06, - "loss": 0.5028705596923828, - "mean_token_accuracy": 0.825471043586731, - "num_tokens": 10887047.0, - "step": 1218 - }, - { - "epoch": 0.9262917933130699, - "grad_norm": 1.3168725967407227, - "learning_rate": 4.141464389382392e-06, - "loss": 0.5494637489318848, - "mean_token_accuracy": 0.8121747970581055, - "num_tokens": 10903599.0, - "step": 1219 - }, - { - "epoch": 0.9270516717325228, - "grad_norm": 2.5180399417877197, - "learning_rate": 4.13988411314794e-06, - "loss": 0.6134277582168579, - "mean_token_accuracy": 0.7983006834983826, - "num_tokens": 10909791.0, - "step": 1220 - }, - { - "epoch": 0.9278115501519757, - "grad_norm": 1.1889166831970215, - "learning_rate": 4.13830268597969e-06, - "loss": 0.36713096499443054, - "mean_token_accuracy": 0.8416121006011963, - "num_tokens": 10925794.0, - "step": 1221 - }, - { - "epoch": 0.9285714285714286, - "grad_norm": 2.142422676086426, - "learning_rate": 4.136720108987552e-06, - "loss": 0.4427933096885681, - "mean_token_accuracy": 0.8427745699882507, - "num_tokens": 10931622.0, - "step": 1222 - }, - { - "epoch": 0.9293313069908815, - "grad_norm": 1.908564567565918, - "learning_rate": 4.1351363832822364e-06, - "loss": 0.5088109374046326, - "mean_token_accuracy": 0.8309272527694702, - "num_tokens": 10940843.0, - "step": 1223 - }, - { - "epoch": 0.9300911854103343, - "grad_norm": 1.2862322330474854, - "learning_rate": 4.133551509975264e-06, - "loss": 0.3963761329650879, - "mean_token_accuracy": 0.8602159023284912, - "num_tokens": 10954481.0, - "step": 1224 - }, - { - "epoch": 0.9308510638297872, - "grad_norm": 1.5876200199127197, - "learning_rate": 4.13196549017896e-06, - "loss": 0.4311184287071228, - "mean_token_accuracy": 0.8460899591445923, - "num_tokens": 10963501.0, - "step": 1225 - }, - { - "epoch": 0.9316109422492401, - "grad_norm": 2.459878444671631, - "learning_rate": 4.130378325006453e-06, - "loss": 0.5016295313835144, - "mean_token_accuracy": 0.8125218152999878, - "num_tokens": 10968850.0, - "step": 1226 - }, - { - "epoch": 0.932370820668693, - "grad_norm": 2.059718370437622, - "learning_rate": 4.128790015571679e-06, - "loss": 0.48982277512550354, - "mean_token_accuracy": 0.8327049016952515, - "num_tokens": 10976642.0, - "step": 1227 - }, - { - "epoch": 0.9331306990881459, - "grad_norm": 1.3719185590744019, - "learning_rate": 4.127200562989372e-06, - "loss": 0.38778752088546753, - "mean_token_accuracy": 0.8623501062393188, - "num_tokens": 10988703.0, - "step": 1228 - }, - { - "epoch": 0.9338905775075987, - "grad_norm": 1.302140712738037, - "learning_rate": 4.125609968375073e-06, - "loss": 0.4887842535972595, - "mean_token_accuracy": 0.8322232961654663, - "num_tokens": 11005981.0, - "step": 1229 - }, - { - "epoch": 0.9346504559270516, - "grad_norm": 1.819624423980713, - "learning_rate": 4.12401823284512e-06, - "loss": 0.49825209379196167, - "mean_token_accuracy": 0.8278916478157043, - "num_tokens": 11014145.0, - "step": 1230 - }, - { - "epoch": 0.9354103343465046, - "grad_norm": 1.2762807607650757, - "learning_rate": 4.122425357516658e-06, - "loss": 0.433994323015213, - "mean_token_accuracy": 0.853028416633606, - "num_tokens": 11029232.0, - "step": 1231 - }, - { - "epoch": 0.9361702127659575, - "grad_norm": 2.2171671390533447, - "learning_rate": 4.1208313435076255e-06, - "loss": 0.38436949253082275, - "mean_token_accuracy": 0.8616260290145874, - "num_tokens": 11034743.0, - "step": 1232 - }, - { - "epoch": 0.9369300911854104, - "grad_norm": 1.355879545211792, - "learning_rate": 4.119236191936764e-06, - "loss": 0.5378084182739258, - "mean_token_accuracy": 0.8256701231002808, - "num_tokens": 11048149.0, - "step": 1233 - }, - { - "epoch": 0.9376899696048632, - "grad_norm": 2.66812801361084, - "learning_rate": 4.117639903923611e-06, - "loss": 0.5236451625823975, - "mean_token_accuracy": 0.8431973457336426, - "num_tokens": 11052295.0, - "step": 1234 - }, - { - "epoch": 0.9384498480243161, - "grad_norm": 1.5740545988082886, - "learning_rate": 4.116042480588505e-06, - "loss": 0.44322824478149414, - "mean_token_accuracy": 0.8436908721923828, - "num_tokens": 11062066.0, - "step": 1235 - }, - { - "epoch": 0.939209726443769, - "grad_norm": 1.230706810951233, - "learning_rate": 4.114443923052577e-06, - "loss": 0.3325323462486267, - "mean_token_accuracy": 0.8674666881561279, - "num_tokens": 11074300.0, - "step": 1236 - }, - { - "epoch": 0.9399696048632219, - "grad_norm": 1.9870070219039917, - "learning_rate": 4.112844232437757e-06, - "loss": 0.5711548328399658, - "mean_token_accuracy": 0.8081738948822021, - "num_tokens": 11082297.0, - "step": 1237 - }, - { - "epoch": 0.9407294832826748, - "grad_norm": 1.3020970821380615, - "learning_rate": 4.11124340986677e-06, - "loss": 0.4187922477722168, - "mean_token_accuracy": 0.8566171526908875, - "num_tokens": 11096810.0, - "step": 1238 - }, - { - "epoch": 0.9414893617021277, - "grad_norm": 2.1399197578430176, - "learning_rate": 4.109641456463135e-06, - "loss": 0.5293116569519043, - "mean_token_accuracy": 0.8176157474517822, - "num_tokens": 11102761.0, - "step": 1239 - }, - { - "epoch": 0.9422492401215805, - "grad_norm": 1.3503763675689697, - "learning_rate": 4.108038373351163e-06, - "loss": 0.4907652735710144, - "mean_token_accuracy": 0.8204987049102783, - "num_tokens": 11118480.0, - "step": 1240 - }, - { - "epoch": 0.9430091185410334, - "grad_norm": 1.9571399688720703, - "learning_rate": 4.106434161655962e-06, - "loss": 0.4709656536579132, - "mean_token_accuracy": 0.8371885418891907, - "num_tokens": 11126265.0, - "step": 1241 - }, - { - "epoch": 0.9437689969604863, - "grad_norm": 2.1277313232421875, - "learning_rate": 4.104828822503427e-06, - "loss": 0.4010283350944519, - "mean_token_accuracy": 0.8586333990097046, - "num_tokens": 11133022.0, - "step": 1242 - }, - { - "epoch": 0.9445288753799392, - "grad_norm": 1.6745036840438843, - "learning_rate": 4.103222357020248e-06, - "loss": 0.562545657157898, - "mean_token_accuracy": 0.8052060604095459, - "num_tokens": 11145255.0, - "step": 1243 - }, - { - "epoch": 0.9452887537993921, - "grad_norm": 2.3616299629211426, - "learning_rate": 4.101614766333904e-06, - "loss": 0.5878340601921082, - "mean_token_accuracy": 0.796745777130127, - "num_tokens": 11152020.0, - "step": 1244 - }, - { - "epoch": 0.9460486322188449, - "grad_norm": 1.6182078123092651, - "learning_rate": 4.100006051572664e-06, - "loss": 0.5357589721679688, - "mean_token_accuracy": 0.8089962005615234, - "num_tokens": 11163112.0, - "step": 1245 - }, - { - "epoch": 0.9468085106382979, - "grad_norm": 1.911770224571228, - "learning_rate": 4.098396213865587e-06, - "loss": 0.49805426597595215, - "mean_token_accuracy": 0.8289647102355957, - "num_tokens": 11171768.0, - "step": 1246 - }, - { - "epoch": 0.9475683890577508, - "grad_norm": 1.649155616760254, - "learning_rate": 4.096785254342518e-06, - "loss": 0.5756166577339172, - "mean_token_accuracy": 0.807680606842041, - "num_tokens": 11183527.0, - "step": 1247 - }, - { - "epoch": 0.9483282674772037, - "grad_norm": 1.8922761678695679, - "learning_rate": 4.095173174134091e-06, - "loss": 0.44688963890075684, - "mean_token_accuracy": 0.8375608921051025, - "num_tokens": 11191494.0, - "step": 1248 - }, - { - "epoch": 0.9490881458966566, - "grad_norm": 2.9044547080993652, - "learning_rate": 4.093559974371725e-06, - "loss": 0.48609739542007446, - "mean_token_accuracy": 0.8404892086982727, - "num_tokens": 11195837.0, - "step": 1249 - }, - { - "epoch": 0.9498480243161094, - "grad_norm": 2.287506580352783, - "learning_rate": 4.091945656187626e-06, - "loss": 0.5260225534439087, - "mean_token_accuracy": 0.8181945085525513, - "num_tokens": 11202174.0, - "step": 1250 - }, - { - "epoch": 0.9506079027355623, - "grad_norm": 1.7908886671066284, - "learning_rate": 4.090330220714785e-06, - "loss": 0.4207724928855896, - "mean_token_accuracy": 0.8616912364959717, - "num_tokens": 11209995.0, - "step": 1251 - }, - { - "epoch": 0.9513677811550152, - "grad_norm": 2.905418634414673, - "learning_rate": 4.0887136690869774e-06, - "loss": 0.4209241271018982, - "mean_token_accuracy": 0.8561323285102844, - "num_tokens": 11213799.0, - "step": 1252 - }, - { - "epoch": 0.9521276595744681, - "grad_norm": 2.814150333404541, - "learning_rate": 4.08709600243876e-06, - "loss": 0.36855608224868774, - "mean_token_accuracy": 0.8764539361000061, - "num_tokens": 11217643.0, - "step": 1253 - }, - { - "epoch": 0.952887537993921, - "grad_norm": 1.9385707378387451, - "learning_rate": 4.0854772219054735e-06, - "loss": 0.531031608581543, - "mean_token_accuracy": 0.80600905418396, - "num_tokens": 11225871.0, - "step": 1254 - }, - { - "epoch": 0.9536474164133738, - "grad_norm": 2.103058099746704, - "learning_rate": 4.083857328623243e-06, - "loss": 0.4576364755630493, - "mean_token_accuracy": 0.8447524905204773, - "num_tokens": 11231829.0, - "step": 1255 - }, - { - "epoch": 0.9544072948328267, - "grad_norm": 1.7518818378448486, - "learning_rate": 4.082236323728969e-06, - "loss": 0.5386767983436584, - "mean_token_accuracy": 0.8055596351623535, - "num_tokens": 11240977.0, - "step": 1256 - }, - { - "epoch": 0.9551671732522796, - "grad_norm": 1.8434966802597046, - "learning_rate": 4.0806142083603365e-06, - "loss": 0.5415925979614258, - "mean_token_accuracy": 0.809962272644043, - "num_tokens": 11249616.0, - "step": 1257 - }, - { - "epoch": 0.9559270516717325, - "grad_norm": 1.7341015338897705, - "learning_rate": 4.078990983655807e-06, - "loss": 0.4621101915836334, - "mean_token_accuracy": 0.8330386877059937, - "num_tokens": 11258616.0, - "step": 1258 - }, - { - "epoch": 0.9566869300911854, - "grad_norm": 1.8589727878570557, - "learning_rate": 4.077366650754624e-06, - "loss": 0.4031238555908203, - "mean_token_accuracy": 0.842434287071228, - "num_tokens": 11266006.0, - "step": 1259 - }, - { - "epoch": 0.9574468085106383, - "grad_norm": 1.657175898551941, - "learning_rate": 4.075741210796806e-06, - "loss": 0.41686388850212097, - "mean_token_accuracy": 0.8443650007247925, - "num_tokens": 11275601.0, - "step": 1260 - }, - { - "epoch": 0.9582066869300911, - "grad_norm": 2.4303717613220215, - "learning_rate": 4.07411466492315e-06, - "loss": 0.4554435610771179, - "mean_token_accuracy": 0.853043794631958, - "num_tokens": 11280650.0, - "step": 1261 - }, - { - "epoch": 0.958966565349544, - "grad_norm": 2.3653745651245117, - "learning_rate": 4.072487014275228e-06, - "loss": 0.4304995536804199, - "mean_token_accuracy": 0.8462260961532593, - "num_tokens": 11285637.0, - "step": 1262 - }, - { - "epoch": 0.959726443768997, - "grad_norm": 1.6689718961715698, - "learning_rate": 4.070858259995388e-06, - "loss": 0.5290807485580444, - "mean_token_accuracy": 0.8176917433738708, - "num_tokens": 11299110.0, - "step": 1263 - }, - { - "epoch": 0.9604863221884499, - "grad_norm": 2.103879451751709, - "learning_rate": 4.069228403226751e-06, - "loss": 0.4620879888534546, - "mean_token_accuracy": 0.835270345211029, - "num_tokens": 11305564.0, - "step": 1264 - }, - { - "epoch": 0.9612462006079028, - "grad_norm": 2.139012575149536, - "learning_rate": 4.067597445113216e-06, - "loss": 0.5143396258354187, - "mean_token_accuracy": 0.8191739320755005, - "num_tokens": 11311870.0, - "step": 1265 - }, - { - "epoch": 0.9620060790273556, - "grad_norm": 1.3971210718154907, - "learning_rate": 4.06596538679945e-06, - "loss": 0.472080260515213, - "mean_token_accuracy": 0.8321092128753662, - "num_tokens": 11323970.0, - "step": 1266 - }, - { - "epoch": 0.9627659574468085, - "grad_norm": 1.4965174198150635, - "learning_rate": 4.064332229430895e-06, - "loss": 0.359701007604599, - "mean_token_accuracy": 0.8903120160102844, - "num_tokens": 11333412.0, - "step": 1267 - }, - { - "epoch": 0.9635258358662614, - "grad_norm": 1.1898726224899292, - "learning_rate": 4.062697974153764e-06, - "loss": 0.3423798084259033, - "mean_token_accuracy": 0.8661491870880127, - "num_tokens": 11347657.0, - "step": 1268 - }, - { - "epoch": 0.9642857142857143, - "grad_norm": 1.4952168464660645, - "learning_rate": 4.06106262211504e-06, - "loss": 0.4214417338371277, - "mean_token_accuracy": 0.8362159729003906, - "num_tokens": 11357786.0, - "step": 1269 - }, - { - "epoch": 0.9650455927051672, - "grad_norm": 1.7949583530426025, - "learning_rate": 4.059426174462476e-06, - "loss": 0.59087735414505, - "mean_token_accuracy": 0.7965556979179382, - "num_tokens": 11370561.0, - "step": 1270 - }, - { - "epoch": 0.96580547112462, - "grad_norm": 1.8973214626312256, - "learning_rate": 4.057788632344594e-06, - "loss": 0.47525322437286377, - "mean_token_accuracy": 0.8317365050315857, - "num_tokens": 11378507.0, - "step": 1271 - }, - { - "epoch": 0.9665653495440729, - "grad_norm": 1.8665250539779663, - "learning_rate": 4.056149996910683e-06, - "loss": 0.3537125587463379, - "mean_token_accuracy": 0.8921569585800171, - "num_tokens": 11385186.0, - "step": 1272 - }, - { - "epoch": 0.9673252279635258, - "grad_norm": 1.5072317123413086, - "learning_rate": 4.054510269310803e-06, - "loss": 0.5145624876022339, - "mean_token_accuracy": 0.8265488147735596, - "num_tokens": 11397125.0, - "step": 1273 - }, - { - "epoch": 0.9680851063829787, - "grad_norm": 1.520525574684143, - "learning_rate": 4.052869450695776e-06, - "loss": 0.44322293996810913, - "mean_token_accuracy": 0.8403642177581787, - "num_tokens": 11409919.0, - "step": 1274 - }, - { - "epoch": 0.9688449848024316, - "grad_norm": 1.3764475584030151, - "learning_rate": 4.051227542217192e-06, - "loss": 0.5774400234222412, - "mean_token_accuracy": 0.804118275642395, - "num_tokens": 11425900.0, - "step": 1275 - }, - { - "epoch": 0.9696048632218845, - "grad_norm": 1.3922648429870605, - "learning_rate": 4.049584545027406e-06, - "loss": 0.42727944254875183, - "mean_token_accuracy": 0.8654505014419556, - "num_tokens": 11438787.0, - "step": 1276 - }, - { - "epoch": 0.9703647416413373, - "grad_norm": 1.8505840301513672, - "learning_rate": 4.047940460279537e-06, - "loss": 0.490803062915802, - "mean_token_accuracy": 0.8340574502944946, - "num_tokens": 11447997.0, - "step": 1277 - }, - { - "epoch": 0.9711246200607903, - "grad_norm": 2.28271222114563, - "learning_rate": 4.046295289127466e-06, - "loss": 0.588828444480896, - "mean_token_accuracy": 0.833497166633606, - "num_tokens": 11454072.0, - "step": 1278 - }, - { - "epoch": 0.9718844984802432, - "grad_norm": 2.4242560863494873, - "learning_rate": 4.044649032725836e-06, - "loss": 0.5128831267356873, - "mean_token_accuracy": 0.8225122690200806, - "num_tokens": 11460211.0, - "step": 1279 - }, - { - "epoch": 0.9726443768996961, - "grad_norm": 2.1738455295562744, - "learning_rate": 4.0430016922300566e-06, - "loss": 0.441631942987442, - "mean_token_accuracy": 0.841723620891571, - "num_tokens": 11466814.0, - "step": 1280 - }, - { - "epoch": 0.973404255319149, - "grad_norm": 2.541599988937378, - "learning_rate": 4.0413532687962926e-06, - "loss": 0.5062629580497742, - "mean_token_accuracy": 0.8013502359390259, - "num_tokens": 11472371.0, - "step": 1281 - }, - { - "epoch": 0.9741641337386018, - "grad_norm": 2.8011014461517334, - "learning_rate": 4.039703763581472e-06, - "loss": 0.5061966776847839, - "mean_token_accuracy": 0.829810380935669, - "num_tokens": 11476672.0, - "step": 1282 - }, - { - "epoch": 0.9749240121580547, - "grad_norm": 2.4505462646484375, - "learning_rate": 4.038053177743279e-06, - "loss": 0.43407535552978516, - "mean_token_accuracy": 0.8428469896316528, - "num_tokens": 11481297.0, - "step": 1283 - }, - { - "epoch": 0.9756838905775076, - "grad_norm": 2.1618378162384033, - "learning_rate": 4.036401512440161e-06, - "loss": 0.6056663393974304, - "mean_token_accuracy": 0.7977457642555237, - "num_tokens": 11488657.0, - "step": 1284 - }, - { - "epoch": 0.9764437689969605, - "grad_norm": 1.9192147254943848, - "learning_rate": 4.034748768831319e-06, - "loss": 0.524390697479248, - "mean_token_accuracy": 0.8120636940002441, - "num_tokens": 11496485.0, - "step": 1285 - }, - { - "epoch": 0.9772036474164134, - "grad_norm": 2.766435384750366, - "learning_rate": 4.033094948076713e-06, - "loss": 0.5494908690452576, - "mean_token_accuracy": 0.8141890168190002, - "num_tokens": 11501341.0, - "step": 1286 - }, - { - "epoch": 0.9779635258358662, - "grad_norm": 1.3519539833068848, - "learning_rate": 4.031440051337056e-06, - "loss": 0.4339691400527954, - "mean_token_accuracy": 0.8400131464004517, - "num_tokens": 11512843.0, - "step": 1287 - }, - { - "epoch": 0.9787234042553191, - "grad_norm": 1.2492141723632812, - "learning_rate": 4.02978407977382e-06, - "loss": 0.4433518052101135, - "mean_token_accuracy": 0.8432940244674683, - "num_tokens": 11530227.0, - "step": 1288 - }, - { - "epoch": 0.979483282674772, - "grad_norm": 1.6597715616226196, - "learning_rate": 4.02812703454923e-06, - "loss": 0.602222204208374, - "mean_token_accuracy": 0.786965548992157, - "num_tokens": 11543955.0, - "step": 1289 - }, - { - "epoch": 0.9802431610942249, - "grad_norm": 1.6621816158294678, - "learning_rate": 4.026468916826262e-06, - "loss": 0.35662174224853516, - "mean_token_accuracy": 0.8716133832931519, - "num_tokens": 11552064.0, - "step": 1290 - }, - { - "epoch": 0.9810030395136778, - "grad_norm": 4.539844989776611, - "learning_rate": 4.024809727768648e-06, - "loss": 0.543423593044281, - "mean_token_accuracy": 0.8293194770812988, - "num_tokens": 11555595.0, - "step": 1291 - }, - { - "epoch": 0.9817629179331308, - "grad_norm": 1.4026556015014648, - "learning_rate": 4.023149468540871e-06, - "loss": 0.4301237165927887, - "mean_token_accuracy": 0.8358224630355835, - "num_tokens": 11572275.0, - "step": 1292 - }, - { - "epoch": 0.9825227963525835, - "grad_norm": 1.611262321472168, - "learning_rate": 4.021488140308165e-06, - "loss": 0.5378580689430237, - "mean_token_accuracy": 0.8173760771751404, - "num_tokens": 11584299.0, - "step": 1293 - }, - { - "epoch": 0.9832826747720365, - "grad_norm": 4.138631820678711, - "learning_rate": 4.019825744236514e-06, - "loss": 0.40272149443626404, - "mean_token_accuracy": 0.8648844957351685, - "num_tokens": 11586705.0, - "step": 1294 - }, - { - "epoch": 0.9840425531914894, - "grad_norm": 3.177703619003296, - "learning_rate": 4.018162281492651e-06, - "loss": 0.5320103168487549, - "mean_token_accuracy": 0.8250276446342468, - "num_tokens": 11590689.0, - "step": 1295 - }, - { - "epoch": 0.9848024316109423, - "grad_norm": 2.727597713470459, - "learning_rate": 4.016497753244058e-06, - "loss": 0.5662774443626404, - "mean_token_accuracy": 0.8074625730514526, - "num_tokens": 11596092.0, - "step": 1296 - }, - { - "epoch": 0.9855623100303952, - "grad_norm": 1.485139012336731, - "learning_rate": 4.014832160658966e-06, - "loss": 0.5414972305297852, - "mean_token_accuracy": 0.8082696199417114, - "num_tokens": 11613785.0, - "step": 1297 - }, - { - "epoch": 0.986322188449848, - "grad_norm": 2.4025990962982178, - "learning_rate": 4.013165504906352e-06, - "loss": 0.6556503772735596, - "mean_token_accuracy": 0.7785214781761169, - "num_tokens": 11620421.0, - "step": 1298 - }, - { - "epoch": 0.9870820668693009, - "grad_norm": 1.878273606300354, - "learning_rate": 4.011497787155938e-06, - "loss": 0.4221133887767792, - "mean_token_accuracy": 0.850035548210144, - "num_tokens": 11627998.0, - "step": 1299 - }, - { - "epoch": 0.9878419452887538, - "grad_norm": 2.0430715084075928, - "learning_rate": 4.009829008578192e-06, - "loss": 0.5205984711647034, - "mean_token_accuracy": 0.819183349609375, - "num_tokens": 11636279.0, - "step": 1300 - }, - { - "epoch": 0.9886018237082067, - "grad_norm": 3.4769439697265625, - "learning_rate": 4.00815917034433e-06, - "loss": 0.5449948310852051, - "mean_token_accuracy": 0.8240023851394653, - "num_tokens": 11639638.0, - "step": 1301 - }, - { - "epoch": 0.9893617021276596, - "grad_norm": 2.4783987998962402, - "learning_rate": 4.006488273626307e-06, - "loss": 0.4316832423210144, - "mean_token_accuracy": 0.8474695086479187, - "num_tokens": 11645463.0, - "step": 1302 - }, - { - "epoch": 0.9901215805471124, - "grad_norm": 1.881475567817688, - "learning_rate": 4.004816319596822e-06, - "loss": 0.5157331824302673, - "mean_token_accuracy": 0.826042652130127, - "num_tokens": 11653955.0, - "step": 1303 - }, - { - "epoch": 0.9908814589665653, - "grad_norm": 2.6569254398345947, - "learning_rate": 4.003143309429317e-06, - "loss": 0.46492767333984375, - "mean_token_accuracy": 0.8320850133895874, - "num_tokens": 11659357.0, - "step": 1304 - }, - { - "epoch": 0.9916413373860182, - "grad_norm": 2.4917593002319336, - "learning_rate": 4.0014692442979756e-06, - "loss": 0.459585040807724, - "mean_token_accuracy": 0.8457611799240112, - "num_tokens": 11664207.0, - "step": 1305 - }, - { - "epoch": 0.9924012158054711, - "grad_norm": 2.6885526180267334, - "learning_rate": 3.999794125377721e-06, - "loss": 0.4677402973175049, - "mean_token_accuracy": 0.8307361602783203, - "num_tokens": 11668879.0, - "step": 1306 - }, - { - "epoch": 0.993161094224924, - "grad_norm": 1.9737319946289062, - "learning_rate": 3.998117953844215e-06, - "loss": 0.44684839248657227, - "mean_token_accuracy": 0.8367687463760376, - "num_tokens": 11676081.0, - "step": 1307 - }, - { - "epoch": 0.993920972644377, - "grad_norm": 1.4333021640777588, - "learning_rate": 3.996440730873861e-06, - "loss": 0.526146650314331, - "mean_token_accuracy": 0.816251814365387, - "num_tokens": 11689333.0, - "step": 1308 - }, - { - "epoch": 0.9946808510638298, - "grad_norm": 1.3689230680465698, - "learning_rate": 3.9947624576437975e-06, - "loss": 0.40214329957962036, - "mean_token_accuracy": 0.8610327839851379, - "num_tokens": 11701540.0, - "step": 1309 - }, - { - "epoch": 0.9954407294832827, - "grad_norm": 1.2435375452041626, - "learning_rate": 3.9930831353319025e-06, - "loss": 0.4532913267612457, - "mean_token_accuracy": 0.8415389060974121, - "num_tokens": 11717920.0, - "step": 1310 - }, - { - "epoch": 0.9962006079027356, - "grad_norm": 1.9968011379241943, - "learning_rate": 3.9914027651167866e-06, - "loss": 0.46954160928726196, - "mean_token_accuracy": 0.8351103663444519, - "num_tokens": 11724999.0, - "step": 1311 - }, - { - "epoch": 0.9969604863221885, - "grad_norm": 1.9521311521530151, - "learning_rate": 3.989721348177801e-06, - "loss": 0.5068016052246094, - "mean_token_accuracy": 0.8220845460891724, - "num_tokens": 11732569.0, - "step": 1312 - }, - { - "epoch": 0.9977203647416414, - "grad_norm": 2.7332582473754883, - "learning_rate": 3.988038885695028e-06, - "loss": 0.4154692590236664, - "mean_token_accuracy": 0.8493857383728027, - "num_tokens": 11736759.0, - "step": 1313 - }, - { - "epoch": 0.9984802431610942, - "grad_norm": 1.8656952381134033, - "learning_rate": 3.986355378849284e-06, - "loss": 0.4151354134082794, - "mean_token_accuracy": 0.83440101146698, - "num_tokens": 11743827.0, - "step": 1314 - }, - { - "epoch": 0.9992401215805471, - "grad_norm": 1.304006576538086, - "learning_rate": 3.984670828822118e-06, - "loss": 0.4926128089427948, - "mean_token_accuracy": 0.8603005409240723, - "num_tokens": 11757707.0, - "step": 1315 - }, - { - "epoch": 1.0, - "grad_norm": 1.497079610824585, - "learning_rate": 3.982985236795815e-06, - "loss": 0.43342477083206177, - "mean_token_accuracy": 0.8550825119018555, - "num_tokens": 11769678.0, - "step": 1316 - }, - { - "epoch": 1.000759878419453, - "grad_norm": 2.870274543762207, - "learning_rate": 3.981298603953385e-06, - "loss": 0.3723528981208801, - "mean_token_accuracy": 0.8745899796485901, - "num_tokens": 11773290.0, - "step": 1317 - }, - { - "epoch": 1.0015197568389058, - "grad_norm": 1.3442503213882446, - "learning_rate": 3.979610931478574e-06, - "loss": 0.34688329696655273, - "mean_token_accuracy": 0.8749074935913086, - "num_tokens": 11786400.0, - "step": 1318 - }, - { - "epoch": 1.0022796352583587, - "grad_norm": 1.7272238731384277, - "learning_rate": 3.977922220555855e-06, - "loss": 0.28274932503700256, - "mean_token_accuracy": 0.896713137626648, - "num_tokens": 11793059.0, - "step": 1319 - }, - { - "epoch": 1.0030395136778116, - "grad_norm": 1.7362451553344727, - "learning_rate": 3.976232472370431e-06, - "loss": 0.5494794845581055, - "mean_token_accuracy": 0.8341718912124634, - "num_tokens": 11802593.0, - "step": 1320 - }, - { - "epoch": 1.0037993920972645, - "grad_norm": 1.3316494226455688, - "learning_rate": 3.97454168810823e-06, - "loss": 0.41505366563796997, - "mean_token_accuracy": 0.8581969738006592, - "num_tokens": 11813925.0, - "step": 1321 - }, - { - "epoch": 1.0045592705167172, - "grad_norm": 1.6152615547180176, - "learning_rate": 3.972849868955913e-06, - "loss": 0.44761013984680176, - "mean_token_accuracy": 0.8413045406341553, - "num_tokens": 11825709.0, - "step": 1322 - }, - { - "epoch": 1.0053191489361701, - "grad_norm": 2.1172471046447754, - "learning_rate": 3.97115701610086e-06, - "loss": 0.3903353810310364, - "mean_token_accuracy": 0.8662760257720947, - "num_tokens": 11832070.0, - "step": 1323 - }, - { - "epoch": 1.006079027355623, - "grad_norm": 1.5923868417739868, - "learning_rate": 3.969463130731183e-06, - "loss": 0.4491051137447357, - "mean_token_accuracy": 0.8677828311920166, - "num_tokens": 11843154.0, - "step": 1324 - }, - { - "epoch": 1.006838905775076, - "grad_norm": 1.6848995685577393, - "learning_rate": 3.967768214035716e-06, - "loss": 0.45765817165374756, - "mean_token_accuracy": 0.8401060104370117, - "num_tokens": 11854826.0, - "step": 1325 - }, - { - "epoch": 1.0075987841945289, - "grad_norm": 2.3739020824432373, - "learning_rate": 3.966072267204014e-06, - "loss": 0.4482722580432892, - "mean_token_accuracy": 0.8368916511535645, - "num_tokens": 11860559.0, - "step": 1326 - }, - { - "epoch": 1.0083586626139818, - "grad_norm": 1.5403034687042236, - "learning_rate": 3.964375291426361e-06, - "loss": 0.35589972138404846, - "mean_token_accuracy": 0.8728118538856506, - "num_tokens": 11871959.0, - "step": 1327 - }, - { - "epoch": 1.0091185410334347, - "grad_norm": 1.6750119924545288, - "learning_rate": 3.962677287893758e-06, - "loss": 0.35873427987098694, - "mean_token_accuracy": 0.9027186632156372, - "num_tokens": 11881818.0, - "step": 1328 - }, - { - "epoch": 1.0098784194528876, - "grad_norm": 1.5489170551300049, - "learning_rate": 3.9609782577979305e-06, - "loss": 0.3634672462940216, - "mean_token_accuracy": 0.8582607507705688, - "num_tokens": 11891084.0, - "step": 1329 - }, - { - "epoch": 1.0106382978723405, - "grad_norm": 2.43859601020813, - "learning_rate": 3.959278202331323e-06, - "loss": 0.3640799820423126, - "mean_token_accuracy": 0.88062584400177, - "num_tokens": 11896032.0, - "step": 1330 - }, - { - "epoch": 1.0113981762917934, - "grad_norm": 3.612184524536133, - "learning_rate": 3.9575771226870986e-06, - "loss": 0.3733130097389221, - "mean_token_accuracy": 0.8946067094802856, - "num_tokens": 11899479.0, - "step": 1331 - }, - { - "epoch": 1.012158054711246, - "grad_norm": 1.541355848312378, - "learning_rate": 3.955875020059141e-06, - "loss": 0.320593923330307, - "mean_token_accuracy": 0.9057406783103943, - "num_tokens": 11910179.0, - "step": 1332 - }, - { - "epoch": 1.012917933130699, - "grad_norm": 2.0565030574798584, - "learning_rate": 3.954171895642052e-06, - "loss": 0.3341682553291321, - "mean_token_accuracy": 0.8829344511032104, - "num_tokens": 11916489.0, - "step": 1333 - }, - { - "epoch": 1.013677811550152, - "grad_norm": 2.9732539653778076, - "learning_rate": 3.9524677506311505e-06, - "loss": 0.38488566875457764, - "mean_token_accuracy": 0.8752974271774292, - "num_tokens": 11920682.0, - "step": 1334 - }, - { - "epoch": 1.0144376899696048, - "grad_norm": 2.7697458267211914, - "learning_rate": 3.950762586222469e-06, - "loss": 0.39864760637283325, - "mean_token_accuracy": 0.8593167662620544, - "num_tokens": 11925233.0, - "step": 1335 - }, - { - "epoch": 1.0151975683890577, - "grad_norm": 2.2302119731903076, - "learning_rate": 3.949056403612758e-06, - "loss": 0.3985682725906372, - "mean_token_accuracy": 0.8677899837493896, - "num_tokens": 11932000.0, - "step": 1336 - }, - { - "epoch": 1.0159574468085106, - "grad_norm": 2.360572576522827, - "learning_rate": 3.947349203999485e-06, - "loss": 0.36940714716911316, - "mean_token_accuracy": 0.8760676383972168, - "num_tokens": 11937569.0, - "step": 1337 - }, - { - "epoch": 1.0167173252279635, - "grad_norm": 1.3383921384811401, - "learning_rate": 3.945640988580824e-06, - "loss": 0.40628793835639954, - "mean_token_accuracy": 0.866442084312439, - "num_tokens": 11955679.0, - "step": 1338 - }, - { - "epoch": 1.0174772036474165, - "grad_norm": 2.1502623558044434, - "learning_rate": 3.943931758555669e-06, - "loss": 0.4493565559387207, - "mean_token_accuracy": 0.8307522535324097, - "num_tokens": 11962734.0, - "step": 1339 - }, - { - "epoch": 1.0182370820668694, - "grad_norm": 2.4737331867218018, - "learning_rate": 3.942221515123624e-06, - "loss": 0.28508758544921875, - "mean_token_accuracy": 0.8967142105102539, - "num_tokens": 11967783.0, - "step": 1340 - }, - { - "epoch": 1.0189969604863223, - "grad_norm": 2.4525370597839355, - "learning_rate": 3.940510259485002e-06, - "loss": 0.40227818489074707, - "mean_token_accuracy": 0.8618967533111572, - "num_tokens": 11972918.0, - "step": 1341 - }, - { - "epoch": 1.0197568389057752, - "grad_norm": 1.7299731969833374, - "learning_rate": 3.938797992840828e-06, - "loss": 0.26339593529701233, - "mean_token_accuracy": 0.9004406929016113, - "num_tokens": 11981250.0, - "step": 1342 - }, - { - "epoch": 1.0205167173252279, - "grad_norm": 2.8756747245788574, - "learning_rate": 3.937084716392839e-06, - "loss": 0.47792482376098633, - "mean_token_accuracy": 0.8440839052200317, - "num_tokens": 11986356.0, - "step": 1343 - }, - { - "epoch": 1.0212765957446808, - "grad_norm": 2.104473114013672, - "learning_rate": 3.935370431343475e-06, - "loss": 0.36723971366882324, - "mean_token_accuracy": 0.8831232786178589, - "num_tokens": 11994495.0, - "step": 1344 - }, - { - "epoch": 1.0220364741641337, - "grad_norm": 1.9173074960708618, - "learning_rate": 3.933655138895889e-06, - "loss": 0.409319669008255, - "mean_token_accuracy": 0.8632645606994629, - "num_tokens": 12002060.0, - "step": 1345 - }, - { - "epoch": 1.0227963525835866, - "grad_norm": 2.958311080932617, - "learning_rate": 3.9319388402539395e-06, - "loss": 0.5390093922615051, - "mean_token_accuracy": 0.8204828500747681, - "num_tokens": 12007588.0, - "step": 1346 - }, - { - "epoch": 1.0235562310030395, - "grad_norm": 1.6470831632614136, - "learning_rate": 3.930221536622192e-06, - "loss": 0.4524633288383484, - "mean_token_accuracy": 0.8516575694084167, - "num_tokens": 12018831.0, - "step": 1347 - }, - { - "epoch": 1.0243161094224924, - "grad_norm": 1.3160780668258667, - "learning_rate": 3.928503229205913e-06, - "loss": 0.4180558919906616, - "mean_token_accuracy": 0.8495022058486938, - "num_tokens": 12033947.0, - "step": 1348 - }, - { - "epoch": 1.0250759878419453, - "grad_norm": 1.9686089754104614, - "learning_rate": 3.92678391921108e-06, - "loss": 0.41927334666252136, - "mean_token_accuracy": 0.8462997674942017, - "num_tokens": 12042005.0, - "step": 1349 - }, - { - "epoch": 1.0258358662613982, - "grad_norm": 2.351778507232666, - "learning_rate": 3.92506360784437e-06, - "loss": 0.2946245074272156, - "mean_token_accuracy": 0.9170923233032227, - "num_tokens": 12046579.0, - "step": 1350 - }, - { - "epoch": 1.0265957446808511, - "grad_norm": 2.0636913776397705, - "learning_rate": 3.923342296313162e-06, - "loss": 0.3422774076461792, - "mean_token_accuracy": 0.8809213638305664, - "num_tokens": 12053214.0, - "step": 1351 - }, - { - "epoch": 1.027355623100304, - "grad_norm": 1.7272592782974243, - "learning_rate": 3.92161998582554e-06, - "loss": 0.5864541530609131, - "mean_token_accuracy": 0.7986117601394653, - "num_tokens": 12068522.0, - "step": 1352 - }, - { - "epoch": 1.028115501519757, - "grad_norm": 0.8980231881141663, - "learning_rate": 3.919896677590289e-06, - "loss": 0.2964550256729126, - "mean_token_accuracy": 0.8911845088005066, - "num_tokens": 12093834.0, - "step": 1353 - }, - { - "epoch": 1.0288753799392096, - "grad_norm": 1.6031712293624878, - "learning_rate": 3.918172372816892e-06, - "loss": 0.37254488468170166, - "mean_token_accuracy": 0.8615843057632446, - "num_tokens": 12104393.0, - "step": 1354 - }, - { - "epoch": 1.0296352583586625, - "grad_norm": 1.282134771347046, - "learning_rate": 3.916447072715531e-06, - "loss": 0.3522927761077881, - "mean_token_accuracy": 0.8713657259941101, - "num_tokens": 12118671.0, - "step": 1355 - }, - { - "epoch": 1.0303951367781155, - "grad_norm": 2.1986680030822754, - "learning_rate": 3.914720778497091e-06, - "loss": 0.3716316223144531, - "mean_token_accuracy": 0.8661249279975891, - "num_tokens": 12125178.0, - "step": 1356 - }, - { - "epoch": 1.0311550151975684, - "grad_norm": 1.5937882661819458, - "learning_rate": 3.91299349137315e-06, - "loss": 0.48067355155944824, - "mean_token_accuracy": 0.8284252882003784, - "num_tokens": 12136785.0, - "step": 1357 - }, - { - "epoch": 1.0319148936170213, - "grad_norm": 1.6743099689483643, - "learning_rate": 3.9112652125559845e-06, - "loss": 0.4461551308631897, - "mean_token_accuracy": 0.8381845355033875, - "num_tokens": 12150066.0, - "step": 1358 - }, - { - "epoch": 1.0326747720364742, - "grad_norm": 2.2346715927124023, - "learning_rate": 3.909535943258567e-06, - "loss": 0.3148220181465149, - "mean_token_accuracy": 0.8797591924667358, - "num_tokens": 12155506.0, - "step": 1359 - }, - { - "epoch": 1.033434650455927, - "grad_norm": 1.9608992338180542, - "learning_rate": 3.907805684694567e-06, - "loss": 0.32598960399627686, - "mean_token_accuracy": 0.8819410800933838, - "num_tokens": 12163261.0, - "step": 1360 - }, - { - "epoch": 1.03419452887538, - "grad_norm": 2.413477897644043, - "learning_rate": 3.906074438078343e-06, - "loss": 0.38179588317871094, - "mean_token_accuracy": 0.8739585876464844, - "num_tokens": 12169254.0, - "step": 1361 - }, - { - "epoch": 1.034954407294833, - "grad_norm": 2.0258278846740723, - "learning_rate": 3.904342204624955e-06, - "loss": 0.33240315318107605, - "mean_token_accuracy": 0.8808181285858154, - "num_tokens": 12175379.0, - "step": 1362 - }, - { - "epoch": 1.0357142857142858, - "grad_norm": 2.4111437797546387, - "learning_rate": 3.9026089855501475e-06, - "loss": 0.412802517414093, - "mean_token_accuracy": 0.8504396677017212, - "num_tokens": 12182007.0, - "step": 1363 - }, - { - "epoch": 1.0364741641337385, - "grad_norm": 2.0424840450286865, - "learning_rate": 3.900874782070362e-06, - "loss": 0.2914797067642212, - "mean_token_accuracy": 0.8731886148452759, - "num_tokens": 12187743.0, - "step": 1364 - }, - { - "epoch": 1.0372340425531914, - "grad_norm": 2.9248716831207275, - "learning_rate": 3.899139595402729e-06, - "loss": 0.34071338176727295, - "mean_token_accuracy": 0.8736443519592285, - "num_tokens": 12191830.0, - "step": 1365 - }, - { - "epoch": 1.0379939209726443, - "grad_norm": 2.240220785140991, - "learning_rate": 3.8974034267650695e-06, - "loss": 0.23049014806747437, - "mean_token_accuracy": 0.9000070691108704, - "num_tokens": 12196460.0, - "step": 1366 - }, - { - "epoch": 1.0387537993920972, - "grad_norm": 1.5038460493087769, - "learning_rate": 3.895666277375892e-06, - "loss": 0.32255327701568604, - "mean_token_accuracy": 0.873004674911499, - "num_tokens": 12206230.0, - "step": 1367 - }, - { - "epoch": 1.0395136778115501, - "grad_norm": 1.2339142560958862, - "learning_rate": 3.893928148454398e-06, - "loss": 0.4069131314754486, - "mean_token_accuracy": 0.8461740016937256, - "num_tokens": 12226502.0, - "step": 1368 - }, - { - "epoch": 1.040273556231003, - "grad_norm": 2.531553268432617, - "learning_rate": 3.89218904122047e-06, - "loss": 0.43681037425994873, - "mean_token_accuracy": 0.8497104048728943, - "num_tokens": 12232241.0, - "step": 1369 - }, - { - "epoch": 1.041033434650456, - "grad_norm": 3.8404815196990967, - "learning_rate": 3.890448956894682e-06, - "loss": 0.3241814970970154, - "mean_token_accuracy": 0.884732723236084, - "num_tokens": 12235126.0, - "step": 1370 - }, - { - "epoch": 1.0417933130699089, - "grad_norm": 2.9608030319213867, - "learning_rate": 3.888707896698293e-06, - "loss": 0.4641021490097046, - "mean_token_accuracy": 0.8496800661087036, - "num_tokens": 12240630.0, - "step": 1371 - }, - { - "epoch": 1.0425531914893618, - "grad_norm": 2.1166417598724365, - "learning_rate": 3.886965861853243e-06, - "loss": 0.42038479447364807, - "mean_token_accuracy": 0.8512747287750244, - "num_tokens": 12247969.0, - "step": 1372 - }, - { - "epoch": 1.0433130699088147, - "grad_norm": 2.5918161869049072, - "learning_rate": 3.885222853582163e-06, - "loss": 0.2871917188167572, - "mean_token_accuracy": 0.9129709601402283, - "num_tokens": 12252161.0, - "step": 1373 - }, - { - "epoch": 1.0440729483282676, - "grad_norm": 2.4261348247528076, - "learning_rate": 3.88347887310836e-06, - "loss": 0.4003123342990875, - "mean_token_accuracy": 0.8570356369018555, - "num_tokens": 12258135.0, - "step": 1374 - }, - { - "epoch": 1.0448328267477203, - "grad_norm": 1.3439548015594482, - "learning_rate": 3.881733921655829e-06, - "loss": 0.3278140425682068, - "mean_token_accuracy": 0.8831373453140259, - "num_tokens": 12272849.0, - "step": 1375 - }, - { - "epoch": 1.0455927051671732, - "grad_norm": 1.527989387512207, - "learning_rate": 3.879988000449243e-06, - "loss": 0.33789363503456116, - "mean_token_accuracy": 0.8825669884681702, - "num_tokens": 12283281.0, - "step": 1376 - }, - { - "epoch": 1.046352583586626, - "grad_norm": 1.6755503416061401, - "learning_rate": 3.878241110713957e-06, - "loss": 0.4816160798072815, - "mean_token_accuracy": 0.8193758726119995, - "num_tokens": 12295422.0, - "step": 1377 - }, - { - "epoch": 1.047112462006079, - "grad_norm": 2.8110361099243164, - "learning_rate": 3.876493253676004e-06, - "loss": 0.38662949204444885, - "mean_token_accuracy": 0.8611986637115479, - "num_tokens": 12299806.0, - "step": 1378 - }, - { - "epoch": 1.047872340425532, - "grad_norm": 1.86097252368927, - "learning_rate": 3.8747444305621e-06, - "loss": 0.27612629532814026, - "mean_token_accuracy": 0.8984048366546631, - "num_tokens": 12306599.0, - "step": 1379 - }, - { - "epoch": 1.0486322188449848, - "grad_norm": 2.361828565597534, - "learning_rate": 3.872994642599635e-06, - "loss": 0.469953715801239, - "mean_token_accuracy": 0.8464452028274536, - "num_tokens": 12314249.0, - "step": 1380 - }, - { - "epoch": 1.0493920972644377, - "grad_norm": 1.9524794816970825, - "learning_rate": 3.871243891016676e-06, - "loss": 0.5419625043869019, - "mean_token_accuracy": 0.8468329906463623, - "num_tokens": 12324987.0, - "step": 1381 - }, - { - "epoch": 1.0501519756838906, - "grad_norm": 1.6931511163711548, - "learning_rate": 3.869492177041971e-06, - "loss": 0.3791416883468628, - "mean_token_accuracy": 0.8692882061004639, - "num_tokens": 12336864.0, - "step": 1382 - }, - { - "epoch": 1.0509118541033435, - "grad_norm": 1.909692406654358, - "learning_rate": 3.867739501904938e-06, - "loss": 0.27974557876586914, - "mean_token_accuracy": 0.9004636406898499, - "num_tokens": 12343093.0, - "step": 1383 - }, - { - "epoch": 1.0516717325227964, - "grad_norm": 1.415162205696106, - "learning_rate": 3.8659858668356735e-06, - "loss": 0.38928335905075073, - "mean_token_accuracy": 0.8491984009742737, - "num_tokens": 12356613.0, - "step": 1384 - }, - { - "epoch": 1.0524316109422491, - "grad_norm": 1.8195741176605225, - "learning_rate": 3.864231273064944e-06, - "loss": 0.3798758089542389, - "mean_token_accuracy": 0.8728072047233582, - "num_tokens": 12364860.0, - "step": 1385 - }, - { - "epoch": 1.053191489361702, - "grad_norm": 1.8481454849243164, - "learning_rate": 3.862475721824193e-06, - "loss": 0.269635945558548, - "mean_token_accuracy": 0.899247407913208, - "num_tokens": 12371841.0, - "step": 1386 - }, - { - "epoch": 1.053951367781155, - "grad_norm": 1.7838784456253052, - "learning_rate": 3.8607192143455325e-06, - "loss": 0.36971768736839294, - "mean_token_accuracy": 0.8833638429641724, - "num_tokens": 12380685.0, - "step": 1387 - }, - { - "epoch": 1.0547112462006079, - "grad_norm": 1.333358645439148, - "learning_rate": 3.858961751861748e-06, - "loss": 0.4039418399333954, - "mean_token_accuracy": 0.8541078567504883, - "num_tokens": 12394072.0, - "step": 1388 - }, - { - "epoch": 1.0554711246200608, - "grad_norm": 2.1600265502929688, - "learning_rate": 3.857203335606294e-06, - "loss": 0.38211894035339355, - "mean_token_accuracy": 0.8549972772598267, - "num_tokens": 12400449.0, - "step": 1389 - }, - { - "epoch": 1.0562310030395137, - "grad_norm": 2.914902687072754, - "learning_rate": 3.855443966813295e-06, - "loss": 0.2237374186515808, - "mean_token_accuracy": 0.9253600835800171, - "num_tokens": 12403758.0, - "step": 1390 - }, - { - "epoch": 1.0569908814589666, - "grad_norm": 2.2361080646514893, - "learning_rate": 3.853683646717543e-06, - "loss": 0.3359566926956177, - "mean_token_accuracy": 0.898173451423645, - "num_tokens": 12410374.0, - "step": 1391 - }, - { - "epoch": 1.0577507598784195, - "grad_norm": 2.3639304637908936, - "learning_rate": 3.8519223765544985e-06, - "loss": 0.3844943046569824, - "mean_token_accuracy": 0.863599419593811, - "num_tokens": 12416016.0, - "step": 1392 - }, - { - "epoch": 1.0585106382978724, - "grad_norm": 2.202971935272217, - "learning_rate": 3.85016015756029e-06, - "loss": 0.3546281158924103, - "mean_token_accuracy": 0.8907540440559387, - "num_tokens": 12422026.0, - "step": 1393 - }, - { - "epoch": 1.0592705167173253, - "grad_norm": 1.1279661655426025, - "learning_rate": 3.848396990971709e-06, - "loss": 0.31522464752197266, - "mean_token_accuracy": 0.8662257194519043, - "num_tokens": 12439964.0, - "step": 1394 - }, - { - "epoch": 1.0600303951367782, - "grad_norm": 2.4731740951538086, - "learning_rate": 3.846632878026214e-06, - "loss": 0.456442266702652, - "mean_token_accuracy": 0.8516958951950073, - "num_tokens": 12446231.0, - "step": 1395 - }, - { - "epoch": 1.060790273556231, - "grad_norm": 1.7631878852844238, - "learning_rate": 3.844867819961928e-06, - "loss": 0.487227201461792, - "mean_token_accuracy": 0.8466947078704834, - "num_tokens": 12459989.0, - "step": 1396 - }, - { - "epoch": 1.0615501519756838, - "grad_norm": 2.4468278884887695, - "learning_rate": 3.843101818017637e-06, - "loss": 0.3367291986942291, - "mean_token_accuracy": 0.8734689950942993, - "num_tokens": 12465741.0, - "step": 1397 - }, - { - "epoch": 1.0623100303951367, - "grad_norm": 1.9045145511627197, - "learning_rate": 3.841334873432789e-06, - "loss": 0.4652615487575531, - "mean_token_accuracy": 0.8333107233047485, - "num_tokens": 12474963.0, - "step": 1398 - }, - { - "epoch": 1.0630699088145896, - "grad_norm": 1.6816917657852173, - "learning_rate": 3.839566987447492e-06, - "loss": 0.4144279956817627, - "mean_token_accuracy": 0.8472539186477661, - "num_tokens": 12485521.0, - "step": 1399 - }, - { - "epoch": 1.0638297872340425, - "grad_norm": 1.8990092277526855, - "learning_rate": 3.837798161302518e-06, - "loss": 0.4040985405445099, - "mean_token_accuracy": 0.8514704704284668, - "num_tokens": 12493495.0, - "step": 1400 - }, - { - "epoch": 1.0645896656534954, - "grad_norm": 2.27785325050354, - "learning_rate": 3.836028396239297e-06, - "loss": 0.43425723910331726, - "mean_token_accuracy": 0.8795069456100464, - "num_tokens": 12499789.0, - "step": 1401 - }, - { - "epoch": 1.0653495440729484, - "grad_norm": 2.5130882263183594, - "learning_rate": 3.8342576934999184e-06, - "loss": 0.33892524242401123, - "mean_token_accuracy": 0.8717449903488159, - "num_tokens": 12504885.0, - "step": 1402 - }, - { - "epoch": 1.0661094224924013, - "grad_norm": 2.650040864944458, - "learning_rate": 3.832486054327131e-06, - "loss": 0.4200317859649658, - "mean_token_accuracy": 0.8616159558296204, - "num_tokens": 12509783.0, - "step": 1403 - }, - { - "epoch": 1.0668693009118542, - "grad_norm": 2.9176881313323975, - "learning_rate": 3.830713479964335e-06, - "loss": 0.37018489837646484, - "mean_token_accuracy": 0.8676021695137024, - "num_tokens": 12514441.0, - "step": 1404 - }, - { - "epoch": 1.067629179331307, - "grad_norm": 1.6430318355560303, - "learning_rate": 3.828939971655595e-06, - "loss": 0.27539193630218506, - "mean_token_accuracy": 0.9077831506729126, - "num_tokens": 12523677.0, - "step": 1405 - }, - { - "epoch": 1.06838905775076, - "grad_norm": 1.3683708906173706, - "learning_rate": 3.827165530645627e-06, - "loss": 0.4085099697113037, - "mean_token_accuracy": 0.8579255938529968, - "num_tokens": 12540104.0, - "step": 1406 - }, - { - "epoch": 1.0691489361702127, - "grad_norm": 2.528465747833252, - "learning_rate": 3.825390158179802e-06, - "loss": 0.42462456226348877, - "mean_token_accuracy": 0.852813720703125, - "num_tokens": 12548239.0, - "step": 1407 - }, - { - "epoch": 1.0699088145896656, - "grad_norm": 1.8288795948028564, - "learning_rate": 3.823613855504144e-06, - "loss": 0.412417471408844, - "mean_token_accuracy": 0.8622130751609802, - "num_tokens": 12557316.0, - "step": 1408 - }, - { - "epoch": 1.0706686930091185, - "grad_norm": 2.341794490814209, - "learning_rate": 3.82183662386533e-06, - "loss": 0.2996668815612793, - "mean_token_accuracy": 0.8964041471481323, - "num_tokens": 12562377.0, - "step": 1409 - }, - { - "epoch": 1.0714285714285714, - "grad_norm": 2.555877208709717, - "learning_rate": 3.82005846451069e-06, - "loss": 0.4184221625328064, - "mean_token_accuracy": 0.8678828477859497, - "num_tokens": 12568516.0, - "step": 1410 - }, - { - "epoch": 1.0721884498480243, - "grad_norm": 2.081308126449585, - "learning_rate": 3.8182793786882065e-06, - "loss": 0.4376835823059082, - "mean_token_accuracy": 0.8409077525138855, - "num_tokens": 12576598.0, - "step": 1411 - }, - { - "epoch": 1.0729483282674772, - "grad_norm": 2.0272316932678223, - "learning_rate": 3.816499367646508e-06, - "loss": 0.3630060851573944, - "mean_token_accuracy": 0.8762413263320923, - "num_tokens": 12584587.0, - "step": 1412 - }, - { - "epoch": 1.0737082066869301, - "grad_norm": 2.6382484436035156, - "learning_rate": 3.814718432634877e-06, - "loss": 0.4244990348815918, - "mean_token_accuracy": 0.8509312272071838, - "num_tokens": 12590028.0, - "step": 1413 - }, - { - "epoch": 1.074468085106383, - "grad_norm": 2.429800271987915, - "learning_rate": 3.8129365749032398e-06, - "loss": 0.36990004777908325, - "mean_token_accuracy": 0.8749774098396301, - "num_tokens": 12594984.0, - "step": 1414 - }, - { - "epoch": 1.075227963525836, - "grad_norm": 3.5939090251922607, - "learning_rate": 3.8111537957021736e-06, - "loss": 0.4245661199092865, - "mean_token_accuracy": 0.8481623530387878, - "num_tokens": 12598494.0, - "step": 1415 - }, - { - "epoch": 1.0759878419452888, - "grad_norm": 2.705955982208252, - "learning_rate": 3.809370096282903e-06, - "loss": 0.41851678490638733, - "mean_token_accuracy": 0.8548051714897156, - "num_tokens": 12603876.0, - "step": 1416 - }, - { - "epoch": 1.0767477203647418, - "grad_norm": 1.7812079191207886, - "learning_rate": 3.807585477897296e-06, - "loss": 0.47113919258117676, - "mean_token_accuracy": 0.8346904516220093, - "num_tokens": 12613402.0, - "step": 1417 - }, - { - "epoch": 1.0775075987841944, - "grad_norm": 1.4335212707519531, - "learning_rate": 3.8057999417978654e-06, - "loss": 0.3802063465118408, - "mean_token_accuracy": 0.8563423156738281, - "num_tokens": 12626865.0, - "step": 1418 - }, - { - "epoch": 1.0782674772036474, - "grad_norm": 1.9171305894851685, - "learning_rate": 3.8040134892377702e-06, - "loss": 0.20898357033729553, - "mean_token_accuracy": 0.9189738035202026, - "num_tokens": 12632593.0, - "step": 1419 - }, - { - "epoch": 1.0790273556231003, - "grad_norm": 1.4996821880340576, - "learning_rate": 3.802226121470811e-06, - "loss": 0.4203261137008667, - "mean_token_accuracy": 0.8479211330413818, - "num_tokens": 12646395.0, - "step": 1420 - }, - { - "epoch": 1.0797872340425532, - "grad_norm": 2.2007253170013428, - "learning_rate": 3.800437839751432e-06, - "loss": 0.40370577573776245, - "mean_token_accuracy": 0.8427679538726807, - "num_tokens": 12653508.0, - "step": 1421 - }, - { - "epoch": 1.080547112462006, - "grad_norm": 1.7266581058502197, - "learning_rate": 3.7986486453347183e-06, - "loss": 0.46750491857528687, - "mean_token_accuracy": 0.8429205417633057, - "num_tokens": 12666329.0, - "step": 1422 - }, - { - "epoch": 1.081306990881459, - "grad_norm": 1.4716318845748901, - "learning_rate": 3.796858539476394e-06, - "loss": 0.3330317735671997, - "mean_token_accuracy": 0.879012942314148, - "num_tokens": 12676741.0, - "step": 1423 - }, - { - "epoch": 1.082066869300912, - "grad_norm": 2.652127265930176, - "learning_rate": 3.795067523432826e-06, - "loss": 0.35365715622901917, - "mean_token_accuracy": 0.8796792030334473, - "num_tokens": 12681479.0, - "step": 1424 - }, - { - "epoch": 1.0828267477203648, - "grad_norm": 1.2937829494476318, - "learning_rate": 3.793275598461017e-06, - "loss": 0.25272446870803833, - "mean_token_accuracy": 0.9231734275817871, - "num_tokens": 12694238.0, - "step": 1425 - }, - { - "epoch": 1.0835866261398177, - "grad_norm": 1.3831220865249634, - "learning_rate": 3.7914827658186104e-06, - "loss": 0.4935331344604492, - "mean_token_accuracy": 0.8417420387268066, - "num_tokens": 12712857.0, - "step": 1426 - }, - { - "epoch": 1.0843465045592706, - "grad_norm": 3.059525728225708, - "learning_rate": 3.7896890267638832e-06, - "loss": 0.2592190206050873, - "mean_token_accuracy": 0.9040263295173645, - "num_tokens": 12716766.0, - "step": 1427 - }, - { - "epoch": 1.0851063829787233, - "grad_norm": 2.8399202823638916, - "learning_rate": 3.787894382555752e-06, - "loss": 0.32098138332366943, - "mean_token_accuracy": 0.8838302493095398, - "num_tokens": 12720774.0, - "step": 1428 - }, - { - "epoch": 1.0858662613981762, - "grad_norm": 2.618479013442993, - "learning_rate": 3.7860988344537664e-06, - "loss": 0.425255686044693, - "mean_token_accuracy": 0.8564130067825317, - "num_tokens": 12726506.0, - "step": 1429 - }, - { - "epoch": 1.0866261398176291, - "grad_norm": 1.3108669519424438, - "learning_rate": 3.7843023837181126e-06, - "loss": 0.40220165252685547, - "mean_token_accuracy": 0.8588873147964478, - "num_tokens": 12742814.0, - "step": 1430 - }, - { - "epoch": 1.087386018237082, - "grad_norm": 2.2083566188812256, - "learning_rate": 3.782505031609607e-06, - "loss": 0.318379282951355, - "mean_token_accuracy": 0.8887606859207153, - "num_tokens": 12748388.0, - "step": 1431 - }, - { - "epoch": 1.088145896656535, - "grad_norm": 1.922358751296997, - "learning_rate": 3.7807067793897006e-06, - "loss": 0.2519589364528656, - "mean_token_accuracy": 0.8936764001846313, - "num_tokens": 12754761.0, - "step": 1432 - }, - { - "epoch": 1.0889057750759878, - "grad_norm": 1.7367439270019531, - "learning_rate": 3.778907628320477e-06, - "loss": 0.3970367908477783, - "mean_token_accuracy": 0.858735203742981, - "num_tokens": 12764016.0, - "step": 1433 - }, - { - "epoch": 1.0896656534954408, - "grad_norm": 2.1931066513061523, - "learning_rate": 3.77710757966465e-06, - "loss": 0.5250554084777832, - "mean_token_accuracy": 0.8356746435165405, - "num_tokens": 12772272.0, - "step": 1434 - }, - { - "epoch": 1.0904255319148937, - "grad_norm": 1.718337893486023, - "learning_rate": 3.775306634685562e-06, - "loss": 0.283231645822525, - "mean_token_accuracy": 0.9009919166564941, - "num_tokens": 12780706.0, - "step": 1435 - }, - { - "epoch": 1.0911854103343466, - "grad_norm": 2.1985926628112793, - "learning_rate": 3.773504794647187e-06, - "loss": 0.3913170397281647, - "mean_token_accuracy": 0.8909255266189575, - "num_tokens": 12787052.0, - "step": 1436 - }, - { - "epoch": 1.0919452887537995, - "grad_norm": 2.8687937259674072, - "learning_rate": 3.771702060814123e-06, - "loss": 0.3135771155357361, - "mean_token_accuracy": 0.9016125202178955, - "num_tokens": 12791854.0, - "step": 1437 - }, - { - "epoch": 1.0927051671732522, - "grad_norm": 4.203946590423584, - "learning_rate": 3.7698984344516e-06, - "loss": 0.3642737865447998, - "mean_token_accuracy": 0.8842349052429199, - "num_tokens": 12794969.0, - "step": 1438 - }, - { - "epoch": 1.093465045592705, - "grad_norm": 1.5134642124176025, - "learning_rate": 3.7680939168254733e-06, - "loss": 0.3732057213783264, - "mean_token_accuracy": 0.8671083450317383, - "num_tokens": 12808480.0, - "step": 1439 - }, - { - "epoch": 1.094224924012158, - "grad_norm": 3.2103970050811768, - "learning_rate": 3.7662885092022206e-06, - "loss": 0.3556194603443146, - "mean_token_accuracy": 0.8786529302597046, - "num_tokens": 12812654.0, - "step": 1440 - }, - { - "epoch": 1.094984802431611, - "grad_norm": 2.2774064540863037, - "learning_rate": 3.7644822128489476e-06, - "loss": 0.38409674167633057, - "mean_token_accuracy": 0.866563081741333, - "num_tokens": 12819854.0, - "step": 1441 - }, - { - "epoch": 1.0957446808510638, - "grad_norm": 1.8250885009765625, - "learning_rate": 3.7626750290333824e-06, - "loss": 0.3812350034713745, - "mean_token_accuracy": 0.8676212430000305, - "num_tokens": 12830338.0, - "step": 1442 - }, - { - "epoch": 1.0965045592705167, - "grad_norm": 1.8337891101837158, - "learning_rate": 3.7608669590238765e-06, - "loss": 0.3892471194267273, - "mean_token_accuracy": 0.8616238832473755, - "num_tokens": 12840340.0, - "step": 1443 - }, - { - "epoch": 1.0972644376899696, - "grad_norm": 1.5300254821777344, - "learning_rate": 3.7590580040894025e-06, - "loss": 0.35288217663764954, - "mean_token_accuracy": 0.8625509738922119, - "num_tokens": 12853144.0, - "step": 1444 - }, - { - "epoch": 1.0980243161094225, - "grad_norm": 2.152683734893799, - "learning_rate": 3.7572481654995554e-06, - "loss": 0.4004772901535034, - "mean_token_accuracy": 0.858427107334137, - "num_tokens": 12859970.0, - "step": 1445 - }, - { - "epoch": 1.0987841945288754, - "grad_norm": 1.532832145690918, - "learning_rate": 3.755437444524548e-06, - "loss": 0.46820127964019775, - "mean_token_accuracy": 0.8585472106933594, - "num_tokens": 12875243.0, - "step": 1446 - }, - { - "epoch": 1.0995440729483283, - "grad_norm": 1.6485342979431152, - "learning_rate": 3.7536258424352164e-06, - "loss": 0.46329325437545776, - "mean_token_accuracy": 0.8376060724258423, - "num_tokens": 12886383.0, - "step": 1447 - }, - { - "epoch": 1.1003039513677813, - "grad_norm": 2.402256488800049, - "learning_rate": 3.75181336050301e-06, - "loss": 0.43916207551956177, - "mean_token_accuracy": 0.8448786735534668, - "num_tokens": 12892613.0, - "step": 1448 - }, - { - "epoch": 1.101063829787234, - "grad_norm": 1.3893651962280273, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.3919021785259247, - "mean_token_accuracy": 0.8495820760726929, - "num_tokens": 12905523.0, - "step": 1449 - }, - { - "epoch": 1.1018237082066868, - "grad_norm": 1.5519827604293823, - "learning_rate": 3.7481857621988734e-06, - "loss": 0.4710700809955597, - "mean_token_accuracy": 0.8387632369995117, - "num_tokens": 12918236.0, - "step": 1450 - }, - { - "epoch": 1.1025835866261398, - "grad_norm": 2.0141353607177734, - "learning_rate": 3.74637064837293e-06, - "loss": 0.30866751074790955, - "mean_token_accuracy": 0.9059321880340576, - "num_tokens": 12924391.0, - "step": 1451 - }, - { - "epoch": 1.1033434650455927, - "grad_norm": 1.2201496362686157, - "learning_rate": 3.7445546597960882e-06, - "loss": 0.3938257396221161, - "mean_token_accuracy": 0.8726630210876465, - "num_tokens": 12943338.0, - "step": 1452 - }, - { - "epoch": 1.1041033434650456, - "grad_norm": 2.29434871673584, - "learning_rate": 3.742737797742878e-06, - "loss": 0.4347776174545288, - "mean_token_accuracy": 0.840569257736206, - "num_tokens": 12950636.0, - "step": 1453 - }, - { - "epoch": 1.1048632218844985, - "grad_norm": 2.3875105381011963, - "learning_rate": 3.7409200634884425e-06, - "loss": 0.48353564739227295, - "mean_token_accuracy": 0.8207056522369385, - "num_tokens": 12957635.0, - "step": 1454 - }, - { - "epoch": 1.1056231003039514, - "grad_norm": 2.3539648056030273, - "learning_rate": 3.7391014583085384e-06, - "loss": 0.3532431721687317, - "mean_token_accuracy": 0.8903788924217224, - "num_tokens": 12963032.0, - "step": 1455 - }, - { - "epoch": 1.1063829787234043, - "grad_norm": 1.5611135959625244, - "learning_rate": 3.737281983479534e-06, - "loss": 0.4734863042831421, - "mean_token_accuracy": 0.8413879871368408, - "num_tokens": 12977170.0, - "step": 1456 - }, - { - "epoch": 1.1071428571428572, - "grad_norm": 1.474320411682129, - "learning_rate": 3.735461640278404e-06, - "loss": 0.41854286193847656, - "mean_token_accuracy": 0.8499876856803894, - "num_tokens": 12993750.0, - "step": 1457 - }, - { - "epoch": 1.1079027355623101, - "grad_norm": 2.6873273849487305, - "learning_rate": 3.733640429982738e-06, - "loss": 0.47637903690338135, - "mean_token_accuracy": 0.83599853515625, - "num_tokens": 12999058.0, - "step": 1458 - }, - { - "epoch": 1.108662613981763, - "grad_norm": 1.4575026035308838, - "learning_rate": 3.731818353870729e-06, - "loss": 0.38441652059555054, - "mean_token_accuracy": 0.8582364320755005, - "num_tokens": 13013864.0, - "step": 1459 - }, - { - "epoch": 1.1094224924012157, - "grad_norm": 1.7722690105438232, - "learning_rate": 3.729995413221183e-06, - "loss": 0.4224998950958252, - "mean_token_accuracy": 0.8511888384819031, - "num_tokens": 13023714.0, - "step": 1460 - }, - { - "epoch": 1.1101823708206686, - "grad_norm": 2.625760555267334, - "learning_rate": 3.7281716093135068e-06, - "loss": 0.3487582802772522, - "mean_token_accuracy": 0.8834779262542725, - "num_tokens": 13028608.0, - "step": 1461 - }, - { - "epoch": 1.1109422492401215, - "grad_norm": 1.2554056644439697, - "learning_rate": 3.726346943427719e-06, - "loss": 0.33312469720840454, - "mean_token_accuracy": 0.8704153299331665, - "num_tokens": 13044901.0, - "step": 1462 - }, - { - "epoch": 1.1117021276595744, - "grad_norm": 2.1109910011291504, - "learning_rate": 3.7245214168444388e-06, - "loss": 0.387290894985199, - "mean_token_accuracy": 0.860816240310669, - "num_tokens": 13051452.0, - "step": 1463 - }, - { - "epoch": 1.1124620060790273, - "grad_norm": 3.159201145172119, - "learning_rate": 3.722695030844891e-06, - "loss": 0.37690871953964233, - "mean_token_accuracy": 0.8717561960220337, - "num_tokens": 13055131.0, - "step": 1464 - }, - { - "epoch": 1.1132218844984803, - "grad_norm": 1.3810011148452759, - "learning_rate": 3.7208677867109042e-06, - "loss": 0.36598485708236694, - "mean_token_accuracy": 0.8683375120162964, - "num_tokens": 13069798.0, - "step": 1465 - }, - { - "epoch": 1.1139817629179332, - "grad_norm": 2.500849485397339, - "learning_rate": 3.7190396857249087e-06, - "loss": 0.2781746983528137, - "mean_token_accuracy": 0.9026005268096924, - "num_tokens": 13075127.0, - "step": 1466 - }, - { - "epoch": 1.114741641337386, - "grad_norm": 1.7445712089538574, - "learning_rate": 3.7172107291699356e-06, - "loss": 0.5055314302444458, - "mean_token_accuracy": 0.8252174258232117, - "num_tokens": 13084843.0, - "step": 1467 - }, - { - "epoch": 1.115501519756839, - "grad_norm": 1.6386256217956543, - "learning_rate": 3.7153809183296174e-06, - "loss": 0.38478314876556396, - "mean_token_accuracy": 0.8600847721099854, - "num_tokens": 13096517.0, - "step": 1468 - }, - { - "epoch": 1.1162613981762919, - "grad_norm": 2.3818395137786865, - "learning_rate": 3.713550254488185e-06, - "loss": 0.40308547019958496, - "mean_token_accuracy": 0.8628184795379639, - "num_tokens": 13102324.0, - "step": 1469 - }, - { - "epoch": 1.1170212765957448, - "grad_norm": 1.73163640499115, - "learning_rate": 3.7117187389304703e-06, - "loss": 0.5035421848297119, - "mean_token_accuracy": 0.8229597210884094, - "num_tokens": 13113763.0, - "step": 1470 - }, - { - "epoch": 1.1177811550151975, - "grad_norm": 3.147177219390869, - "learning_rate": 3.7098863729418997e-06, - "loss": 0.557449221611023, - "mean_token_accuracy": 0.8266849517822266, - "num_tokens": 13118849.0, - "step": 1471 - }, - { - "epoch": 1.1185410334346504, - "grad_norm": 1.5061391592025757, - "learning_rate": 3.7080531578085e-06, - "loss": 0.3759554922580719, - "mean_token_accuracy": 0.8541903495788574, - "num_tokens": 13131337.0, - "step": 1472 - }, - { - "epoch": 1.1193009118541033, - "grad_norm": 2.172346353530884, - "learning_rate": 3.7062190948168906e-06, - "loss": 0.41491609811782837, - "mean_token_accuracy": 0.8531454801559448, - "num_tokens": 13139767.0, - "step": 1473 - }, - { - "epoch": 1.1200607902735562, - "grad_norm": 2.1527154445648193, - "learning_rate": 3.7043841852542884e-06, - "loss": 0.4309239387512207, - "mean_token_accuracy": 0.8327745199203491, - "num_tokens": 13147210.0, - "step": 1474 - }, - { - "epoch": 1.1208206686930091, - "grad_norm": 1.8342832326889038, - "learning_rate": 3.7025484304085035e-06, - "loss": 0.34393298625946045, - "mean_token_accuracy": 0.8948153257369995, - "num_tokens": 13154831.0, - "step": 1475 - }, - { - "epoch": 1.121580547112462, - "grad_norm": 2.509291172027588, - "learning_rate": 3.7007118315679384e-06, - "loss": 0.4479471445083618, - "mean_token_accuracy": 0.8280234336853027, - "num_tokens": 13161040.0, - "step": 1476 - }, - { - "epoch": 1.122340425531915, - "grad_norm": 2.914710521697998, - "learning_rate": 3.6988743900215895e-06, - "loss": 0.3724832832813263, - "mean_token_accuracy": 0.863893985748291, - "num_tokens": 13164975.0, - "step": 1477 - }, - { - "epoch": 1.1231003039513678, - "grad_norm": 3.274808645248413, - "learning_rate": 3.6970361070590443e-06, - "loss": 0.4088161885738373, - "mean_token_accuracy": 0.8474822044372559, - "num_tokens": 13168826.0, - "step": 1478 - }, - { - "epoch": 1.1238601823708207, - "grad_norm": 2.861546277999878, - "learning_rate": 3.695196983970481e-06, - "loss": 0.45837992429733276, - "mean_token_accuracy": 0.8579759001731873, - "num_tokens": 13173794.0, - "step": 1479 - }, - { - "epoch": 1.1246200607902737, - "grad_norm": 1.9491597414016724, - "learning_rate": 3.6933570220466654e-06, - "loss": 0.4333910346031189, - "mean_token_accuracy": 0.8444236516952515, - "num_tokens": 13181598.0, - "step": 1480 - }, - { - "epoch": 1.1253799392097266, - "grad_norm": 1.329848051071167, - "learning_rate": 3.6915162225789546e-06, - "loss": 0.36404621601104736, - "mean_token_accuracy": 0.8694117069244385, - "num_tokens": 13196381.0, - "step": 1481 - }, - { - "epoch": 1.1261398176291793, - "grad_norm": 1.8854197263717651, - "learning_rate": 3.6896745868592924e-06, - "loss": 0.4085756838321686, - "mean_token_accuracy": 0.855188250541687, - "num_tokens": 13205236.0, - "step": 1482 - }, - { - "epoch": 1.1268996960486322, - "grad_norm": 3.01684832572937, - "learning_rate": 3.6878321161802106e-06, - "loss": 0.28105655312538147, - "mean_token_accuracy": 0.9009426236152649, - "num_tokens": 13209380.0, - "step": 1483 - }, - { - "epoch": 1.127659574468085, - "grad_norm": 1.8051308393478394, - "learning_rate": 3.685988811834823e-06, - "loss": 0.3314531147480011, - "mean_token_accuracy": 0.8805814385414124, - "num_tokens": 13217714.0, - "step": 1484 - }, - { - "epoch": 1.128419452887538, - "grad_norm": 1.61757493019104, - "learning_rate": 3.684144675116836e-06, - "loss": 0.4543863534927368, - "mean_token_accuracy": 0.8400536775588989, - "num_tokens": 13229330.0, - "step": 1485 - }, - { - "epoch": 1.1291793313069909, - "grad_norm": 1.602686882019043, - "learning_rate": 3.682299707320532e-06, - "loss": 0.3653204143047333, - "mean_token_accuracy": 0.8655825853347778, - "num_tokens": 13242872.0, - "step": 1486 - }, - { - "epoch": 1.1299392097264438, - "grad_norm": 2.3093113899230957, - "learning_rate": 3.680453909740782e-06, - "loss": 0.4383693039417267, - "mean_token_accuracy": 0.839782178401947, - "num_tokens": 13248976.0, - "step": 1487 - }, - { - "epoch": 1.1306990881458967, - "grad_norm": 1.180559754371643, - "learning_rate": 3.6786072836730376e-06, - "loss": 0.5354755520820618, - "mean_token_accuracy": 0.8151205778121948, - "num_tokens": 13272896.0, - "step": 1488 - }, - { - "epoch": 1.1314589665653496, - "grad_norm": 1.9554040431976318, - "learning_rate": 3.6767598304133325e-06, - "loss": 0.4485316872596741, - "mean_token_accuracy": 0.8399936556816101, - "num_tokens": 13280757.0, - "step": 1489 - }, - { - "epoch": 1.1322188449848025, - "grad_norm": 2.236471176147461, - "learning_rate": 3.674911551258279e-06, - "loss": 0.45594364404678345, - "mean_token_accuracy": 0.8552400469779968, - "num_tokens": 13287328.0, - "step": 1490 - }, - { - "epoch": 1.1329787234042552, - "grad_norm": 2.5228686332702637, - "learning_rate": 3.673062447505072e-06, - "loss": 0.4048641622066498, - "mean_token_accuracy": 0.8617376685142517, - "num_tokens": 13292716.0, - "step": 1491 - }, - { - "epoch": 1.1337386018237081, - "grad_norm": 1.1274473667144775, - "learning_rate": 3.6712125204514836e-06, - "loss": 0.3848876357078552, - "mean_token_accuracy": 0.8672975301742554, - "num_tokens": 13313403.0, - "step": 1492 - }, - { - "epoch": 1.134498480243161, - "grad_norm": 2.349541425704956, - "learning_rate": 3.6693617713958633e-06, - "loss": 0.3166058361530304, - "mean_token_accuracy": 0.8896721601486206, - "num_tokens": 13318720.0, - "step": 1493 - }, - { - "epoch": 1.135258358662614, - "grad_norm": 2.2438278198242188, - "learning_rate": 3.6675102016371387e-06, - "loss": 0.5418218970298767, - "mean_token_accuracy": 0.8256527185440063, - "num_tokens": 13325360.0, - "step": 1494 - }, - { - "epoch": 1.1360182370820668, - "grad_norm": 2.21268892288208, - "learning_rate": 3.665657812474812e-06, - "loss": 0.48603951930999756, - "mean_token_accuracy": 0.8273470401763916, - "num_tokens": 13333217.0, - "step": 1495 - }, - { - "epoch": 1.1367781155015197, - "grad_norm": 2.6105997562408447, - "learning_rate": 3.6638046052089614e-06, - "loss": 0.31221291422843933, - "mean_token_accuracy": 0.888375997543335, - "num_tokens": 13338413.0, - "step": 1496 - }, - { - "epoch": 1.1375379939209727, - "grad_norm": 3.655658483505249, - "learning_rate": 3.661950581140239e-06, - "loss": 0.3609023988246918, - "mean_token_accuracy": 0.8838576078414917, - "num_tokens": 13341499.0, - "step": 1497 - }, - { - "epoch": 1.1382978723404256, - "grad_norm": 2.242009162902832, - "learning_rate": 3.660095741569871e-06, - "loss": 0.40022802352905273, - "mean_token_accuracy": 0.8559960722923279, - "num_tokens": 13347917.0, - "step": 1498 - }, - { - "epoch": 1.1390577507598785, - "grad_norm": 1.7958979606628418, - "learning_rate": 3.658240087799655e-06, - "loss": 0.499157190322876, - "mean_token_accuracy": 0.8423802256584167, - "num_tokens": 13361570.0, - "step": 1499 - }, - { - "epoch": 1.1398176291793314, - "grad_norm": 2.5406908988952637, - "learning_rate": 3.6563836211319593e-06, - "loss": 0.4090137481689453, - "mean_token_accuracy": 0.8769663572311401, - "num_tokens": 13367183.0, - "step": 1500 - }, - { - "epoch": 1.1405775075987843, - "grad_norm": 1.9861716032028198, - "learning_rate": 3.654526342869724e-06, - "loss": 0.5125207304954529, - "mean_token_accuracy": 0.8315266370773315, - "num_tokens": 13376767.0, - "step": 1501 - }, - { - "epoch": 1.141337386018237, - "grad_norm": 1.731188178062439, - "learning_rate": 3.65266825431646e-06, - "loss": 0.39452576637268066, - "mean_token_accuracy": 0.8585706353187561, - "num_tokens": 13388437.0, - "step": 1502 - }, - { - "epoch": 1.1420972644376899, - "grad_norm": 1.5203773975372314, - "learning_rate": 3.6508093567762425e-06, - "loss": 0.39466819167137146, - "mean_token_accuracy": 0.8584027886390686, - "num_tokens": 13399727.0, - "step": 1503 - }, - { - "epoch": 1.1428571428571428, - "grad_norm": 2.606462001800537, - "learning_rate": 3.6489496515537204e-06, - "loss": 0.4521079361438751, - "mean_token_accuracy": 0.8413360118865967, - "num_tokens": 13408426.0, - "step": 1504 - }, - { - "epoch": 1.1436170212765957, - "grad_norm": 2.6207993030548096, - "learning_rate": 3.647089139954104e-06, - "loss": 0.4709353446960449, - "mean_token_accuracy": 0.8397113084793091, - "num_tokens": 13413506.0, - "step": 1505 - }, - { - "epoch": 1.1443768996960486, - "grad_norm": 1.7214165925979614, - "learning_rate": 3.6452278232831734e-06, - "loss": 0.45506367087364197, - "mean_token_accuracy": 0.8466023206710815, - "num_tokens": 13424592.0, - "step": 1506 - }, - { - "epoch": 1.1451367781155015, - "grad_norm": 1.7111759185791016, - "learning_rate": 3.643365702847272e-06, - "loss": 0.5016278624534607, - "mean_token_accuracy": 0.8196234703063965, - "num_tokens": 13434421.0, - "step": 1507 - }, - { - "epoch": 1.1458966565349544, - "grad_norm": 1.7528148889541626, - "learning_rate": 3.641502779953307e-06, - "loss": 0.5020896196365356, - "mean_token_accuracy": 0.826249361038208, - "num_tokens": 13445286.0, - "step": 1508 - }, - { - "epoch": 1.1466565349544073, - "grad_norm": 1.3470909595489502, - "learning_rate": 3.639639055908751e-06, - "loss": 0.45765724778175354, - "mean_token_accuracy": 0.8380560278892517, - "num_tokens": 13465030.0, - "step": 1509 - }, - { - "epoch": 1.1474164133738602, - "grad_norm": 2.4846835136413574, - "learning_rate": 3.6377745320216346e-06, - "loss": 0.46488267183303833, - "mean_token_accuracy": 0.8393925428390503, - "num_tokens": 13470883.0, - "step": 1510 - }, - { - "epoch": 1.1481762917933132, - "grad_norm": 1.770201563835144, - "learning_rate": 3.635909209600555e-06, - "loss": 0.5262179374694824, - "mean_token_accuracy": 0.8201162815093994, - "num_tokens": 13482558.0, - "step": 1511 - }, - { - "epoch": 1.148936170212766, - "grad_norm": 1.5955098867416382, - "learning_rate": 3.6340430899546656e-06, - "loss": 0.430621862411499, - "mean_token_accuracy": 0.8488553762435913, - "num_tokens": 13493003.0, - "step": 1512 - }, - { - "epoch": 1.1496960486322187, - "grad_norm": 2.846176862716675, - "learning_rate": 3.632176174393682e-06, - "loss": 0.23461638391017914, - "mean_token_accuracy": 0.9218817353248596, - "num_tokens": 13496566.0, - "step": 1513 - }, - { - "epoch": 1.1504559270516717, - "grad_norm": 1.9606610536575317, - "learning_rate": 3.630308464227877e-06, - "loss": 0.4940161108970642, - "mean_token_accuracy": 0.8474864959716797, - "num_tokens": 13504843.0, - "step": 1514 - }, - { - "epoch": 1.1512158054711246, - "grad_norm": 1.1588608026504517, - "learning_rate": 3.628439960768082e-06, - "loss": 0.32650992274284363, - "mean_token_accuracy": 0.8797246217727661, - "num_tokens": 13521513.0, - "step": 1515 - }, - { - "epoch": 1.1519756838905775, - "grad_norm": 1.3566495180130005, - "learning_rate": 3.6265706653256837e-06, - "loss": 0.4359064996242523, - "mean_token_accuracy": 0.8379859328269958, - "num_tokens": 13540608.0, - "step": 1516 - }, - { - "epoch": 1.1527355623100304, - "grad_norm": 1.4728609323501587, - "learning_rate": 3.624700579212626e-06, - "loss": 0.29939693212509155, - "mean_token_accuracy": 0.8831408023834229, - "num_tokens": 13550641.0, - "step": 1517 - }, - { - "epoch": 1.1534954407294833, - "grad_norm": 2.162325382232666, - "learning_rate": 3.6228297037414077e-06, - "loss": 0.4097636938095093, - "mean_token_accuracy": 0.8575425148010254, - "num_tokens": 13556931.0, - "step": 1518 - }, - { - "epoch": 1.1542553191489362, - "grad_norm": 1.754439353942871, - "learning_rate": 3.6209580402250816e-06, - "loss": 0.400202214717865, - "mean_token_accuracy": 0.8569821119308472, - "num_tokens": 13565491.0, - "step": 1519 - }, - { - "epoch": 1.155015197568389, - "grad_norm": 1.5250083208084106, - "learning_rate": 3.619085589977251e-06, - "loss": 0.43330419063568115, - "mean_token_accuracy": 0.8492985963821411, - "num_tokens": 13577147.0, - "step": 1520 - }, - { - "epoch": 1.155775075987842, - "grad_norm": 1.9108905792236328, - "learning_rate": 3.617212354312076e-06, - "loss": 0.30567464232444763, - "mean_token_accuracy": 0.8850164413452148, - "num_tokens": 13584366.0, - "step": 1521 - }, - { - "epoch": 1.156534954407295, - "grad_norm": 2.2574243545532227, - "learning_rate": 3.615338334544265e-06, - "loss": 0.4391738772392273, - "mean_token_accuracy": 0.839765727519989, - "num_tokens": 13591816.0, - "step": 1522 - }, - { - "epoch": 1.1572948328267478, - "grad_norm": 2.1235218048095703, - "learning_rate": 3.6134635319890763e-06, - "loss": 0.45043107867240906, - "mean_token_accuracy": 0.8385299444198608, - "num_tokens": 13599736.0, - "step": 1523 - }, - { - "epoch": 1.1580547112462005, - "grad_norm": 2.2274110317230225, - "learning_rate": 3.611587947962319e-06, - "loss": 0.3623226284980774, - "mean_token_accuracy": 0.8724044561386108, - "num_tokens": 13605354.0, - "step": 1524 - }, - { - "epoch": 1.1588145896656534, - "grad_norm": 3.414236545562744, - "learning_rate": 3.6097115837803504e-06, - "loss": 0.30060696601867676, - "mean_token_accuracy": 0.8971061706542969, - "num_tokens": 13608851.0, - "step": 1525 - }, - { - "epoch": 1.1595744680851063, - "grad_norm": 2.496264696121216, - "learning_rate": 3.6078344407600744e-06, - "loss": 0.3567180037498474, - "mean_token_accuracy": 0.8596180081367493, - "num_tokens": 13614339.0, - "step": 1526 - }, - { - "epoch": 1.1603343465045592, - "grad_norm": 2.0191843509674072, - "learning_rate": 3.6059565202189433e-06, - "loss": 0.43206095695495605, - "mean_token_accuracy": 0.8464000821113586, - "num_tokens": 13622395.0, - "step": 1527 - }, - { - "epoch": 1.1610942249240122, - "grad_norm": 1.5475906133651733, - "learning_rate": 3.604077823474954e-06, - "loss": 0.4535648226737976, - "mean_token_accuracy": 0.8391586542129517, - "num_tokens": 13635356.0, - "step": 1528 - }, - { - "epoch": 1.161854103343465, - "grad_norm": 2.1348211765289307, - "learning_rate": 3.6021983518466468e-06, - "loss": 0.2733963429927826, - "mean_token_accuracy": 0.9007417559623718, - "num_tokens": 13640641.0, - "step": 1529 - }, - { - "epoch": 1.162613981762918, - "grad_norm": 2.8452792167663574, - "learning_rate": 3.600318106653108e-06, - "loss": 0.29591235518455505, - "mean_token_accuracy": 0.8934413194656372, - "num_tokens": 13644995.0, - "step": 1530 - }, - { - "epoch": 1.1633738601823709, - "grad_norm": 2.342907190322876, - "learning_rate": 3.5984370892139663e-06, - "loss": 0.4675130248069763, - "mean_token_accuracy": 0.8352028131484985, - "num_tokens": 13652695.0, - "step": 1531 - }, - { - "epoch": 1.1641337386018238, - "grad_norm": 2.3480238914489746, - "learning_rate": 3.5965553008493924e-06, - "loss": 0.3114515542984009, - "mean_token_accuracy": 0.8845353126525879, - "num_tokens": 13658101.0, - "step": 1532 - }, - { - "epoch": 1.1648936170212765, - "grad_norm": 1.8608155250549316, - "learning_rate": 3.594672742880097e-06, - "loss": 0.3864145278930664, - "mean_token_accuracy": 0.867354154586792, - "num_tokens": 13666042.0, - "step": 1533 - }, - { - "epoch": 1.1656534954407296, - "grad_norm": 1.4756088256835938, - "learning_rate": 3.5927894166273324e-06, - "loss": 0.3671600818634033, - "mean_token_accuracy": 0.8695988655090332, - "num_tokens": 13678253.0, - "step": 1534 - }, - { - "epoch": 1.1664133738601823, - "grad_norm": 2.8831355571746826, - "learning_rate": 3.5909053234128893e-06, - "loss": 0.267184317111969, - "mean_token_accuracy": 0.9008115530014038, - "num_tokens": 13681790.0, - "step": 1535 - }, - { - "epoch": 1.1671732522796352, - "grad_norm": 2.1984763145446777, - "learning_rate": 3.5890204645590964e-06, - "loss": 0.4431505799293518, - "mean_token_accuracy": 0.8623673915863037, - "num_tokens": 13688444.0, - "step": 1536 - }, - { - "epoch": 1.167933130699088, - "grad_norm": 1.8271523714065552, - "learning_rate": 3.5871348413888207e-06, - "loss": 0.3861040771007538, - "mean_token_accuracy": 0.8624277114868164, - "num_tokens": 13696872.0, - "step": 1537 - }, - { - "epoch": 1.168693009118541, - "grad_norm": 1.6313756704330444, - "learning_rate": 3.585248455225466e-06, - "loss": 0.3775154948234558, - "mean_token_accuracy": 0.8624461889266968, - "num_tokens": 13706167.0, - "step": 1538 - }, - { - "epoch": 1.169452887537994, - "grad_norm": 2.4377901554107666, - "learning_rate": 3.5833613073929684e-06, - "loss": 0.2308957427740097, - "mean_token_accuracy": 0.920600175857544, - "num_tokens": 13710367.0, - "step": 1539 - }, - { - "epoch": 1.1702127659574468, - "grad_norm": 2.2621750831604004, - "learning_rate": 3.5814733992158025e-06, - "loss": 0.33167219161987305, - "mean_token_accuracy": 0.8963261842727661, - "num_tokens": 13716384.0, - "step": 1540 - }, - { - "epoch": 1.1709726443768997, - "grad_norm": 1.3178150653839111, - "learning_rate": 3.579584732018975e-06, - "loss": 0.3276631832122803, - "mean_token_accuracy": 0.8853521347045898, - "num_tokens": 13731031.0, - "step": 1541 - }, - { - "epoch": 1.1717325227963526, - "grad_norm": 2.177750587463379, - "learning_rate": 3.577695307128024e-06, - "loss": 0.48177266120910645, - "mean_token_accuracy": 0.830329418182373, - "num_tokens": 13737925.0, - "step": 1542 - }, - { - "epoch": 1.1724924012158056, - "grad_norm": 2.2268829345703125, - "learning_rate": 3.5758051258690223e-06, - "loss": 0.48843517899513245, - "mean_token_accuracy": 0.8310644030570984, - "num_tokens": 13746039.0, - "step": 1543 - }, - { - "epoch": 1.1732522796352582, - "grad_norm": 1.498701572418213, - "learning_rate": 3.5739141895685708e-06, - "loss": 0.4542962312698364, - "mean_token_accuracy": 0.8500330448150635, - "num_tokens": 13765002.0, - "step": 1544 - }, - { - "epoch": 1.1740121580547112, - "grad_norm": 1.786670446395874, - "learning_rate": 3.5720224995538023e-06, - "loss": 0.27367928624153137, - "mean_token_accuracy": 0.8916142582893372, - "num_tokens": 13774113.0, - "step": 1545 - }, - { - "epoch": 1.174772036474164, - "grad_norm": 2.0311272144317627, - "learning_rate": 3.5701300571523757e-06, - "loss": 0.559987485408783, - "mean_token_accuracy": 0.8266973495483398, - "num_tokens": 13783912.0, - "step": 1546 - }, - { - "epoch": 1.175531914893617, - "grad_norm": 1.8732186555862427, - "learning_rate": 3.5682368636924825e-06, - "loss": 0.5184751152992249, - "mean_token_accuracy": 0.8450918197631836, - "num_tokens": 13792728.0, - "step": 1547 - }, - { - "epoch": 1.1762917933130699, - "grad_norm": 1.4410661458969116, - "learning_rate": 3.566342920502837e-06, - "loss": 0.383536696434021, - "mean_token_accuracy": 0.8672217726707458, - "num_tokens": 13813590.0, - "step": 1548 - }, - { - "epoch": 1.1770516717325228, - "grad_norm": 3.06056547164917, - "learning_rate": 3.564448228912682e-06, - "loss": 0.3941686153411865, - "mean_token_accuracy": 0.8696402311325073, - "num_tokens": 13817704.0, - "step": 1549 - }, - { - "epoch": 1.1778115501519757, - "grad_norm": 1.6150329113006592, - "learning_rate": 3.562552790251785e-06, - "loss": 0.41606605052948, - "mean_token_accuracy": 0.8488572835922241, - "num_tokens": 13831303.0, - "step": 1550 - }, - { - "epoch": 1.1785714285714286, - "grad_norm": 2.1199934482574463, - "learning_rate": 3.5606566058504377e-06, - "loss": 0.3974752426147461, - "mean_token_accuracy": 0.8686345219612122, - "num_tokens": 13837613.0, - "step": 1551 - }, - { - "epoch": 1.1793313069908815, - "grad_norm": 1.5683876276016235, - "learning_rate": 3.558759677039455e-06, - "loss": 0.35225993394851685, - "mean_token_accuracy": 0.8710784316062927, - "num_tokens": 13846779.0, - "step": 1552 - }, - { - "epoch": 1.1800911854103344, - "grad_norm": 1.4644675254821777, - "learning_rate": 3.5568620051501755e-06, - "loss": 0.38400042057037354, - "mean_token_accuracy": 0.8548328876495361, - "num_tokens": 13860713.0, - "step": 1553 - }, - { - "epoch": 1.1808510638297873, - "grad_norm": 1.461491346359253, - "learning_rate": 3.5549635915144578e-06, - "loss": 0.4572640061378479, - "mean_token_accuracy": 0.8506045937538147, - "num_tokens": 13877289.0, - "step": 1554 - }, - { - "epoch": 1.18161094224924, - "grad_norm": 2.6364715099334717, - "learning_rate": 3.553064437464682e-06, - "loss": 0.3954341411590576, - "mean_token_accuracy": 0.8561649322509766, - "num_tokens": 13882064.0, - "step": 1555 - }, - { - "epoch": 1.182370820668693, - "grad_norm": 2.027273654937744, - "learning_rate": 3.551164544333745e-06, - "loss": 0.47625732421875, - "mean_token_accuracy": 0.8349384069442749, - "num_tokens": 13890306.0, - "step": 1556 - }, - { - "epoch": 1.1831306990881458, - "grad_norm": 2.8427743911743164, - "learning_rate": 3.549263913455069e-06, - "loss": 0.4273033142089844, - "mean_token_accuracy": 0.8541387319564819, - "num_tokens": 13894882.0, - "step": 1557 - }, - { - "epoch": 1.1838905775075987, - "grad_norm": 1.6298975944519043, - "learning_rate": 3.5473625461625884e-06, - "loss": 0.4378639757633209, - "mean_token_accuracy": 0.8634963631629944, - "num_tokens": 13906152.0, - "step": 1558 - }, - { - "epoch": 1.1846504559270516, - "grad_norm": 2.4098947048187256, - "learning_rate": 3.5454604437907535e-06, - "loss": 0.47236716747283936, - "mean_token_accuracy": 0.8646864891052246, - "num_tokens": 13911803.0, - "step": 1559 - }, - { - "epoch": 1.1854103343465046, - "grad_norm": 1.5972497463226318, - "learning_rate": 3.543557607674537e-06, - "loss": 0.3001407980918884, - "mean_token_accuracy": 0.8927055597305298, - "num_tokens": 13921304.0, - "step": 1560 - }, - { - "epoch": 1.1861702127659575, - "grad_norm": 2.1140005588531494, - "learning_rate": 3.54165403914942e-06, - "loss": 0.41898271441459656, - "mean_token_accuracy": 0.8542245626449585, - "num_tokens": 13929434.0, - "step": 1561 - }, - { - "epoch": 1.1869300911854104, - "grad_norm": 1.8733803033828735, - "learning_rate": 3.539749739551401e-06, - "loss": 0.35469961166381836, - "mean_token_accuracy": 0.8805290460586548, - "num_tokens": 13937781.0, - "step": 1562 - }, - { - "epoch": 1.1876899696048633, - "grad_norm": 2.2805802822113037, - "learning_rate": 3.53784471021699e-06, - "loss": 0.44496792554855347, - "mean_token_accuracy": 0.8454172611236572, - "num_tokens": 13944394.0, - "step": 1563 - }, - { - "epoch": 1.1884498480243162, - "grad_norm": 0.9728449583053589, - "learning_rate": 3.535938952483211e-06, - "loss": 0.3156968355178833, - "mean_token_accuracy": 0.8739837408065796, - "num_tokens": 13966712.0, - "step": 1564 - }, - { - "epoch": 1.189209726443769, - "grad_norm": 3.025338888168335, - "learning_rate": 3.534032467687597e-06, - "loss": 0.30036938190460205, - "mean_token_accuracy": 0.9058252573013306, - "num_tokens": 13970183.0, - "step": 1565 - }, - { - "epoch": 1.1899696048632218, - "grad_norm": 2.0659425258636475, - "learning_rate": 3.532125257168193e-06, - "loss": 0.30619731545448303, - "mean_token_accuracy": 0.9041587710380554, - "num_tokens": 13976657.0, - "step": 1566 - }, - { - "epoch": 1.1907294832826747, - "grad_norm": 3.2036776542663574, - "learning_rate": 3.5302173222635526e-06, - "loss": 0.4145944118499756, - "mean_token_accuracy": 0.8502328395843506, - "num_tokens": 13981198.0, - "step": 1567 - }, - { - "epoch": 1.1914893617021276, - "grad_norm": 1.7767539024353027, - "learning_rate": 3.5283086643127396e-06, - "loss": 0.437128484249115, - "mean_token_accuracy": 0.8965631723403931, - "num_tokens": 13990259.0, - "step": 1568 - }, - { - "epoch": 1.1922492401215805, - "grad_norm": 1.7777384519577026, - "learning_rate": 3.5263992846553203e-06, - "loss": 0.33831220865249634, - "mean_token_accuracy": 0.8734279870986938, - "num_tokens": 13999363.0, - "step": 1569 - }, - { - "epoch": 1.1930091185410334, - "grad_norm": 1.6710708141326904, - "learning_rate": 3.5244891846313733e-06, - "loss": 0.4005590081214905, - "mean_token_accuracy": 0.8820298314094543, - "num_tokens": 14008719.0, - "step": 1570 - }, - { - "epoch": 1.1937689969604863, - "grad_norm": 1.0378777980804443, - "learning_rate": 3.5225783655814798e-06, - "loss": 0.3174915313720703, - "mean_token_accuracy": 0.8894162774085999, - "num_tokens": 14025806.0, - "step": 1571 - }, - { - "epoch": 1.1945288753799392, - "grad_norm": 1.2647521495819092, - "learning_rate": 3.520666828846726e-06, - "loss": 0.4173050820827484, - "mean_token_accuracy": 0.8437265157699585, - "num_tokens": 14046445.0, - "step": 1572 - }, - { - "epoch": 1.1952887537993921, - "grad_norm": 2.8625528812408447, - "learning_rate": 3.518754575768702e-06, - "loss": 0.37182557582855225, - "mean_token_accuracy": 0.8660947680473328, - "num_tokens": 14051197.0, - "step": 1573 - }, - { - "epoch": 1.196048632218845, - "grad_norm": 1.1213171482086182, - "learning_rate": 3.516841607689501e-06, - "loss": 0.332731157541275, - "mean_token_accuracy": 0.8573278784751892, - "num_tokens": 14070817.0, - "step": 1574 - }, - { - "epoch": 1.196808510638298, - "grad_norm": 1.197508692741394, - "learning_rate": 3.5149279259517165e-06, - "loss": 0.34058472514152527, - "mean_token_accuracy": 0.8603571653366089, - "num_tokens": 14085301.0, - "step": 1575 - }, - { - "epoch": 1.1975683890577509, - "grad_norm": 4.019949913024902, - "learning_rate": 3.5130135318984454e-06, - "loss": 0.3094622492790222, - "mean_token_accuracy": 0.8905094861984253, - "num_tokens": 14088107.0, - "step": 1576 - }, - { - "epoch": 1.1983282674772036, - "grad_norm": 2.591181755065918, - "learning_rate": 3.5110984268732827e-06, - "loss": 0.3407078981399536, - "mean_token_accuracy": 0.880385160446167, - "num_tokens": 14092887.0, - "step": 1577 - }, - { - "epoch": 1.1990881458966565, - "grad_norm": 1.3069331645965576, - "learning_rate": 3.509182612220322e-06, - "loss": 0.3761988878250122, - "mean_token_accuracy": 0.862013041973114, - "num_tokens": 14109216.0, - "step": 1578 - }, - { - "epoch": 1.1998480243161094, - "grad_norm": 1.7802022695541382, - "learning_rate": 3.507266089284157e-06, - "loss": 0.3824652135372162, - "mean_token_accuracy": 0.8707721829414368, - "num_tokens": 14119645.0, - "step": 1579 - }, - { - "epoch": 1.2006079027355623, - "grad_norm": 2.7937185764312744, - "learning_rate": 3.5053488594098763e-06, - "loss": 0.33828890323638916, - "mean_token_accuracy": 0.8765541315078735, - "num_tokens": 14124628.0, - "step": 1580 - }, - { - "epoch": 1.2013677811550152, - "grad_norm": 1.892671823501587, - "learning_rate": 3.5034309239430664e-06, - "loss": 0.3476094603538513, - "mean_token_accuracy": 0.9053795337677002, - "num_tokens": 14131756.0, - "step": 1581 - }, - { - "epoch": 1.202127659574468, - "grad_norm": 1.6857695579528809, - "learning_rate": 3.501512284229807e-06, - "loss": 0.5397108793258667, - "mean_token_accuracy": 0.8173421025276184, - "num_tokens": 14143024.0, - "step": 1582 - }, - { - "epoch": 1.202887537993921, - "grad_norm": 2.501737117767334, - "learning_rate": 3.4995929416166756e-06, - "loss": 0.4192458391189575, - "mean_token_accuracy": 0.8558136224746704, - "num_tokens": 14149499.0, - "step": 1583 - }, - { - "epoch": 1.203647416413374, - "grad_norm": 2.0133907794952393, - "learning_rate": 3.4976728974507387e-06, - "loss": 0.4791576564311981, - "mean_token_accuracy": 0.8253597021102905, - "num_tokens": 14158381.0, - "step": 1584 - }, - { - "epoch": 1.2044072948328268, - "grad_norm": 2.984611988067627, - "learning_rate": 3.4957521530795576e-06, - "loss": 0.3040750026702881, - "mean_token_accuracy": 0.8902391791343689, - "num_tokens": 14162419.0, - "step": 1585 - }, - { - "epoch": 1.2051671732522795, - "grad_norm": 1.518591284751892, - "learning_rate": 3.493830709851185e-06, - "loss": 0.35539618134498596, - "mean_token_accuracy": 0.8737183809280396, - "num_tokens": 14173048.0, - "step": 1586 - }, - { - "epoch": 1.2059270516717326, - "grad_norm": 2.628758192062378, - "learning_rate": 3.4919085691141636e-06, - "loss": 0.33340200781822205, - "mean_token_accuracy": 0.8705098628997803, - "num_tokens": 14178255.0, - "step": 1587 - }, - { - "epoch": 1.2066869300911853, - "grad_norm": 2.5565974712371826, - "learning_rate": 3.4899857322175252e-06, - "loss": 0.44939476251602173, - "mean_token_accuracy": 0.8315504193305969, - "num_tokens": 14183808.0, - "step": 1588 - }, - { - "epoch": 1.2074468085106382, - "grad_norm": 1.7521045207977295, - "learning_rate": 3.4880622005107916e-06, - "loss": 0.3168621063232422, - "mean_token_accuracy": 0.8824669122695923, - "num_tokens": 14192186.0, - "step": 1589 - }, - { - "epoch": 1.2082066869300911, - "grad_norm": 1.9816104173660278, - "learning_rate": 3.486137975343971e-06, - "loss": 0.3892582058906555, - "mean_token_accuracy": 0.8524188995361328, - "num_tokens": 14200512.0, - "step": 1590 - }, - { - "epoch": 1.208966565349544, - "grad_norm": 1.459800124168396, - "learning_rate": 3.484213058067559e-06, - "loss": 0.45930033922195435, - "mean_token_accuracy": 0.8408471345901489, - "num_tokens": 14215232.0, - "step": 1591 - }, - { - "epoch": 1.209726443768997, - "grad_norm": 2.015493154525757, - "learning_rate": 3.482287450032536e-06, - "loss": 0.5514016151428223, - "mean_token_accuracy": 0.8456779718399048, - "num_tokens": 14225402.0, - "step": 1592 - }, - { - "epoch": 1.2104863221884499, - "grad_norm": 3.4511911869049072, - "learning_rate": 3.4803611525903687e-06, - "loss": 0.4772771894931793, - "mean_token_accuracy": 0.8558698892593384, - "num_tokens": 14229038.0, - "step": 1593 - }, - { - "epoch": 1.2112462006079028, - "grad_norm": 2.2247982025146484, - "learning_rate": 3.4784341670930067e-06, - "loss": 0.4042825996875763, - "mean_token_accuracy": 0.8635870218276978, - "num_tokens": 14237057.0, - "step": 1594 - }, - { - "epoch": 1.2120060790273557, - "grad_norm": 2.0534820556640625, - "learning_rate": 3.4765064948928813e-06, - "loss": 0.34057414531707764, - "mean_token_accuracy": 0.8800770044326782, - "num_tokens": 14243013.0, - "step": 1595 - }, - { - "epoch": 1.2127659574468086, - "grad_norm": 2.594703197479248, - "learning_rate": 3.474578137342909e-06, - "loss": 0.4997410774230957, - "mean_token_accuracy": 0.8302106261253357, - "num_tokens": 14251210.0, - "step": 1596 - }, - { - "epoch": 1.2135258358662613, - "grad_norm": 2.517833948135376, - "learning_rate": 3.4726490957964836e-06, - "loss": 0.3630390465259552, - "mean_token_accuracy": 0.8679884672164917, - "num_tokens": 14255893.0, - "step": 1597 - }, - { - "epoch": 1.2142857142857142, - "grad_norm": 1.5177065134048462, - "learning_rate": 3.4707193716074816e-06, - "loss": 0.36218544840812683, - "mean_token_accuracy": 0.879178524017334, - "num_tokens": 14268143.0, - "step": 1598 - }, - { - "epoch": 1.215045592705167, - "grad_norm": 2.215291738510132, - "learning_rate": 3.4687889661302577e-06, - "loss": 0.4166645407676697, - "mean_token_accuracy": 0.8495793342590332, - "num_tokens": 14276794.0, - "step": 1599 - }, - { - "epoch": 1.21580547112462, - "grad_norm": 1.534294843673706, - "learning_rate": 3.466857880719645e-06, - "loss": 0.2635883092880249, - "mean_token_accuracy": 0.8971712589263916, - "num_tokens": 14287000.0, - "step": 1600 - }, - { - "epoch": 1.216565349544073, - "grad_norm": 1.2338658571243286, - "learning_rate": 3.464926116730953e-06, - "loss": 0.339110404253006, - "mean_token_accuracy": 0.895592987537384, - "num_tokens": 14303217.0, - "step": 1601 - }, - { - "epoch": 1.2173252279635258, - "grad_norm": 1.8717178106307983, - "learning_rate": 3.462993675519968e-06, - "loss": 0.41204726696014404, - "mean_token_accuracy": 0.8560728430747986, - "num_tokens": 14311372.0, - "step": 1602 - }, - { - "epoch": 1.2180851063829787, - "grad_norm": 2.844160795211792, - "learning_rate": 3.4610605584429526e-06, - "loss": 0.4129520058631897, - "mean_token_accuracy": 0.8555002212524414, - "num_tokens": 14316244.0, - "step": 1603 - }, - { - "epoch": 1.2188449848024316, - "grad_norm": 1.099926471710205, - "learning_rate": 3.4591267668566412e-06, - "loss": 0.35783132910728455, - "mean_token_accuracy": 0.8693175315856934, - "num_tokens": 14338414.0, - "step": 1604 - }, - { - "epoch": 1.2196048632218845, - "grad_norm": 1.6448384523391724, - "learning_rate": 3.457192302118244e-06, - "loss": 0.42060258984565735, - "mean_token_accuracy": 0.8557323217391968, - "num_tokens": 14349143.0, - "step": 1605 - }, - { - "epoch": 1.2203647416413375, - "grad_norm": 2.097529888153076, - "learning_rate": 3.455257165585444e-06, - "loss": 0.5227499008178711, - "mean_token_accuracy": 0.828961968421936, - "num_tokens": 14360032.0, - "step": 1606 - }, - { - "epoch": 1.2211246200607904, - "grad_norm": 1.602988600730896, - "learning_rate": 3.453321358616393e-06, - "loss": 0.3537187874317169, - "mean_token_accuracy": 0.8776708841323853, - "num_tokens": 14370005.0, - "step": 1607 - }, - { - "epoch": 1.221884498480243, - "grad_norm": 2.358971357345581, - "learning_rate": 3.4513848825697145e-06, - "loss": 0.3448919653892517, - "mean_token_accuracy": 0.8887944221496582, - "num_tokens": 14375718.0, - "step": 1608 - }, - { - "epoch": 1.222644376899696, - "grad_norm": 1.72306227684021, - "learning_rate": 3.4494477388045035e-06, - "loss": 0.36985084414482117, - "mean_token_accuracy": 0.859595537185669, - "num_tokens": 14385016.0, - "step": 1609 - }, - { - "epoch": 1.2234042553191489, - "grad_norm": 1.5494085550308228, - "learning_rate": 3.4475099286803204e-06, - "loss": 0.49003708362579346, - "mean_token_accuracy": 0.8701964616775513, - "num_tokens": 14399277.0, - "step": 1610 - }, - { - "epoch": 1.2241641337386018, - "grad_norm": 2.6874046325683594, - "learning_rate": 3.445571453557196e-06, - "loss": 0.3424490690231323, - "mean_token_accuracy": 0.8835943937301636, - "num_tokens": 14404182.0, - "step": 1611 - }, - { - "epoch": 1.2249240121580547, - "grad_norm": 2.2163190841674805, - "learning_rate": 3.443632314795627e-06, - "loss": 0.40944457054138184, - "mean_token_accuracy": 0.8649888038635254, - "num_tokens": 14410158.0, - "step": 1612 - }, - { - "epoch": 1.2256838905775076, - "grad_norm": 2.7961158752441406, - "learning_rate": 3.4416925137565756e-06, - "loss": 0.17890746891498566, - "mean_token_accuracy": 0.9439430832862854, - "num_tokens": 14413285.0, - "step": 1613 - }, - { - "epoch": 1.2264437689969605, - "grad_norm": 1.421451210975647, - "learning_rate": 3.439752051801467e-06, - "loss": 0.33948683738708496, - "mean_token_accuracy": 0.8754585981369019, - "num_tokens": 14424674.0, - "step": 1614 - }, - { - "epoch": 1.2272036474164134, - "grad_norm": 2.105196237564087, - "learning_rate": 3.4378109302921946e-06, - "loss": 0.40009379386901855, - "mean_token_accuracy": 0.8600341081619263, - "num_tokens": 14432400.0, - "step": 1615 - }, - { - "epoch": 1.2279635258358663, - "grad_norm": 2.004122734069824, - "learning_rate": 3.4358691505911105e-06, - "loss": 0.46013444662094116, - "mean_token_accuracy": 0.8400925993919373, - "num_tokens": 14440741.0, - "step": 1616 - }, - { - "epoch": 1.2287234042553192, - "grad_norm": 1.8407535552978516, - "learning_rate": 3.4339267140610317e-06, - "loss": 0.38828906416893005, - "mean_token_accuracy": 0.8582802414894104, - "num_tokens": 14448698.0, - "step": 1617 - }, - { - "epoch": 1.2294832826747721, - "grad_norm": 2.4285924434661865, - "learning_rate": 3.4319836220652334e-06, - "loss": 0.3109283447265625, - "mean_token_accuracy": 0.8888344764709473, - "num_tokens": 14453674.0, - "step": 1618 - }, - { - "epoch": 1.2302431610942248, - "grad_norm": 1.6322550773620605, - "learning_rate": 3.430039875967454e-06, - "loss": 0.5222204327583313, - "mean_token_accuracy": 0.825019121170044, - "num_tokens": 14465736.0, - "step": 1619 - }, - { - "epoch": 1.2310030395136777, - "grad_norm": 2.307573080062866, - "learning_rate": 3.428095477131888e-06, - "loss": 0.29477375745773315, - "mean_token_accuracy": 0.8899064660072327, - "num_tokens": 14471266.0, - "step": 1620 - }, - { - "epoch": 1.2317629179331306, - "grad_norm": 1.8044531345367432, - "learning_rate": 3.4261504269231904e-06, - "loss": 0.4883342981338501, - "mean_token_accuracy": 0.8310165405273438, - "num_tokens": 14481679.0, - "step": 1621 - }, - { - "epoch": 1.2325227963525835, - "grad_norm": 2.7585411071777344, - "learning_rate": 3.4242047267064714e-06, - "loss": 0.45369645953178406, - "mean_token_accuracy": 0.8432134985923767, - "num_tokens": 14487299.0, - "step": 1622 - }, - { - "epoch": 1.2332826747720365, - "grad_norm": 2.687490701675415, - "learning_rate": 3.4222583778472997e-06, - "loss": 0.5627540349960327, - "mean_token_accuracy": 0.8186438083648682, - "num_tokens": 14494254.0, - "step": 1623 - }, - { - "epoch": 1.2340425531914894, - "grad_norm": 2.622443199157715, - "learning_rate": 3.4203113817116955e-06, - "loss": 0.28697147965431213, - "mean_token_accuracy": 0.8861737847328186, - "num_tokens": 14498632.0, - "step": 1624 - }, - { - "epoch": 1.2348024316109423, - "grad_norm": 2.6943359375, - "learning_rate": 3.4183637396661372e-06, - "loss": 0.25273287296295166, - "mean_token_accuracy": 0.9104914665222168, - "num_tokens": 14502797.0, - "step": 1625 - }, - { - "epoch": 1.2355623100303952, - "grad_norm": 2.428189992904663, - "learning_rate": 3.4164154530775552e-06, - "loss": 0.4213451147079468, - "mean_token_accuracy": 0.851524293422699, - "num_tokens": 14508503.0, - "step": 1626 - }, - { - "epoch": 1.236322188449848, - "grad_norm": 2.1722824573516846, - "learning_rate": 3.4144665233133318e-06, - "loss": 0.35238856077194214, - "mean_token_accuracy": 0.8730837106704712, - "num_tokens": 14516126.0, - "step": 1627 - }, - { - "epoch": 1.237082066869301, - "grad_norm": 2.291365146636963, - "learning_rate": 3.4125169517413005e-06, - "loss": 0.43963465094566345, - "mean_token_accuracy": 0.8525444865226746, - "num_tokens": 14522507.0, - "step": 1628 - }, - { - "epoch": 1.237841945288754, - "grad_norm": 1.6181648969650269, - "learning_rate": 3.410566739729746e-06, - "loss": 0.2799680233001709, - "mean_token_accuracy": 0.8915654420852661, - "num_tokens": 14531025.0, - "step": 1629 - }, - { - "epoch": 1.2386018237082066, - "grad_norm": 1.4039218425750732, - "learning_rate": 3.408615888647402e-06, - "loss": 0.29756587743759155, - "mean_token_accuracy": 0.8951715230941772, - "num_tokens": 14543770.0, - "step": 1630 - }, - { - "epoch": 1.2393617021276595, - "grad_norm": 2.148325204849243, - "learning_rate": 3.4066643998634506e-06, - "loss": 0.3983418345451355, - "mean_token_accuracy": 0.8635951280593872, - "num_tokens": 14550896.0, - "step": 1631 - }, - { - "epoch": 1.2401215805471124, - "grad_norm": 1.5225859880447388, - "learning_rate": 3.4047122747475227e-06, - "loss": 0.3247569799423218, - "mean_token_accuracy": 0.8727027177810669, - "num_tokens": 14562181.0, - "step": 1632 - }, - { - "epoch": 1.2408814589665653, - "grad_norm": 3.99835467338562, - "learning_rate": 3.402759514669694e-06, - "loss": 0.4317352771759033, - "mean_token_accuracy": 0.8488142490386963, - "num_tokens": 14565521.0, - "step": 1633 - }, - { - "epoch": 1.2416413373860182, - "grad_norm": 1.7306902408599854, - "learning_rate": 3.4008061210004872e-06, - "loss": 0.389854371547699, - "mean_token_accuracy": 0.8553084135055542, - "num_tokens": 14574633.0, - "step": 1634 - }, - { - "epoch": 1.2424012158054711, - "grad_norm": 2.3614673614501953, - "learning_rate": 3.3988520951108683e-06, - "loss": 0.3150152564048767, - "mean_token_accuracy": 0.8865959644317627, - "num_tokens": 14580240.0, - "step": 1635 - }, - { - "epoch": 1.243161094224924, - "grad_norm": 1.5625747442245483, - "learning_rate": 3.3968974383722497e-06, - "loss": 0.43160033226013184, - "mean_token_accuracy": 0.840155839920044, - "num_tokens": 14594255.0, - "step": 1636 - }, - { - "epoch": 1.243920972644377, - "grad_norm": 1.871620535850525, - "learning_rate": 3.3949421521564825e-06, - "loss": 0.49550193548202515, - "mean_token_accuracy": 0.8315126299858093, - "num_tokens": 14605416.0, - "step": 1637 - }, - { - "epoch": 1.2446808510638299, - "grad_norm": 2.111304759979248, - "learning_rate": 3.392986237835863e-06, - "loss": 0.2794899046421051, - "mean_token_accuracy": 0.9049773216247559, - "num_tokens": 14611711.0, - "step": 1638 - }, - { - "epoch": 1.2454407294832828, - "grad_norm": 3.7479894161224365, - "learning_rate": 3.391029696783127e-06, - "loss": 0.469397634267807, - "mean_token_accuracy": 0.8352956771850586, - "num_tokens": 14615536.0, - "step": 1639 - }, - { - "epoch": 1.2462006079027357, - "grad_norm": 3.277726650238037, - "learning_rate": 3.389072530371451e-06, - "loss": 0.35431790351867676, - "mean_token_accuracy": 0.8822286128997803, - "num_tokens": 14619390.0, - "step": 1640 - }, - { - "epoch": 1.2469604863221884, - "grad_norm": 1.9583072662353516, - "learning_rate": 3.3871147399744482e-06, - "loss": 0.3708694577217102, - "mean_token_accuracy": 0.8720351457595825, - "num_tokens": 14626573.0, - "step": 1641 - }, - { - "epoch": 1.2477203647416413, - "grad_norm": 1.8734042644500732, - "learning_rate": 3.385156326966173e-06, - "loss": 0.48163774609565735, - "mean_token_accuracy": 0.8479621410369873, - "num_tokens": 14636382.0, - "step": 1642 - }, - { - "epoch": 1.2484802431610942, - "grad_norm": 2.0085532665252686, - "learning_rate": 3.383197292721114e-06, - "loss": 0.4893198311328888, - "mean_token_accuracy": 0.838238000869751, - "num_tokens": 14645083.0, - "step": 1643 - }, - { - "epoch": 1.249240121580547, - "grad_norm": 2.0874593257904053, - "learning_rate": 3.3812376386141966e-06, - "loss": 0.4610505700111389, - "mean_token_accuracy": 0.8441368341445923, - "num_tokens": 14654048.0, - "step": 1644 - }, - { - "epoch": 1.25, - "grad_norm": 1.6887420415878296, - "learning_rate": 3.379277366020782e-06, - "loss": 0.3628596067428589, - "mean_token_accuracy": 0.8838590383529663, - "num_tokens": 14662317.0, - "step": 1645 - }, - { - "epoch": 1.250759878419453, - "grad_norm": 2.389002561569214, - "learning_rate": 3.3773164763166653e-06, - "loss": 0.21903495490550995, - "mean_token_accuracy": 0.9249413013458252, - "num_tokens": 14666394.0, - "step": 1646 - }, - { - "epoch": 1.2515197568389058, - "grad_norm": 1.7091087102890015, - "learning_rate": 3.3753549708780736e-06, - "loss": 0.37802332639694214, - "mean_token_accuracy": 0.8644627332687378, - "num_tokens": 14676214.0, - "step": 1647 - }, - { - "epoch": 1.2522796352583587, - "grad_norm": 2.5717999935150146, - "learning_rate": 3.3733928510816677e-06, - "loss": 0.4236462116241455, - "mean_token_accuracy": 0.8519910573959351, - "num_tokens": 14681681.0, - "step": 1648 - }, - { - "epoch": 1.2530395136778116, - "grad_norm": 1.958856463432312, - "learning_rate": 3.3714301183045382e-06, - "loss": 0.3923419415950775, - "mean_token_accuracy": 0.8720202445983887, - "num_tokens": 14690419.0, - "step": 1649 - }, - { - "epoch": 1.2537993920972643, - "grad_norm": 1.5900038480758667, - "learning_rate": 3.369466773924207e-06, - "loss": 0.4182325601577759, - "mean_token_accuracy": 0.8515387177467346, - "num_tokens": 14699790.0, - "step": 1650 - }, - { - "epoch": 1.2545592705167175, - "grad_norm": 1.260547161102295, - "learning_rate": 3.3675028193186243e-06, - "loss": 0.3915718197822571, - "mean_token_accuracy": 0.8536830544471741, - "num_tokens": 14717502.0, - "step": 1651 - }, - { - "epoch": 1.2553191489361701, - "grad_norm": 1.8152283430099487, - "learning_rate": 3.365538255866169e-06, - "loss": 0.424524188041687, - "mean_token_accuracy": 0.8434420824050903, - "num_tokens": 14726591.0, - "step": 1652 - }, - { - "epoch": 1.256079027355623, - "grad_norm": 1.3357285261154175, - "learning_rate": 3.3635730849456484e-06, - "loss": 0.2949739396572113, - "mean_token_accuracy": 0.8868321180343628, - "num_tokens": 14739911.0, - "step": 1653 - }, - { - "epoch": 1.256838905775076, - "grad_norm": 1.1770358085632324, - "learning_rate": 3.3616073079362925e-06, - "loss": 0.29939576983451843, - "mean_token_accuracy": 0.8923654556274414, - "num_tokens": 14755521.0, - "step": 1654 - }, - { - "epoch": 1.2575987841945289, - "grad_norm": 2.059162139892578, - "learning_rate": 3.3596409262177633e-06, - "loss": 0.4562555253505707, - "mean_token_accuracy": 0.8585271239280701, - "num_tokens": 14764173.0, - "step": 1655 - }, - { - "epoch": 1.2583586626139818, - "grad_norm": 1.430752158164978, - "learning_rate": 3.357673941170139e-06, - "loss": 0.35301265120506287, - "mean_token_accuracy": 0.8920517563819885, - "num_tokens": 14775596.0, - "step": 1656 - }, - { - "epoch": 1.2591185410334347, - "grad_norm": 1.6066302061080933, - "learning_rate": 3.3557063541739283e-06, - "loss": 0.41129636764526367, - "mean_token_accuracy": 0.8512256145477295, - "num_tokens": 14786289.0, - "step": 1657 - }, - { - "epoch": 1.2598784194528876, - "grad_norm": 1.5471590757369995, - "learning_rate": 3.353738166610058e-06, - "loss": 0.3935067057609558, - "mean_token_accuracy": 0.8514131903648376, - "num_tokens": 14798672.0, - "step": 1658 - }, - { - "epoch": 1.2606382978723405, - "grad_norm": 1.3455181121826172, - "learning_rate": 3.35176937985988e-06, - "loss": 0.3486790657043457, - "mean_token_accuracy": 0.8644362688064575, - "num_tokens": 14811603.0, - "step": 1659 - }, - { - "epoch": 1.2613981762917934, - "grad_norm": 1.891432762145996, - "learning_rate": 3.349799995305162e-06, - "loss": 0.3325638175010681, - "mean_token_accuracy": 0.8844645023345947, - "num_tokens": 14819256.0, - "step": 1660 - }, - { - "epoch": 1.262158054711246, - "grad_norm": 2.600614309310913, - "learning_rate": 3.3478300143280946e-06, - "loss": 0.30310919880867004, - "mean_token_accuracy": 0.9103429317474365, - "num_tokens": 14823706.0, - "step": 1661 - }, - { - "epoch": 1.2629179331306992, - "grad_norm": 3.8636202812194824, - "learning_rate": 3.3458594383112868e-06, - "loss": 0.28377676010131836, - "mean_token_accuracy": 0.9047091007232666, - "num_tokens": 14826688.0, - "step": 1662 - }, - { - "epoch": 1.263677811550152, - "grad_norm": 2.3100268840789795, - "learning_rate": 3.343888268637765e-06, - "loss": 0.4723394513130188, - "mean_token_accuracy": 0.8306777477264404, - "num_tokens": 14835471.0, - "step": 1663 - }, - { - "epoch": 1.2644376899696048, - "grad_norm": 1.7582160234451294, - "learning_rate": 3.341916506690971e-06, - "loss": 0.48168784379959106, - "mean_token_accuracy": 0.8281306028366089, - "num_tokens": 14846513.0, - "step": 1664 - }, - { - "epoch": 1.2651975683890577, - "grad_norm": 2.166055917739868, - "learning_rate": 3.3399441538547638e-06, - "loss": 0.4626024067401886, - "mean_token_accuracy": 0.8377980589866638, - "num_tokens": 14853408.0, - "step": 1665 - }, - { - "epoch": 1.2659574468085106, - "grad_norm": 2.23038911819458, - "learning_rate": 3.337971211513417e-06, - "loss": 0.38434159755706787, - "mean_token_accuracy": 0.8708412647247314, - "num_tokens": 14859919.0, - "step": 1666 - }, - { - "epoch": 1.2667173252279635, - "grad_norm": 2.092505693435669, - "learning_rate": 3.3359976810516164e-06, - "loss": 0.35072219371795654, - "mean_token_accuracy": 0.8761640191078186, - "num_tokens": 14865624.0, - "step": 1667 - }, - { - "epoch": 1.2674772036474165, - "grad_norm": 1.8255130052566528, - "learning_rate": 3.3340235638544633e-06, - "loss": 0.4404270648956299, - "mean_token_accuracy": 0.836356520652771, - "num_tokens": 14874181.0, - "step": 1668 - }, - { - "epoch": 1.2682370820668694, - "grad_norm": 1.9889036417007446, - "learning_rate": 3.332048861307467e-06, - "loss": 0.4199368357658386, - "mean_token_accuracy": 0.8508217334747314, - "num_tokens": 14882275.0, - "step": 1669 - }, - { - "epoch": 1.2689969604863223, - "grad_norm": 4.050281047821045, - "learning_rate": 3.330073574796551e-06, - "loss": 0.4271625280380249, - "mean_token_accuracy": 0.8471108675003052, - "num_tokens": 14893633.0, - "step": 1670 - }, - { - "epoch": 1.2697568389057752, - "grad_norm": 1.998838186264038, - "learning_rate": 3.328097705708047e-06, - "loss": 0.34743767976760864, - "mean_token_accuracy": 0.8771528005599976, - "num_tokens": 14899859.0, - "step": 1671 - }, - { - "epoch": 1.2705167173252279, - "grad_norm": 1.7989062070846558, - "learning_rate": 3.3261212554286977e-06, - "loss": 0.5267184376716614, - "mean_token_accuracy": 0.8323302268981934, - "num_tokens": 14911131.0, - "step": 1672 - }, - { - "epoch": 1.2712765957446808, - "grad_norm": 1.312070369720459, - "learning_rate": 3.324144225345649e-06, - "loss": 0.4675425887107849, - "mean_token_accuracy": 0.8157106637954712, - "num_tokens": 14928955.0, - "step": 1673 - }, - { - "epoch": 1.2720364741641337, - "grad_norm": 2.0547919273376465, - "learning_rate": 3.3221666168464584e-06, - "loss": 0.33704331517219543, - "mean_token_accuracy": 0.8621441125869751, - "num_tokens": 14935536.0, - "step": 1674 - }, - { - "epoch": 1.2727963525835866, - "grad_norm": 2.810413122177124, - "learning_rate": 3.320188431319088e-06, - "loss": 0.4007563292980194, - "mean_token_accuracy": 0.8649672269821167, - "num_tokens": 14940219.0, - "step": 1675 - }, - { - "epoch": 1.2735562310030395, - "grad_norm": 1.3516674041748047, - "learning_rate": 3.318209670151904e-06, - "loss": 0.3457040786743164, - "mean_token_accuracy": 0.8698287010192871, - "num_tokens": 14952904.0, - "step": 1676 - }, - { - "epoch": 1.2743161094224924, - "grad_norm": 2.440643310546875, - "learning_rate": 3.3162303347336765e-06, - "loss": 0.5195086002349854, - "mean_token_accuracy": 0.8348199129104614, - "num_tokens": 14958623.0, - "step": 1677 - }, - { - "epoch": 1.2750759878419453, - "grad_norm": 1.3264343738555908, - "learning_rate": 3.3142504264535808e-06, - "loss": 0.2990425229072571, - "mean_token_accuracy": 0.8961933851242065, - "num_tokens": 14971494.0, - "step": 1678 - }, - { - "epoch": 1.2758358662613982, - "grad_norm": 1.3106894493103027, - "learning_rate": 3.3122699467011913e-06, - "loss": 0.291853666305542, - "mean_token_accuracy": 0.893449068069458, - "num_tokens": 14985239.0, - "step": 1679 - }, - { - "epoch": 1.2765957446808511, - "grad_norm": 2.5387396812438965, - "learning_rate": 3.3102888968664857e-06, - "loss": 0.4336916208267212, - "mean_token_accuracy": 0.8447890877723694, - "num_tokens": 14991453.0, - "step": 1680 - }, - { - "epoch": 1.2773556231003038, - "grad_norm": 2.7052135467529297, - "learning_rate": 3.308307278339842e-06, - "loss": 0.3279378116130829, - "mean_token_accuracy": 0.8935879468917847, - "num_tokens": 14995428.0, - "step": 1681 - }, - { - "epoch": 1.278115501519757, - "grad_norm": 1.6251261234283447, - "learning_rate": 3.306325092512034e-06, - "loss": 0.32066458463668823, - "mean_token_accuracy": 0.8909799456596375, - "num_tokens": 15004841.0, - "step": 1682 - }, - { - "epoch": 1.2788753799392096, - "grad_norm": 2.3014605045318604, - "learning_rate": 3.3043423407742374e-06, - "loss": 0.3523373603820801, - "mean_token_accuracy": 0.8810735940933228, - "num_tokens": 15010742.0, - "step": 1683 - }, - { - "epoch": 1.2796352583586625, - "grad_norm": 2.9563019275665283, - "learning_rate": 3.3023590245180237e-06, - "loss": 0.39715707302093506, - "mean_token_accuracy": 0.8779881000518799, - "num_tokens": 15015357.0, - "step": 1684 - }, - { - "epoch": 1.2803951367781155, - "grad_norm": 1.5787957906723022, - "learning_rate": 3.300375145135361e-06, - "loss": 0.44630166888237, - "mean_token_accuracy": 0.8400174975395203, - "num_tokens": 15031360.0, - "step": 1685 - }, - { - "epoch": 1.2811550151975684, - "grad_norm": 1.6753438711166382, - "learning_rate": 3.2983907040186112e-06, - "loss": 0.3235800862312317, - "mean_token_accuracy": 0.8938044309616089, - "num_tokens": 15040276.0, - "step": 1686 - }, - { - "epoch": 1.2819148936170213, - "grad_norm": 1.7331148386001587, - "learning_rate": 3.296405702560532e-06, - "loss": 0.39061424136161804, - "mean_token_accuracy": 0.8599754571914673, - "num_tokens": 15049725.0, - "step": 1687 - }, - { - "epoch": 1.2826747720364742, - "grad_norm": 2.2029430866241455, - "learning_rate": 3.294420142154274e-06, - "loss": 0.43598297238349915, - "mean_token_accuracy": 0.8663698434829712, - "num_tokens": 15058182.0, - "step": 1688 - }, - { - "epoch": 1.283434650455927, - "grad_norm": 2.943964958190918, - "learning_rate": 3.29243402419338e-06, - "loss": 0.405210942029953, - "mean_token_accuracy": 0.854996919631958, - "num_tokens": 15062920.0, - "step": 1689 - }, - { - "epoch": 1.28419452887538, - "grad_norm": 1.9343379735946655, - "learning_rate": 3.2904473500717826e-06, - "loss": 0.35011449456214905, - "mean_token_accuracy": 0.8745867013931274, - "num_tokens": 15070298.0, - "step": 1690 - }, - { - "epoch": 1.284954407294833, - "grad_norm": 2.559859037399292, - "learning_rate": 3.2884601211838087e-06, - "loss": 0.38816407322883606, - "mean_token_accuracy": 0.854763388633728, - "num_tokens": 15075667.0, - "step": 1691 - }, - { - "epoch": 1.2857142857142856, - "grad_norm": 1.4357839822769165, - "learning_rate": 3.2864723389241697e-06, - "loss": 0.4512745141983032, - "mean_token_accuracy": 0.8398592472076416, - "num_tokens": 15090291.0, - "step": 1692 - }, - { - "epoch": 1.2864741641337387, - "grad_norm": 1.7643728256225586, - "learning_rate": 3.284484004687969e-06, - "loss": 0.3536742627620697, - "mean_token_accuracy": 0.8726381063461304, - "num_tokens": 15099325.0, - "step": 1693 - }, - { - "epoch": 1.2872340425531914, - "grad_norm": 1.853173017501831, - "learning_rate": 3.2824951198706958e-06, - "loss": 0.36579740047454834, - "mean_token_accuracy": 0.8988048434257507, - "num_tokens": 15107090.0, - "step": 1694 - }, - { - "epoch": 1.2879939209726443, - "grad_norm": 1.6526862382888794, - "learning_rate": 3.280505685868226e-06, - "loss": 0.3853636682033539, - "mean_token_accuracy": 0.8743607997894287, - "num_tokens": 15117818.0, - "step": 1695 - }, - { - "epoch": 1.2887537993920972, - "grad_norm": 2.790398597717285, - "learning_rate": 3.278515704076821e-06, - "loss": 0.2707311511039734, - "mean_token_accuracy": 0.9034668803215027, - "num_tokens": 15121641.0, - "step": 1696 - }, - { - "epoch": 1.2895136778115501, - "grad_norm": 1.69557523727417, - "learning_rate": 3.276525175893126e-06, - "loss": 0.3707970082759857, - "mean_token_accuracy": 0.8617855906486511, - "num_tokens": 15130414.0, - "step": 1697 - }, - { - "epoch": 1.290273556231003, - "grad_norm": 1.1360478401184082, - "learning_rate": 3.274534102714172e-06, - "loss": 0.3368082344532013, - "mean_token_accuracy": 0.8781654834747314, - "num_tokens": 15148307.0, - "step": 1698 - }, - { - "epoch": 1.291033434650456, - "grad_norm": 1.5894653797149658, - "learning_rate": 3.272542485937369e-06, - "loss": 0.3870658278465271, - "mean_token_accuracy": 0.8830926418304443, - "num_tokens": 15161841.0, - "step": 1699 - }, - { - "epoch": 1.2917933130699089, - "grad_norm": 2.3735709190368652, - "learning_rate": 3.270550326960511e-06, - "loss": 0.3873991370201111, - "mean_token_accuracy": 0.8729057908058167, - "num_tokens": 15167733.0, - "step": 1700 - }, - { - "epoch": 1.2925531914893618, - "grad_norm": 1.3739598989486694, - "learning_rate": 3.268557627181772e-06, - "loss": 0.30831626057624817, - "mean_token_accuracy": 0.8695719242095947, - "num_tokens": 15180861.0, - "step": 1701 - }, - { - "epoch": 1.2933130699088147, - "grad_norm": 1.7526969909667969, - "learning_rate": 3.2665643879997054e-06, - "loss": 0.4716024398803711, - "mean_token_accuracy": 0.8303275108337402, - "num_tokens": 15191642.0, - "step": 1702 - }, - { - "epoch": 1.2940729483282674, - "grad_norm": 2.7866084575653076, - "learning_rate": 3.2645706108132426e-06, - "loss": 0.33337634801864624, - "mean_token_accuracy": 0.8790726065635681, - "num_tokens": 15196038.0, - "step": 1703 - }, - { - "epoch": 1.2948328267477205, - "grad_norm": 2.319765090942383, - "learning_rate": 3.2625762970216944e-06, - "loss": 0.3999716639518738, - "mean_token_accuracy": 0.8693568706512451, - "num_tokens": 15202075.0, - "step": 1704 - }, - { - "epoch": 1.2955927051671732, - "grad_norm": 3.18292498588562, - "learning_rate": 3.2605814480247454e-06, - "loss": 0.4579541087150574, - "mean_token_accuracy": 0.8516187071800232, - "num_tokens": 15206886.0, - "step": 1705 - }, - { - "epoch": 1.296352583586626, - "grad_norm": 2.1816933155059814, - "learning_rate": 3.258586065222459e-06, - "loss": 0.5198885202407837, - "mean_token_accuracy": 0.8170592784881592, - "num_tokens": 15214088.0, - "step": 1706 - }, - { - "epoch": 1.297112462006079, - "grad_norm": 1.9076340198516846, - "learning_rate": 3.2565901500152702e-06, - "loss": 0.49752360582351685, - "mean_token_accuracy": 0.8681992292404175, - "num_tokens": 15226046.0, - "step": 1707 - }, - { - "epoch": 1.297872340425532, - "grad_norm": 2.0223331451416016, - "learning_rate": 3.2545937038039904e-06, - "loss": 0.4515793025493622, - "mean_token_accuracy": 0.8429619073867798, - "num_tokens": 15234993.0, - "step": 1708 - }, - { - "epoch": 1.2986322188449848, - "grad_norm": 2.5089669227600098, - "learning_rate": 3.2525967279898017e-06, - "loss": 0.43628376722335815, - "mean_token_accuracy": 0.8493682146072388, - "num_tokens": 15240575.0, - "step": 1709 - }, - { - "epoch": 1.2993920972644377, - "grad_norm": 2.8347091674804688, - "learning_rate": 3.2505992239742582e-06, - "loss": 0.25112441182136536, - "mean_token_accuracy": 0.908825159072876, - "num_tokens": 15244085.0, - "step": 1710 - }, - { - "epoch": 1.3001519756838906, - "grad_norm": 2.3157572746276855, - "learning_rate": 3.2486011931592863e-06, - "loss": 0.482818067073822, - "mean_token_accuracy": 0.8305923938751221, - "num_tokens": 15250377.0, - "step": 1711 - }, - { - "epoch": 1.3009118541033435, - "grad_norm": 3.169052839279175, - "learning_rate": 3.2466026369471804e-06, - "loss": 0.3493242561817169, - "mean_token_accuracy": 0.86913001537323, - "num_tokens": 15255041.0, - "step": 1712 - }, - { - "epoch": 1.3016717325227964, - "grad_norm": 1.4475083351135254, - "learning_rate": 3.2446035567406033e-06, - "loss": 0.4177290201187134, - "mean_token_accuracy": 0.8497589826583862, - "num_tokens": 15266946.0, - "step": 1713 - }, - { - "epoch": 1.3024316109422491, - "grad_norm": 1.6473008394241333, - "learning_rate": 3.2426039539425875e-06, - "loss": 0.5272886753082275, - "mean_token_accuracy": 0.8440133333206177, - "num_tokens": 15279263.0, - "step": 1714 - }, - { - "epoch": 1.3031914893617023, - "grad_norm": 2.3996543884277344, - "learning_rate": 3.240603829956531e-06, - "loss": 0.4272066652774811, - "mean_token_accuracy": 0.8495640754699707, - "num_tokens": 15285213.0, - "step": 1715 - }, - { - "epoch": 1.303951367781155, - "grad_norm": 1.63034987449646, - "learning_rate": 3.238603186186198e-06, - "loss": 0.4034635126590729, - "mean_token_accuracy": 0.8638584613800049, - "num_tokens": 15295974.0, - "step": 1716 - }, - { - "epoch": 1.3047112462006079, - "grad_norm": 2.153608798980713, - "learning_rate": 3.2366020240357166e-06, - "loss": 0.30712565779685974, - "mean_token_accuracy": 0.8863866329193115, - "num_tokens": 15302220.0, - "step": 1717 - }, - { - "epoch": 1.3054711246200608, - "grad_norm": 2.9814558029174805, - "learning_rate": 3.2346003449095803e-06, - "loss": 0.3922840356826782, - "mean_token_accuracy": 0.868030309677124, - "num_tokens": 15306747.0, - "step": 1718 - }, - { - "epoch": 1.3062310030395137, - "grad_norm": 3.3417985439300537, - "learning_rate": 3.2325981502126434e-06, - "loss": 0.30750396847724915, - "mean_token_accuracy": 0.9065356850624084, - "num_tokens": 15310309.0, - "step": 1719 - }, - { - "epoch": 1.3069908814589666, - "grad_norm": 2.237682819366455, - "learning_rate": 3.2305954413501252e-06, - "loss": 0.35068294405937195, - "mean_token_accuracy": 0.8887614011764526, - "num_tokens": 15316463.0, - "step": 1720 - }, - { - "epoch": 1.3077507598784195, - "grad_norm": 1.9526605606079102, - "learning_rate": 3.228592219727602e-06, - "loss": 0.42061835527420044, - "mean_token_accuracy": 0.8456839323043823, - "num_tokens": 15323984.0, - "step": 1721 - }, - { - "epoch": 1.3085106382978724, - "grad_norm": 1.6454212665557861, - "learning_rate": 3.226588486751012e-06, - "loss": 0.5189976692199707, - "mean_token_accuracy": 0.8187375068664551, - "num_tokens": 15338807.0, - "step": 1722 - }, - { - "epoch": 1.3092705167173253, - "grad_norm": 1.4521609544754028, - "learning_rate": 3.2245842438266526e-06, - "loss": 0.329673171043396, - "mean_token_accuracy": 0.853867769241333, - "num_tokens": 15350400.0, - "step": 1723 - }, - { - "epoch": 1.3100303951367782, - "grad_norm": 1.8750989437103271, - "learning_rate": 3.222579492361179e-06, - "loss": 0.4635341167449951, - "mean_token_accuracy": 0.8393422365188599, - "num_tokens": 15360557.0, - "step": 1724 - }, - { - "epoch": 1.310790273556231, - "grad_norm": 1.2728849649429321, - "learning_rate": 3.220574233761603e-06, - "loss": 0.3255572021007538, - "mean_token_accuracy": 0.8989741802215576, - "num_tokens": 15376548.0, - "step": 1725 - }, - { - "epoch": 1.3115501519756838, - "grad_norm": 3.5155694484710693, - "learning_rate": 3.2185684694352913e-06, - "loss": 0.34204089641571045, - "mean_token_accuracy": 0.8781906366348267, - "num_tokens": 15380304.0, - "step": 1726 - }, - { - "epoch": 1.3123100303951367, - "grad_norm": 2.059800148010254, - "learning_rate": 3.216562200789968e-06, - "loss": 0.36288338899612427, - "mean_token_accuracy": 0.8595278263092041, - "num_tokens": 15387653.0, - "step": 1727 - }, - { - "epoch": 1.3130699088145896, - "grad_norm": 3.5388240814208984, - "learning_rate": 3.214555429233707e-06, - "loss": 0.5434849858283997, - "mean_token_accuracy": 0.8074631690979004, - "num_tokens": 15391662.0, - "step": 1728 - }, - { - "epoch": 1.3138297872340425, - "grad_norm": 2.8595592975616455, - "learning_rate": 3.2125481561749406e-06, - "loss": 0.5113687515258789, - "mean_token_accuracy": 0.8448649644851685, - "num_tokens": 15397536.0, - "step": 1729 - }, - { - "epoch": 1.3145896656534954, - "grad_norm": 2.50386905670166, - "learning_rate": 3.210540383022449e-06, - "loss": 0.5293697118759155, - "mean_token_accuracy": 0.8096445798873901, - "num_tokens": 15403478.0, - "step": 1730 - }, - { - "epoch": 1.3153495440729484, - "grad_norm": 1.880035400390625, - "learning_rate": 3.208532111185365e-06, - "loss": 0.5344835519790649, - "mean_token_accuracy": 0.8172965049743652, - "num_tokens": 15413812.0, - "step": 1731 - }, - { - "epoch": 1.3161094224924013, - "grad_norm": 1.3688768148422241, - "learning_rate": 3.2065233420731717e-06, - "loss": 0.2577427327632904, - "mean_token_accuracy": 0.9142681360244751, - "num_tokens": 15423583.0, - "step": 1732 - }, - { - "epoch": 1.3168693009118542, - "grad_norm": 1.7945705652236938, - "learning_rate": 3.2045140770956987e-06, - "loss": 0.3983926773071289, - "mean_token_accuracy": 0.8652000427246094, - "num_tokens": 15432473.0, - "step": 1733 - }, - { - "epoch": 1.3176291793313069, - "grad_norm": 1.8243350982666016, - "learning_rate": 3.2025043176631283e-06, - "loss": 0.48644185066223145, - "mean_token_accuracy": 0.8319193124771118, - "num_tokens": 15445463.0, - "step": 1734 - }, - { - "epoch": 1.31838905775076, - "grad_norm": 2.000094175338745, - "learning_rate": 3.2004940651859844e-06, - "loss": 0.43567317724227905, - "mean_token_accuracy": 0.8857482671737671, - "num_tokens": 15452382.0, - "step": 1735 - }, - { - "epoch": 1.3191489361702127, - "grad_norm": 2.379974365234375, - "learning_rate": 3.198483321075141e-06, - "loss": 0.5153506398200989, - "mean_token_accuracy": 0.8295865654945374, - "num_tokens": 15458740.0, - "step": 1736 - }, - { - "epoch": 1.3199088145896656, - "grad_norm": 1.6564184427261353, - "learning_rate": 3.196472086741815e-06, - "loss": 0.508430540561676, - "mean_token_accuracy": 0.8181540369987488, - "num_tokens": 15471844.0, - "step": 1737 - }, - { - "epoch": 1.3206686930091185, - "grad_norm": 2.006925344467163, - "learning_rate": 3.194460363597569e-06, - "loss": 0.34542378783226013, - "mean_token_accuracy": 0.8827437162399292, - "num_tokens": 15478414.0, - "step": 1738 - }, - { - "epoch": 1.3214285714285714, - "grad_norm": 3.589045763015747, - "learning_rate": 3.192448153054306e-06, - "loss": 0.4385780096054077, - "mean_token_accuracy": 0.8480287790298462, - "num_tokens": 15482063.0, - "step": 1739 - }, - { - "epoch": 1.3221884498480243, - "grad_norm": 1.9797427654266357, - "learning_rate": 3.190435456524275e-06, - "loss": 0.4330386519432068, - "mean_token_accuracy": 0.8458058834075928, - "num_tokens": 15489803.0, - "step": 1740 - }, - { - "epoch": 1.3229483282674772, - "grad_norm": 1.4777411222457886, - "learning_rate": 3.188422275420063e-06, - "loss": 0.3997895419597626, - "mean_token_accuracy": 0.8639512062072754, - "num_tokens": 15501103.0, - "step": 1741 - }, - { - "epoch": 1.3237082066869301, - "grad_norm": 2.882338523864746, - "learning_rate": 3.186408611154597e-06, - "loss": 0.2336438149213791, - "mean_token_accuracy": 0.9176726937294006, - "num_tokens": 15504854.0, - "step": 1742 - }, - { - "epoch": 1.324468085106383, - "grad_norm": 2.353503704071045, - "learning_rate": 3.184394465141146e-06, - "loss": 0.4107069671154022, - "mean_token_accuracy": 0.8677014112472534, - "num_tokens": 15510662.0, - "step": 1743 - }, - { - "epoch": 1.325227963525836, - "grad_norm": 2.6551976203918457, - "learning_rate": 3.1823798387933134e-06, - "loss": 0.3862302899360657, - "mean_token_accuracy": 0.8819445371627808, - "num_tokens": 15515681.0, - "step": 1744 - }, - { - "epoch": 1.3259878419452886, - "grad_norm": 1.478572964668274, - "learning_rate": 3.180364733525043e-06, - "loss": 0.43972986936569214, - "mean_token_accuracy": 0.832388162612915, - "num_tokens": 15529542.0, - "step": 1745 - }, - { - "epoch": 1.3267477203647418, - "grad_norm": 1.6003550291061401, - "learning_rate": 3.178349150750612e-06, - "loss": 0.3404902219772339, - "mean_token_accuracy": 0.8764007091522217, - "num_tokens": 15538865.0, - "step": 1746 - }, - { - "epoch": 1.3275075987841944, - "grad_norm": 2.130689859390259, - "learning_rate": 3.1763330918846347e-06, - "loss": 0.383136510848999, - "mean_token_accuracy": 0.8652247190475464, - "num_tokens": 15545567.0, - "step": 1747 - }, - { - "epoch": 1.3282674772036474, - "grad_norm": 2.395937442779541, - "learning_rate": 3.1743165583420586e-06, - "loss": 0.3870319128036499, - "mean_token_accuracy": 0.8618065118789673, - "num_tokens": 15551090.0, - "step": 1748 - }, - { - "epoch": 1.3290273556231003, - "grad_norm": 2.0841057300567627, - "learning_rate": 3.1722995515381644e-06, - "loss": 0.4838739335536957, - "mean_token_accuracy": 0.8548711538314819, - "num_tokens": 15558913.0, - "step": 1749 - }, - { - "epoch": 1.3297872340425532, - "grad_norm": 1.4237847328186035, - "learning_rate": 3.1702820728885657e-06, - "loss": 0.40350261330604553, - "mean_token_accuracy": 0.858984649181366, - "num_tokens": 15572045.0, - "step": 1750 - }, - { - "epoch": 1.330547112462006, - "grad_norm": 2.2641282081604004, - "learning_rate": 3.1682641238092064e-06, - "loss": 0.5117636919021606, - "mean_token_accuracy": 0.8078924417495728, - "num_tokens": 15579753.0, - "step": 1751 - }, - { - "epoch": 1.331306990881459, - "grad_norm": 1.0010309219360352, - "learning_rate": 3.1662457057163603e-06, - "loss": 0.3220978379249573, - "mean_token_accuracy": 0.8786559104919434, - "num_tokens": 15602823.0, - "step": 1752 - }, - { - "epoch": 1.332066869300912, - "grad_norm": 2.441230535507202, - "learning_rate": 3.164226820026632e-06, - "loss": 0.37529727816581726, - "mean_token_accuracy": 0.8886898756027222, - "num_tokens": 15608473.0, - "step": 1753 - }, - { - "epoch": 1.3328267477203648, - "grad_norm": 1.2960991859436035, - "learning_rate": 3.162207468156952e-06, - "loss": 0.3393767476081848, - "mean_token_accuracy": 0.8766993284225464, - "num_tokens": 15620893.0, - "step": 1754 - }, - { - "epoch": 1.3335866261398177, - "grad_norm": 2.0806996822357178, - "learning_rate": 3.16018765152458e-06, - "loss": 0.38034507632255554, - "mean_token_accuracy": 0.8854838609695435, - "num_tokens": 15627068.0, - "step": 1755 - }, - { - "epoch": 1.3343465045592704, - "grad_norm": 1.4316699504852295, - "learning_rate": 3.1581673715471007e-06, - "loss": 0.3665890693664551, - "mean_token_accuracy": 0.870919406414032, - "num_tokens": 15641070.0, - "step": 1756 - }, - { - "epoch": 1.3351063829787235, - "grad_norm": 1.3466622829437256, - "learning_rate": 3.1561466296424247e-06, - "loss": 0.37387198209762573, - "mean_token_accuracy": 0.8633951544761658, - "num_tokens": 15653777.0, - "step": 1757 - }, - { - "epoch": 1.3358662613981762, - "grad_norm": 1.8108628988265991, - "learning_rate": 3.154125427228786e-06, - "loss": 0.38428938388824463, - "mean_token_accuracy": 0.85402512550354, - "num_tokens": 15662494.0, - "step": 1758 - }, - { - "epoch": 1.3366261398176291, - "grad_norm": 1.3221700191497803, - "learning_rate": 3.152103765724743e-06, - "loss": 0.42825520038604736, - "mean_token_accuracy": 0.8435465097427368, - "num_tokens": 15677552.0, - "step": 1759 - }, - { - "epoch": 1.337386018237082, - "grad_norm": 2.6247692108154297, - "learning_rate": 3.150081646549174e-06, - "loss": 0.36186715960502625, - "mean_token_accuracy": 0.8767328262329102, - "num_tokens": 15682103.0, - "step": 1760 - }, - { - "epoch": 1.338145896656535, - "grad_norm": 2.1469814777374268, - "learning_rate": 3.1480590711212823e-06, - "loss": 0.3734385669231415, - "mean_token_accuracy": 0.8711104393005371, - "num_tokens": 15689182.0, - "step": 1761 - }, - { - "epoch": 1.3389057750759878, - "grad_norm": 2.1702585220336914, - "learning_rate": 3.1460360408605866e-06, - "loss": 0.2795315086841583, - "mean_token_accuracy": 0.8892190456390381, - "num_tokens": 15694272.0, - "step": 1762 - }, - { - "epoch": 1.3396656534954408, - "grad_norm": 1.918797254562378, - "learning_rate": 3.144012557186931e-06, - "loss": 0.4363473057746887, - "mean_token_accuracy": 0.8573931455612183, - "num_tokens": 15703532.0, - "step": 1763 - }, - { - "epoch": 1.3404255319148937, - "grad_norm": 2.5579960346221924, - "learning_rate": 3.14198862152047e-06, - "loss": 0.406247079372406, - "mean_token_accuracy": 0.8617593050003052, - "num_tokens": 15708652.0, - "step": 1764 - }, - { - "epoch": 1.3411854103343466, - "grad_norm": 2.3617870807647705, - "learning_rate": 3.1399642352816825e-06, - "loss": 0.2839522659778595, - "mean_token_accuracy": 0.8996064066886902, - "num_tokens": 15713598.0, - "step": 1765 - }, - { - "epoch": 1.3419452887537995, - "grad_norm": 1.248302936553955, - "learning_rate": 3.1379393998913594e-06, - "loss": 0.2922290861606598, - "mean_token_accuracy": 0.8948773145675659, - "num_tokens": 15726693.0, - "step": 1766 - }, - { - "epoch": 1.3427051671732522, - "grad_norm": 2.143599510192871, - "learning_rate": 3.135914116770609e-06, - "loss": 0.32176223397254944, - "mean_token_accuracy": 0.8808754682540894, - "num_tokens": 15731901.0, - "step": 1767 - }, - { - "epoch": 1.3434650455927053, - "grad_norm": 4.226369857788086, - "learning_rate": 3.1338883873408517e-06, - "loss": 0.4682556390762329, - "mean_token_accuracy": 0.8566025495529175, - "num_tokens": 15735029.0, - "step": 1768 - }, - { - "epoch": 1.344224924012158, - "grad_norm": 1.8695988655090332, - "learning_rate": 3.1318622130238237e-06, - "loss": 0.4297192394733429, - "mean_token_accuracy": 0.8419148921966553, - "num_tokens": 15744310.0, - "step": 1769 - }, - { - "epoch": 1.344984802431611, - "grad_norm": 2.4321305751800537, - "learning_rate": 3.1298355952415714e-06, - "loss": 0.36076444387435913, - "mean_token_accuracy": 0.8826035261154175, - "num_tokens": 15749337.0, - "step": 1770 - }, - { - "epoch": 1.3457446808510638, - "grad_norm": 1.5500011444091797, - "learning_rate": 3.127808535416454e-06, - "loss": 0.48664039373397827, - "mean_token_accuracy": 0.844344437122345, - "num_tokens": 15761096.0, - "step": 1771 - }, - { - "epoch": 1.3465045592705167, - "grad_norm": 2.1498289108276367, - "learning_rate": 3.1257810349711388e-06, - "loss": 0.4841752052307129, - "mean_token_accuracy": 0.8324567079544067, - "num_tokens": 15768646.0, - "step": 1772 - }, - { - "epoch": 1.3472644376899696, - "grad_norm": 1.2995187044143677, - "learning_rate": 3.1237530953286046e-06, - "loss": 0.492019385099411, - "mean_token_accuracy": 0.8285316228866577, - "num_tokens": 15788401.0, - "step": 1773 - }, - { - "epoch": 1.3480243161094225, - "grad_norm": 2.324819803237915, - "learning_rate": 3.121724717912138e-06, - "loss": 0.33166298270225525, - "mean_token_accuracy": 0.8856451511383057, - "num_tokens": 15794097.0, - "step": 1774 - }, - { - "epoch": 1.3487841945288754, - "grad_norm": 1.9611430168151855, - "learning_rate": 3.11969590414533e-06, - "loss": 0.3974284827709198, - "mean_token_accuracy": 0.8751305937767029, - "num_tokens": 15801065.0, - "step": 1775 - }, - { - "epoch": 1.3495440729483283, - "grad_norm": 1.7084417343139648, - "learning_rate": 3.1176666554520827e-06, - "loss": 0.38729435205459595, - "mean_token_accuracy": 0.8680770397186279, - "num_tokens": 15810353.0, - "step": 1776 - }, - { - "epoch": 1.3503039513677813, - "grad_norm": 1.7616240978240967, - "learning_rate": 3.1156369732566006e-06, - "loss": 0.4271578788757324, - "mean_token_accuracy": 0.843730092048645, - "num_tokens": 15821889.0, - "step": 1777 - }, - { - "epoch": 1.351063829787234, - "grad_norm": 2.030747413635254, - "learning_rate": 3.113606858983391e-06, - "loss": 0.361891508102417, - "mean_token_accuracy": 0.8522407412528992, - "num_tokens": 15830800.0, - "step": 1778 - }, - { - "epoch": 1.3518237082066868, - "grad_norm": 1.4842649698257446, - "learning_rate": 3.1115763140572686e-06, - "loss": 0.466334730386734, - "mean_token_accuracy": 0.8433995246887207, - "num_tokens": 15849422.0, - "step": 1779 - }, - { - "epoch": 1.3525835866261398, - "grad_norm": 1.6595379114151, - "learning_rate": 3.109545339903347e-06, - "loss": 0.4622533321380615, - "mean_token_accuracy": 0.8526314496994019, - "num_tokens": 15860431.0, - "step": 1780 - }, - { - "epoch": 1.3533434650455927, - "grad_norm": 2.1235809326171875, - "learning_rate": 3.107513937947041e-06, - "loss": 0.42694270610809326, - "mean_token_accuracy": 0.854864239692688, - "num_tokens": 15869044.0, - "step": 1781 - }, - { - "epoch": 1.3541033434650456, - "grad_norm": 1.5889263153076172, - "learning_rate": 3.1054821096140675e-06, - "loss": 0.41838499903678894, - "mean_token_accuracy": 0.8671513795852661, - "num_tokens": 15878598.0, - "step": 1782 - }, - { - "epoch": 1.3548632218844985, - "grad_norm": 2.2261741161346436, - "learning_rate": 3.1034498563304435e-06, - "loss": 0.4045066237449646, - "mean_token_accuracy": 0.843826949596405, - "num_tokens": 15885167.0, - "step": 1783 - }, - { - "epoch": 1.3556231003039514, - "grad_norm": 2.2569329738616943, - "learning_rate": 3.1014171795224794e-06, - "loss": 0.36677104234695435, - "mean_token_accuracy": 0.8747833967208862, - "num_tokens": 15891308.0, - "step": 1784 - }, - { - "epoch": 1.3563829787234043, - "grad_norm": 2.1027088165283203, - "learning_rate": 3.0993840806167884e-06, - "loss": 0.437946081161499, - "mean_token_accuracy": 0.8370785117149353, - "num_tokens": 15898952.0, - "step": 1785 - }, - { - "epoch": 1.3571428571428572, - "grad_norm": 1.8768929243087769, - "learning_rate": 3.0973505610402767e-06, - "loss": 0.4201734662055969, - "mean_token_accuracy": 0.8474810123443604, - "num_tokens": 15907340.0, - "step": 1786 - }, - { - "epoch": 1.35790273556231, - "grad_norm": 1.7216229438781738, - "learning_rate": 3.0953166222201474e-06, - "loss": 0.4225231409072876, - "mean_token_accuracy": 0.8437749147415161, - "num_tokens": 15917852.0, - "step": 1787 - }, - { - "epoch": 1.358662613981763, - "grad_norm": 2.6256966590881348, - "learning_rate": 3.093282265583895e-06, - "loss": 0.435439795255661, - "mean_token_accuracy": 0.8452040553092957, - "num_tokens": 15923739.0, - "step": 1788 - }, - { - "epoch": 1.3594224924012157, - "grad_norm": 2.90028977394104, - "learning_rate": 3.0912474925593124e-06, - "loss": 0.3730456829071045, - "mean_token_accuracy": 0.8766646385192871, - "num_tokens": 15927943.0, - "step": 1789 - }, - { - "epoch": 1.3601823708206686, - "grad_norm": 1.5966626405715942, - "learning_rate": 3.0892123045744787e-06, - "loss": 0.42150455713272095, - "mean_token_accuracy": 0.854656457901001, - "num_tokens": 15939922.0, - "step": 1790 - }, - { - "epoch": 1.3609422492401215, - "grad_norm": 1.8069748878479004, - "learning_rate": 3.0871767030577686e-06, - "loss": 0.4954872131347656, - "mean_token_accuracy": 0.8289790153503418, - "num_tokens": 15950095.0, - "step": 1791 - }, - { - "epoch": 1.3617021276595744, - "grad_norm": 2.0855250358581543, - "learning_rate": 3.085140689437846e-06, - "loss": 0.41999945044517517, - "mean_token_accuracy": 0.8517382144927979, - "num_tokens": 15957972.0, - "step": 1792 - }, - { - "epoch": 1.3624620060790273, - "grad_norm": 2.108659267425537, - "learning_rate": 3.0831042651436634e-06, - "loss": 0.3668023645877838, - "mean_token_accuracy": 0.8710855841636658, - "num_tokens": 15965614.0, - "step": 1793 - }, - { - "epoch": 1.3632218844984803, - "grad_norm": 1.3799632787704468, - "learning_rate": 3.0810674316044602e-06, - "loss": 0.351409375667572, - "mean_token_accuracy": 0.870837390422821, - "num_tokens": 15978854.0, - "step": 1794 - }, - { - "epoch": 1.3639817629179332, - "grad_norm": 1.540397047996521, - "learning_rate": 3.0790301902497664e-06, - "loss": 0.403600811958313, - "mean_token_accuracy": 0.8485002517700195, - "num_tokens": 15993324.0, - "step": 1795 - }, - { - "epoch": 1.364741641337386, - "grad_norm": 1.946882963180542, - "learning_rate": 3.076992542509396e-06, - "loss": 0.40118327736854553, - "mean_token_accuracy": 0.8607497811317444, - "num_tokens": 16001937.0, - "step": 1796 - }, - { - "epoch": 1.365501519756839, - "grad_norm": 2.0464305877685547, - "learning_rate": 3.0749544898134487e-06, - "loss": 0.31742292642593384, - "mean_token_accuracy": 0.8878391981124878, - "num_tokens": 16009277.0, - "step": 1797 - }, - { - "epoch": 1.3662613981762917, - "grad_norm": 2.091754913330078, - "learning_rate": 3.072916033592307e-06, - "loss": 0.31580421328544617, - "mean_token_accuracy": 0.8875244855880737, - "num_tokens": 16015756.0, - "step": 1798 - }, - { - "epoch": 1.3670212765957448, - "grad_norm": 3.4449212551116943, - "learning_rate": 3.0708771752766397e-06, - "loss": 0.4692591726779938, - "mean_token_accuracy": 0.8456202149391174, - "num_tokens": 16019912.0, - "step": 1799 - }, - { - "epoch": 1.3677811550151975, - "grad_norm": 1.600419521331787, - "learning_rate": 3.068837916297396e-06, - "loss": 0.40389442443847656, - "mean_token_accuracy": 0.8378961086273193, - "num_tokens": 16032637.0, - "step": 1800 - }, - { - "epoch": 1.3685410334346504, - "grad_norm": 1.5282686948776245, - "learning_rate": 3.0667982580858047e-06, - "loss": 0.379841685295105, - "mean_token_accuracy": 0.8752143383026123, - "num_tokens": 16045205.0, - "step": 1801 - }, - { - "epoch": 1.3693009118541033, - "grad_norm": 2.486079454421997, - "learning_rate": 3.0647582020733773e-06, - "loss": 0.41060030460357666, - "mean_token_accuracy": 0.8575131893157959, - "num_tokens": 16051189.0, - "step": 1802 - }, - { - "epoch": 1.3700607902735562, - "grad_norm": 1.9458621740341187, - "learning_rate": 3.062717749691904e-06, - "loss": 0.4442213773727417, - "mean_token_accuracy": 0.8451495170593262, - "num_tokens": 16059700.0, - "step": 1803 - }, - { - "epoch": 1.3708206686930091, - "grad_norm": 1.4333001375198364, - "learning_rate": 3.0606769023734535e-06, - "loss": 0.39132001996040344, - "mean_token_accuracy": 0.8609901666641235, - "num_tokens": 16072458.0, - "step": 1804 - }, - { - "epoch": 1.371580547112462, - "grad_norm": 1.490355372428894, - "learning_rate": 3.0586356615503693e-06, - "loss": 0.4108564257621765, - "mean_token_accuracy": 0.8871046304702759, - "num_tokens": 16083142.0, - "step": 1805 - }, - { - "epoch": 1.372340425531915, - "grad_norm": 1.7765129804611206, - "learning_rate": 3.056594028655274e-06, - "loss": 0.3850266635417938, - "mean_token_accuracy": 0.8923365473747253, - "num_tokens": 16092519.0, - "step": 1806 - }, - { - "epoch": 1.3731003039513678, - "grad_norm": 1.955661416053772, - "learning_rate": 3.0545520051210637e-06, - "loss": 0.4665378928184509, - "mean_token_accuracy": 0.837419867515564, - "num_tokens": 16100618.0, - "step": 1807 - }, - { - "epoch": 1.3738601823708207, - "grad_norm": 3.259265422821045, - "learning_rate": 3.052509592380909e-06, - "loss": 0.24722981452941895, - "mean_token_accuracy": 0.9106054306030273, - "num_tokens": 16103836.0, - "step": 1808 - }, - { - "epoch": 1.3746200607902734, - "grad_norm": 1.7995736598968506, - "learning_rate": 3.050466791868254e-06, - "loss": 0.4982220530509949, - "mean_token_accuracy": 0.8298169374465942, - "num_tokens": 16114727.0, - "step": 1809 - }, - { - "epoch": 1.3753799392097266, - "grad_norm": 1.9643093347549438, - "learning_rate": 3.048423605016815e-06, - "loss": 0.5076829195022583, - "mean_token_accuracy": 0.8303098678588867, - "num_tokens": 16129491.0, - "step": 1810 - }, - { - "epoch": 1.3761398176291793, - "grad_norm": 3.505594491958618, - "learning_rate": 3.0463800332605787e-06, - "loss": 0.27466052770614624, - "mean_token_accuracy": 0.9018045663833618, - "num_tokens": 16132640.0, - "step": 1811 - }, - { - "epoch": 1.3768996960486322, - "grad_norm": 1.798437237739563, - "learning_rate": 3.0443360780338034e-06, - "loss": 0.4004853069782257, - "mean_token_accuracy": 0.8569544553756714, - "num_tokens": 16143317.0, - "step": 1812 - }, - { - "epoch": 1.377659574468085, - "grad_norm": 2.276740789413452, - "learning_rate": 3.042291740771014e-06, - "loss": 0.3823797106742859, - "mean_token_accuracy": 0.8764113783836365, - "num_tokens": 16148898.0, - "step": 1813 - }, - { - "epoch": 1.378419452887538, - "grad_norm": 2.5051357746124268, - "learning_rate": 3.0402470229070057e-06, - "loss": 0.40365856885910034, - "mean_token_accuracy": 0.8809891939163208, - "num_tokens": 16153815.0, - "step": 1814 - }, - { - "epoch": 1.3791793313069909, - "grad_norm": 1.2379236221313477, - "learning_rate": 3.03820192587684e-06, - "loss": 0.3955119848251343, - "mean_token_accuracy": 0.8536627292633057, - "num_tokens": 16167783.0, - "step": 1815 - }, - { - "epoch": 1.3799392097264438, - "grad_norm": 2.2286343574523926, - "learning_rate": 3.036156451115846e-06, - "loss": 0.39647501707077026, - "mean_token_accuracy": 0.8621993064880371, - "num_tokens": 16174707.0, - "step": 1816 - }, - { - "epoch": 1.3806990881458967, - "grad_norm": 1.884639024734497, - "learning_rate": 3.034110600059616e-06, - "loss": 0.31612110137939453, - "mean_token_accuracy": 0.8942475318908691, - "num_tokens": 16181919.0, - "step": 1817 - }, - { - "epoch": 1.3814589665653496, - "grad_norm": 1.891312599182129, - "learning_rate": 3.0320643741440052e-06, - "loss": 0.46209126710891724, - "mean_token_accuracy": 0.8374713659286499, - "num_tokens": 16189276.0, - "step": 1818 - }, - { - "epoch": 1.3822188449848025, - "grad_norm": 2.507478713989258, - "learning_rate": 3.0300177748051375e-06, - "loss": 0.37601593136787415, - "mean_token_accuracy": 0.8633589148521423, - "num_tokens": 16194346.0, - "step": 1819 - }, - { - "epoch": 1.3829787234042552, - "grad_norm": 1.5046696662902832, - "learning_rate": 3.0279708034793907e-06, - "loss": 0.3284982144832611, - "mean_token_accuracy": 0.8792630434036255, - "num_tokens": 16205457.0, - "step": 1820 - }, - { - "epoch": 1.3837386018237083, - "grad_norm": 2.4244449138641357, - "learning_rate": 3.025923461603412e-06, - "loss": 0.40939009189605713, - "mean_token_accuracy": 0.8596426248550415, - "num_tokens": 16211866.0, - "step": 1821 - }, - { - "epoch": 1.384498480243161, - "grad_norm": 2.8656933307647705, - "learning_rate": 3.0238757506141013e-06, - "loss": 0.4397110044956207, - "mean_token_accuracy": 0.8597331047058105, - "num_tokens": 16216607.0, - "step": 1822 - }, - { - "epoch": 1.385258358662614, - "grad_norm": 2.0718610286712646, - "learning_rate": 3.0218276719486245e-06, - "loss": 0.49057573080062866, - "mean_token_accuracy": 0.8325331211090088, - "num_tokens": 16224014.0, - "step": 1823 - }, - { - "epoch": 1.3860182370820668, - "grad_norm": 1.054450273513794, - "learning_rate": 3.019779227044398e-06, - "loss": 0.3758106827735901, - "mean_token_accuracy": 0.8689473867416382, - "num_tokens": 16248627.0, - "step": 1824 - }, - { - "epoch": 1.3867781155015197, - "grad_norm": 2.1115148067474365, - "learning_rate": 3.0177304173391038e-06, - "loss": 0.502967119216919, - "mean_token_accuracy": 0.823198676109314, - "num_tokens": 16256255.0, - "step": 1825 - }, - { - "epoch": 1.3875379939209727, - "grad_norm": 2.207277297973633, - "learning_rate": 3.015681244270672e-06, - "loss": 0.3458971083164215, - "mean_token_accuracy": 0.8930196762084961, - "num_tokens": 16261823.0, - "step": 1826 - }, - { - "epoch": 1.3882978723404256, - "grad_norm": 1.289669156074524, - "learning_rate": 3.0136317092772923e-06, - "loss": 0.4422765374183655, - "mean_token_accuracy": 0.8358346819877625, - "num_tokens": 16280659.0, - "step": 1827 - }, - { - "epoch": 1.3890577507598785, - "grad_norm": 2.233865737915039, - "learning_rate": 3.0115818137974066e-06, - "loss": 0.3643006384372711, - "mean_token_accuracy": 0.8682862520217896, - "num_tokens": 16286356.0, - "step": 1828 - }, - { - "epoch": 1.3898176291793314, - "grad_norm": 1.0950042009353638, - "learning_rate": 3.0095315592697126e-06, - "loss": 0.34712421894073486, - "mean_token_accuracy": 0.8578766584396362, - "num_tokens": 16307298.0, - "step": 1829 - }, - { - "epoch": 1.3905775075987843, - "grad_norm": 1.1708037853240967, - "learning_rate": 3.007480947133155e-06, - "loss": 0.33152541518211365, - "mean_token_accuracy": 0.894973874092102, - "num_tokens": 16323232.0, - "step": 1830 - }, - { - "epoch": 1.391337386018237, - "grad_norm": 1.2226970195770264, - "learning_rate": 3.0054299788269343e-06, - "loss": 0.3915635943412781, - "mean_token_accuracy": 0.8575779795646667, - "num_tokens": 16339273.0, - "step": 1831 - }, - { - "epoch": 1.39209726443769, - "grad_norm": 1.2226042747497559, - "learning_rate": 3.0033786557904982e-06, - "loss": 0.45846253633499146, - "mean_token_accuracy": 0.8290432691574097, - "num_tokens": 16360145.0, - "step": 1832 - }, - { - "epoch": 1.3928571428571428, - "grad_norm": 2.0117406845092773, - "learning_rate": 3.001326979463545e-06, - "loss": 0.3837882876396179, - "mean_token_accuracy": 0.8941739797592163, - "num_tokens": 16366602.0, - "step": 1833 - }, - { - "epoch": 1.3936170212765957, - "grad_norm": 1.8419997692108154, - "learning_rate": 2.9992749512860177e-06, - "loss": 0.40777021646499634, - "mean_token_accuracy": 0.854655385017395, - "num_tokens": 16375611.0, - "step": 1834 - }, - { - "epoch": 1.3943768996960486, - "grad_norm": 1.9405122995376587, - "learning_rate": 2.9972225726981114e-06, - "loss": 0.46685922145843506, - "mean_token_accuracy": 0.8493201732635498, - "num_tokens": 16384878.0, - "step": 1835 - }, - { - "epoch": 1.3951367781155015, - "grad_norm": 1.2425674200057983, - "learning_rate": 2.995169845140264e-06, - "loss": 0.394692063331604, - "mean_token_accuracy": 0.851348876953125, - "num_tokens": 16404452.0, - "step": 1836 - }, - { - "epoch": 1.3958966565349544, - "grad_norm": 1.2215365171432495, - "learning_rate": 2.9931167700531575e-06, - "loss": 0.31412452459335327, - "mean_token_accuracy": 0.882760763168335, - "num_tokens": 16419358.0, - "step": 1837 - }, - { - "epoch": 1.3966565349544073, - "grad_norm": 1.912168025970459, - "learning_rate": 2.9910633488777198e-06, - "loss": 0.5065487623214722, - "mean_token_accuracy": 0.8524355292320251, - "num_tokens": 16430418.0, - "step": 1838 - }, - { - "epoch": 1.3974164133738602, - "grad_norm": 2.2173948287963867, - "learning_rate": 2.989009583055121e-06, - "loss": 0.4290938377380371, - "mean_token_accuracy": 0.8381836414337158, - "num_tokens": 16438267.0, - "step": 1839 - }, - { - "epoch": 1.3981762917933132, - "grad_norm": 1.8293484449386597, - "learning_rate": 2.9869554740267726e-06, - "loss": 0.41683733463287354, - "mean_token_accuracy": 0.8548779487609863, - "num_tokens": 16447382.0, - "step": 1840 - }, - { - "epoch": 1.398936170212766, - "grad_norm": 1.835015892982483, - "learning_rate": 2.9849010232343274e-06, - "loss": 0.5080599784851074, - "mean_token_accuracy": 0.8193596601486206, - "num_tokens": 16458541.0, - "step": 1841 - }, - { - "epoch": 1.3996960486322187, - "grad_norm": 2.031339645385742, - "learning_rate": 2.982846232119679e-06, - "loss": 0.5168882012367249, - "mean_token_accuracy": 0.8525956869125366, - "num_tokens": 16467747.0, - "step": 1842 - }, - { - "epoch": 1.4004559270516717, - "grad_norm": 1.5554167032241821, - "learning_rate": 2.9807911021249573e-06, - "loss": 0.35098958015441895, - "mean_token_accuracy": 0.888373851776123, - "num_tokens": 16479319.0, - "step": 1843 - }, - { - "epoch": 1.4012158054711246, - "grad_norm": 1.7183740139007568, - "learning_rate": 2.9787356346925327e-06, - "loss": 0.41263148188591003, - "mean_token_accuracy": 0.8478364944458008, - "num_tokens": 16489952.0, - "step": 1844 - }, - { - "epoch": 1.4019756838905775, - "grad_norm": 1.7743209600448608, - "learning_rate": 2.9766798312650112e-06, - "loss": 0.4211183190345764, - "mean_token_accuracy": 0.8641136884689331, - "num_tokens": 16498655.0, - "step": 1845 - }, - { - "epoch": 1.4027355623100304, - "grad_norm": 2.141300916671753, - "learning_rate": 2.9746236932852355e-06, - "loss": 0.49548980593681335, - "mean_token_accuracy": 0.8304252028465271, - "num_tokens": 16506348.0, - "step": 1846 - }, - { - "epoch": 1.4034954407294833, - "grad_norm": 2.341571807861328, - "learning_rate": 2.9725672221962804e-06, - "loss": 0.40804803371429443, - "mean_token_accuracy": 0.8545800447463989, - "num_tokens": 16513091.0, - "step": 1847 - }, - { - "epoch": 1.4042553191489362, - "grad_norm": 1.934428095817566, - "learning_rate": 2.9705104194414587e-06, - "loss": 0.30029812455177307, - "mean_token_accuracy": 0.9032052755355835, - "num_tokens": 16519455.0, - "step": 1848 - }, - { - "epoch": 1.405015197568389, - "grad_norm": 1.420804500579834, - "learning_rate": 2.9684532864643123e-06, - "loss": 0.4384060502052307, - "mean_token_accuracy": 0.8465110063552856, - "num_tokens": 16533222.0, - "step": 1849 - }, - { - "epoch": 1.405775075987842, - "grad_norm": 2.1180737018585205, - "learning_rate": 2.9663958247086165e-06, - "loss": 0.3915565609931946, - "mean_token_accuracy": 0.8633890748023987, - "num_tokens": 16539489.0, - "step": 1850 - }, - { - "epoch": 1.4065349544072947, - "grad_norm": 1.408048152923584, - "learning_rate": 2.964338035618378e-06, - "loss": 0.46166157722473145, - "mean_token_accuracy": 0.8305013179779053, - "num_tokens": 16555785.0, - "step": 1851 - }, - { - "epoch": 1.4072948328267478, - "grad_norm": 1.3418530225753784, - "learning_rate": 2.9622799206378306e-06, - "loss": 0.5314373970031738, - "mean_token_accuracy": 0.81779944896698, - "num_tokens": 16578111.0, - "step": 1852 - }, - { - "epoch": 1.4080547112462005, - "grad_norm": 1.4634262323379517, - "learning_rate": 2.9602214812114414e-06, - "loss": 0.4859408140182495, - "mean_token_accuracy": 0.8261818885803223, - "num_tokens": 16591976.0, - "step": 1853 - }, - { - "epoch": 1.4088145896656534, - "grad_norm": 1.4840295314788818, - "learning_rate": 2.9581627187838997e-06, - "loss": 0.4079628586769104, - "mean_token_accuracy": 0.8549603223800659, - "num_tokens": 16603631.0, - "step": 1854 - }, - { - "epoch": 1.4095744680851063, - "grad_norm": 2.1474642753601074, - "learning_rate": 2.956103634800126e-06, - "loss": 0.32997995615005493, - "mean_token_accuracy": 0.8836915493011475, - "num_tokens": 16609875.0, - "step": 1855 - }, - { - "epoch": 1.4103343465045592, - "grad_norm": 2.627460241317749, - "learning_rate": 2.9540442307052643e-06, - "loss": 0.3229186236858368, - "mean_token_accuracy": 0.8852157592773438, - "num_tokens": 16614113.0, - "step": 1856 - }, - { - "epoch": 1.4110942249240122, - "grad_norm": 1.9569811820983887, - "learning_rate": 2.9519845079446824e-06, - "loss": 0.5057883858680725, - "mean_token_accuracy": 0.8585711717605591, - "num_tokens": 16624611.0, - "step": 1857 - }, - { - "epoch": 1.411854103343465, - "grad_norm": 2.0604090690612793, - "learning_rate": 2.949924467963975e-06, - "loss": 0.4681510329246521, - "mean_token_accuracy": 0.8390560150146484, - "num_tokens": 16632938.0, - "step": 1858 - }, - { - "epoch": 1.412613981762918, - "grad_norm": 2.5430450439453125, - "learning_rate": 2.9478641122089563e-06, - "loss": 0.3090999126434326, - "mean_token_accuracy": 0.8943990468978882, - "num_tokens": 16637135.0, - "step": 1859 - }, - { - "epoch": 1.4133738601823709, - "grad_norm": 1.3275387287139893, - "learning_rate": 2.945803442125663e-06, - "loss": 0.3592180013656616, - "mean_token_accuracy": 0.8678265810012817, - "num_tokens": 16650322.0, - "step": 1860 - }, - { - "epoch": 1.4141337386018238, - "grad_norm": 1.9070929288864136, - "learning_rate": 2.943742459160354e-06, - "loss": 0.5332518815994263, - "mean_token_accuracy": 0.8475706577301025, - "num_tokens": 16660240.0, - "step": 1861 - }, - { - "epoch": 1.4148936170212765, - "grad_norm": 2.8724546432495117, - "learning_rate": 2.9416811647595052e-06, - "loss": 0.5052884817123413, - "mean_token_accuracy": 0.8363175392150879, - "num_tokens": 16665481.0, - "step": 1862 - }, - { - "epoch": 1.4156534954407296, - "grad_norm": 4.203817844390869, - "learning_rate": 2.939619560369813e-06, - "loss": 0.546925961971283, - "mean_token_accuracy": 0.834044337272644, - "num_tokens": 16669615.0, - "step": 1863 - }, - { - "epoch": 1.4164133738601823, - "grad_norm": 1.6466281414031982, - "learning_rate": 2.9375576474381907e-06, - "loss": 0.3474533259868622, - "mean_token_accuracy": 0.8571163415908813, - "num_tokens": 16678893.0, - "step": 1864 - }, - { - "epoch": 1.4171732522796352, - "grad_norm": 1.8885842561721802, - "learning_rate": 2.9354954274117683e-06, - "loss": 0.3726021349430084, - "mean_token_accuracy": 0.8629094958305359, - "num_tokens": 16685939.0, - "step": 1865 - }, - { - "epoch": 1.417933130699088, - "grad_norm": 2.830599784851074, - "learning_rate": 2.9334329017378898e-06, - "loss": 0.4138668477535248, - "mean_token_accuracy": 0.8670746088027954, - "num_tokens": 16690012.0, - "step": 1866 - }, - { - "epoch": 1.418693009118541, - "grad_norm": 1.6838961839675903, - "learning_rate": 2.9313700718641167e-06, - "loss": 0.33954259753227234, - "mean_token_accuracy": 0.8660278916358948, - "num_tokens": 16700061.0, - "step": 1867 - }, - { - "epoch": 1.419452887537994, - "grad_norm": 2.8767011165618896, - "learning_rate": 2.9293069392382224e-06, - "loss": 0.4650302827358246, - "mean_token_accuracy": 0.8448452949523926, - "num_tokens": 16705072.0, - "step": 1868 - }, - { - "epoch": 1.4202127659574468, - "grad_norm": 1.5901305675506592, - "learning_rate": 2.927243505308192e-06, - "loss": 0.40838998556137085, - "mean_token_accuracy": 0.8560664653778076, - "num_tokens": 16714763.0, - "step": 1869 - }, - { - "epoch": 1.4209726443768997, - "grad_norm": 1.3293657302856445, - "learning_rate": 2.925179771522223e-06, - "loss": 0.34712862968444824, - "mean_token_accuracy": 0.8633697032928467, - "num_tokens": 16729575.0, - "step": 1870 - }, - { - "epoch": 1.4217325227963526, - "grad_norm": 1.7465964555740356, - "learning_rate": 2.9231157393287234e-06, - "loss": 0.48190903663635254, - "mean_token_accuracy": 0.8255834579467773, - "num_tokens": 16742529.0, - "step": 1871 - }, - { - "epoch": 1.4224924012158056, - "grad_norm": 1.865749716758728, - "learning_rate": 2.9210514101763116e-06, - "loss": 0.4912028908729553, - "mean_token_accuracy": 0.8309572339057922, - "num_tokens": 16753989.0, - "step": 1872 - }, - { - "epoch": 1.4232522796352582, - "grad_norm": 2.55780291557312, - "learning_rate": 2.9189867855138103e-06, - "loss": 0.4550635814666748, - "mean_token_accuracy": 0.8584091067314148, - "num_tokens": 16758906.0, - "step": 1873 - }, - { - "epoch": 1.4240121580547114, - "grad_norm": 1.867530107498169, - "learning_rate": 2.9169218667902562e-06, - "loss": 0.3524911105632782, - "mean_token_accuracy": 0.8715004920959473, - "num_tokens": 16765969.0, - "step": 1874 - }, - { - "epoch": 1.424772036474164, - "grad_norm": 1.8886862993240356, - "learning_rate": 2.9148566554548857e-06, - "loss": 0.37144535779953003, - "mean_token_accuracy": 0.8640961050987244, - "num_tokens": 16773935.0, - "step": 1875 - }, - { - "epoch": 1.425531914893617, - "grad_norm": 1.266065239906311, - "learning_rate": 2.912791152957145e-06, - "loss": 0.3341747522354126, - "mean_token_accuracy": 0.8929134607315063, - "num_tokens": 16787780.0, - "step": 1876 - }, - { - "epoch": 1.4262917933130699, - "grad_norm": 2.524888753890991, - "learning_rate": 2.9107253607466833e-06, - "loss": 0.33709171414375305, - "mean_token_accuracy": 0.8857531547546387, - "num_tokens": 16792753.0, - "step": 1877 - }, - { - "epoch": 1.4270516717325228, - "grad_norm": 1.9269018173217773, - "learning_rate": 2.908659280273354e-06, - "loss": 0.32599249482154846, - "mean_token_accuracy": 0.8777773380279541, - "num_tokens": 16799904.0, - "step": 1878 - }, - { - "epoch": 1.4278115501519757, - "grad_norm": 1.9844375848770142, - "learning_rate": 2.9065929129872097e-06, - "loss": 0.4086732268333435, - "mean_token_accuracy": 0.8505409955978394, - "num_tokens": 16807774.0, - "step": 1879 - }, - { - "epoch": 1.4285714285714286, - "grad_norm": 4.0958662033081055, - "learning_rate": 2.9045262603385073e-06, - "loss": 0.3838827610015869, - "mean_token_accuracy": 0.877601146697998, - "num_tokens": 16810908.0, - "step": 1880 - }, - { - "epoch": 1.4293313069908815, - "grad_norm": 1.7323768138885498, - "learning_rate": 2.902459323777704e-06, - "loss": 0.37459003925323486, - "mean_token_accuracy": 0.8655836582183838, - "num_tokens": 16819494.0, - "step": 1881 - }, - { - "epoch": 1.4300911854103344, - "grad_norm": 2.608043670654297, - "learning_rate": 2.900392104755455e-06, - "loss": 0.5798726677894592, - "mean_token_accuracy": 0.8382592797279358, - "num_tokens": 16827745.0, - "step": 1882 - }, - { - "epoch": 1.4308510638297873, - "grad_norm": 1.3262078762054443, - "learning_rate": 2.8983246047226137e-06, - "loss": 0.3724595904350281, - "mean_token_accuracy": 0.8651963472366333, - "num_tokens": 16844171.0, - "step": 1883 - }, - { - "epoch": 1.43161094224924, - "grad_norm": 1.7250545024871826, - "learning_rate": 2.8962568251302327e-06, - "loss": 0.3478979468345642, - "mean_token_accuracy": 0.8807886242866516, - "num_tokens": 16852838.0, - "step": 1884 - }, - { - "epoch": 1.4323708206686931, - "grad_norm": 2.114525318145752, - "learning_rate": 2.8941887674295573e-06, - "loss": 0.5156140327453613, - "mean_token_accuracy": 0.825178861618042, - "num_tokens": 16861087.0, - "step": 1885 - }, - { - "epoch": 1.4331306990881458, - "grad_norm": 2.400829792022705, - "learning_rate": 2.892120433072031e-06, - "loss": 0.2807392477989197, - "mean_token_accuracy": 0.8907361030578613, - "num_tokens": 16866557.0, - "step": 1886 - }, - { - "epoch": 1.4338905775075987, - "grad_norm": 2.490880012512207, - "learning_rate": 2.8900518235092908e-06, - "loss": 0.2615952491760254, - "mean_token_accuracy": 0.9152894020080566, - "num_tokens": 16871357.0, - "step": 1887 - }, - { - "epoch": 1.4346504559270516, - "grad_norm": 1.9058431386947632, - "learning_rate": 2.887982940193165e-06, - "loss": 0.43623363971710205, - "mean_token_accuracy": 0.84696364402771, - "num_tokens": 16879016.0, - "step": 1888 - }, - { - "epoch": 1.4354103343465046, - "grad_norm": 1.4520210027694702, - "learning_rate": 2.8859137845756785e-06, - "loss": 0.3961856961250305, - "mean_token_accuracy": 0.8518897294998169, - "num_tokens": 16892254.0, - "step": 1889 - }, - { - "epoch": 1.4361702127659575, - "grad_norm": 2.500274896621704, - "learning_rate": 2.8838443581090415e-06, - "loss": 0.41457289457321167, - "mean_token_accuracy": 0.8751448392868042, - "num_tokens": 16897156.0, - "step": 1890 - }, - { - "epoch": 1.4369300911854104, - "grad_norm": 2.9312057495117188, - "learning_rate": 2.8817746622456585e-06, - "loss": 0.45875269174575806, - "mean_token_accuracy": 0.8411039113998413, - "num_tokens": 16902291.0, - "step": 1891 - }, - { - "epoch": 1.4376899696048633, - "grad_norm": 2.367419481277466, - "learning_rate": 2.879704698438121e-06, - "loss": 0.3643629848957062, - "mean_token_accuracy": 0.8771071434020996, - "num_tokens": 16908128.0, - "step": 1892 - }, - { - "epoch": 1.4384498480243162, - "grad_norm": 1.9907705783843994, - "learning_rate": 2.8776344681392106e-06, - "loss": 0.3206835389137268, - "mean_token_accuracy": 0.879996657371521, - "num_tokens": 16914918.0, - "step": 1893 - }, - { - "epoch": 1.439209726443769, - "grad_norm": 3.536956310272217, - "learning_rate": 2.875563972801893e-06, - "loss": 0.3640141785144806, - "mean_token_accuracy": 0.8814959526062012, - "num_tokens": 16918187.0, - "step": 1894 - }, - { - "epoch": 1.4399696048632218, - "grad_norm": 1.3451156616210938, - "learning_rate": 2.8734932138793226e-06, - "loss": 0.3427346348762512, - "mean_token_accuracy": 0.8835382461547852, - "num_tokens": 16931135.0, - "step": 1895 - }, - { - "epoch": 1.4407294832826747, - "grad_norm": 2.0735955238342285, - "learning_rate": 2.871422192824837e-06, - "loss": 0.4265315532684326, - "mean_token_accuracy": 0.8452677726745605, - "num_tokens": 16937995.0, - "step": 1896 - }, - { - "epoch": 1.4414893617021276, - "grad_norm": 1.5124932527542114, - "learning_rate": 2.8693509110919597e-06, - "loss": 0.497121661901474, - "mean_token_accuracy": 0.815092921257019, - "num_tokens": 16952743.0, - "step": 1897 - }, - { - "epoch": 1.4422492401215805, - "grad_norm": 3.716669797897339, - "learning_rate": 2.867279370134395e-06, - "loss": 0.5452651381492615, - "mean_token_accuracy": 0.8150380849838257, - "num_tokens": 16956797.0, - "step": 1898 - }, - { - "epoch": 1.4430091185410334, - "grad_norm": 1.3571398258209229, - "learning_rate": 2.8652075714060296e-06, - "loss": 0.4249724745750427, - "mean_token_accuracy": 0.8675867915153503, - "num_tokens": 16974494.0, - "step": 1899 - }, - { - "epoch": 1.4437689969604863, - "grad_norm": 2.310673475265503, - "learning_rate": 2.863135516360932e-06, - "loss": 0.39368677139282227, - "mean_token_accuracy": 0.878392219543457, - "num_tokens": 16980612.0, - "step": 1900 - }, - { - "epoch": 1.4445288753799392, - "grad_norm": 1.9025533199310303, - "learning_rate": 2.8610632064533517e-06, - "loss": 0.4786127805709839, - "mean_token_accuracy": 0.8720556497573853, - "num_tokens": 16992262.0, - "step": 1901 - }, - { - "epoch": 1.4452887537993921, - "grad_norm": 2.528564453125, - "learning_rate": 2.8589906431377133e-06, - "loss": 0.4223094582557678, - "mean_token_accuracy": 0.8513246178627014, - "num_tokens": 16997717.0, - "step": 1902 - }, - { - "epoch": 1.446048632218845, - "grad_norm": 1.010425329208374, - "learning_rate": 2.8569178278686222e-06, - "loss": 0.3908255696296692, - "mean_token_accuracy": 0.8620463609695435, - "num_tokens": 17020903.0, - "step": 1903 - }, - { - "epoch": 1.4468085106382977, - "grad_norm": 1.5760232210159302, - "learning_rate": 2.8548447621008614e-06, - "loss": 0.4134044051170349, - "mean_token_accuracy": 0.8472093343734741, - "num_tokens": 17035250.0, - "step": 1904 - }, - { - "epoch": 1.4475683890577509, - "grad_norm": 2.0668535232543945, - "learning_rate": 2.8527714472893866e-06, - "loss": 0.44095730781555176, - "mean_token_accuracy": 0.881983757019043, - "num_tokens": 17042170.0, - "step": 1905 - }, - { - "epoch": 1.4483282674772036, - "grad_norm": 1.1620599031448364, - "learning_rate": 2.85069788488933e-06, - "loss": 0.3607163429260254, - "mean_token_accuracy": 0.8684282898902893, - "num_tokens": 17061937.0, - "step": 1906 - }, - { - "epoch": 1.4490881458966565, - "grad_norm": 2.1316568851470947, - "learning_rate": 2.8486240763559984e-06, - "loss": 0.3478124141693115, - "mean_token_accuracy": 0.8772403001785278, - "num_tokens": 17068628.0, - "step": 1907 - }, - { - "epoch": 1.4498480243161094, - "grad_norm": 2.4756391048431396, - "learning_rate": 2.8465500231448707e-06, - "loss": 0.46441152691841125, - "mean_token_accuracy": 0.8436450958251953, - "num_tokens": 17075495.0, - "step": 1908 - }, - { - "epoch": 1.4506079027355623, - "grad_norm": 2.249720573425293, - "learning_rate": 2.844475726711595e-06, - "loss": 0.41565513610839844, - "mean_token_accuracy": 0.8525094985961914, - "num_tokens": 17080940.0, - "step": 1909 - }, - { - "epoch": 1.4513677811550152, - "grad_norm": 2.3081841468811035, - "learning_rate": 2.8424011885119956e-06, - "loss": 0.49903199076652527, - "mean_token_accuracy": 0.8212426900863647, - "num_tokens": 17092024.0, - "step": 1910 - }, - { - "epoch": 1.452127659574468, - "grad_norm": 1.2929959297180176, - "learning_rate": 2.8403264100020613e-06, - "loss": 0.47038257122039795, - "mean_token_accuracy": 0.8319816589355469, - "num_tokens": 17108840.0, - "step": 1911 - }, - { - "epoch": 1.452887537993921, - "grad_norm": 1.6476463079452515, - "learning_rate": 2.8382513926379508e-06, - "loss": 0.42287829518318176, - "mean_token_accuracy": 0.8555682897567749, - "num_tokens": 17119704.0, - "step": 1912 - }, - { - "epoch": 1.453647416413374, - "grad_norm": 1.759998083114624, - "learning_rate": 2.836176137875993e-06, - "loss": 0.40904951095581055, - "mean_token_accuracy": 0.8698266744613647, - "num_tokens": 17130676.0, - "step": 1913 - }, - { - "epoch": 1.4544072948328268, - "grad_norm": 1.510909914970398, - "learning_rate": 2.8341006471726817e-06, - "loss": 0.47834792733192444, - "mean_token_accuracy": 0.8335825204849243, - "num_tokens": 17146304.0, - "step": 1914 - }, - { - "epoch": 1.4551671732522795, - "grad_norm": 3.538071632385254, - "learning_rate": 2.832024921984674e-06, - "loss": 0.34059035778045654, - "mean_token_accuracy": 0.8769031763076782, - "num_tokens": 17150458.0, - "step": 1915 - }, - { - "epoch": 1.4559270516717326, - "grad_norm": 2.3368659019470215, - "learning_rate": 2.8299489637687955e-06, - "loss": 0.43068382143974304, - "mean_token_accuracy": 0.845360517501831, - "num_tokens": 17157368.0, - "step": 1916 - }, - { - "epoch": 1.4566869300911853, - "grad_norm": 1.8720396757125854, - "learning_rate": 2.8278727739820334e-06, - "loss": 0.37013399600982666, - "mean_token_accuracy": 0.854241132736206, - "num_tokens": 17166325.0, - "step": 1917 - }, - { - "epoch": 1.4574468085106382, - "grad_norm": 1.6706892251968384, - "learning_rate": 2.825796354081537e-06, - "loss": 0.5397020578384399, - "mean_token_accuracy": 0.8309713006019592, - "num_tokens": 17178920.0, - "step": 1918 - }, - { - "epoch": 1.4582066869300911, - "grad_norm": 2.729210376739502, - "learning_rate": 2.8237197055246175e-06, - "loss": 0.25137859582901, - "mean_token_accuracy": 0.9148792028427124, - "num_tokens": 17183107.0, - "step": 1919 - }, - { - "epoch": 1.458966565349544, - "grad_norm": 3.023500680923462, - "learning_rate": 2.821642829768748e-06, - "loss": 0.43312495946884155, - "mean_token_accuracy": 0.8481811285018921, - "num_tokens": 17187853.0, - "step": 1920 - }, - { - "epoch": 1.459726443768997, - "grad_norm": 1.8108519315719604, - "learning_rate": 2.8195657282715595e-06, - "loss": 0.5101792216300964, - "mean_token_accuracy": 0.8315553069114685, - "num_tokens": 17199247.0, - "step": 1921 - }, - { - "epoch": 1.4604863221884499, - "grad_norm": 2.0262672901153564, - "learning_rate": 2.817488402490841e-06, - "loss": 0.4449934959411621, - "mean_token_accuracy": 0.8634527325630188, - "num_tokens": 17206348.0, - "step": 1922 - }, - { - "epoch": 1.4612462006079028, - "grad_norm": 2.6163926124572754, - "learning_rate": 2.8154108538845405e-06, - "loss": 0.43052345514297485, - "mean_token_accuracy": 0.8375401496887207, - "num_tokens": 17211702.0, - "step": 1923 - }, - { - "epoch": 1.4620060790273557, - "grad_norm": 2.0854408740997314, - "learning_rate": 2.813333083910761e-06, - "loss": 0.5011380910873413, - "mean_token_accuracy": 0.8359915018081665, - "num_tokens": 17219096.0, - "step": 1924 - }, - { - "epoch": 1.4627659574468086, - "grad_norm": 2.2081687450408936, - "learning_rate": 2.8112550940277615e-06, - "loss": 0.5239193439483643, - "mean_token_accuracy": 0.8499593734741211, - "num_tokens": 17229266.0, - "step": 1925 - }, - { - "epoch": 1.4635258358662613, - "grad_norm": 1.798343539237976, - "learning_rate": 2.809176885693956e-06, - "loss": 0.4515029191970825, - "mean_token_accuracy": 0.8400485515594482, - "num_tokens": 17239280.0, - "step": 1926 - }, - { - "epoch": 1.4642857142857144, - "grad_norm": 1.897887945175171, - "learning_rate": 2.807098460367911e-06, - "loss": 0.35935714840888977, - "mean_token_accuracy": 0.8776072263717651, - "num_tokens": 17247132.0, - "step": 1927 - }, - { - "epoch": 1.465045592705167, - "grad_norm": 2.705836296081543, - "learning_rate": 2.8050198195083445e-06, - "loss": 0.3728443682193756, - "mean_token_accuracy": 0.8649885654449463, - "num_tokens": 17251865.0, - "step": 1928 - }, - { - "epoch": 1.46580547112462, - "grad_norm": 1.841178059577942, - "learning_rate": 2.802940964574127e-06, - "loss": 0.40604841709136963, - "mean_token_accuracy": 0.8537783622741699, - "num_tokens": 17260163.0, - "step": 1929 - }, - { - "epoch": 1.466565349544073, - "grad_norm": 2.7393605709075928, - "learning_rate": 2.800861897024279e-06, - "loss": 0.39346879720687866, - "mean_token_accuracy": 0.8628787994384766, - "num_tokens": 17264876.0, - "step": 1930 - }, - { - "epoch": 1.4673252279635258, - "grad_norm": 1.84367835521698, - "learning_rate": 2.798782618317971e-06, - "loss": 0.37411895394325256, - "mean_token_accuracy": 0.8605265617370605, - "num_tokens": 17273049.0, - "step": 1931 - }, - { - "epoch": 1.4680851063829787, - "grad_norm": 1.6546733379364014, - "learning_rate": 2.796703129914519e-06, - "loss": 0.4997844099998474, - "mean_token_accuracy": 0.8267433643341064, - "num_tokens": 17285074.0, - "step": 1932 - }, - { - "epoch": 1.4688449848024316, - "grad_norm": 2.2749221324920654, - "learning_rate": 2.79462343327339e-06, - "loss": 0.35453367233276367, - "mean_token_accuracy": 0.8746850490570068, - "num_tokens": 17290273.0, - "step": 1933 - }, - { - "epoch": 1.4696048632218845, - "grad_norm": 1.7142518758773804, - "learning_rate": 2.7925435298541944e-06, - "loss": 0.345878541469574, - "mean_token_accuracy": 0.8600981831550598, - "num_tokens": 17301045.0, - "step": 1934 - }, - { - "epoch": 1.4703647416413375, - "grad_norm": 3.163342237472534, - "learning_rate": 2.7904634211166877e-06, - "loss": 0.4356975853443146, - "mean_token_accuracy": 0.8460350036621094, - "num_tokens": 17305108.0, - "step": 1935 - }, - { - "epoch": 1.4711246200607904, - "grad_norm": 1.6377612352371216, - "learning_rate": 2.7883831085207707e-06, - "loss": 0.4459729790687561, - "mean_token_accuracy": 0.8463394641876221, - "num_tokens": 17315479.0, - "step": 1936 - }, - { - "epoch": 1.471884498480243, - "grad_norm": 1.865268588066101, - "learning_rate": 2.7863025935264876e-06, - "loss": 0.394723117351532, - "mean_token_accuracy": 0.864177942276001, - "num_tokens": 17324795.0, - "step": 1937 - }, - { - "epoch": 1.4726443768996962, - "grad_norm": 1.241937518119812, - "learning_rate": 2.784221877594024e-06, - "loss": 0.2752220630645752, - "mean_token_accuracy": 0.8998259902000427, - "num_tokens": 17338000.0, - "step": 1938 - }, - { - "epoch": 1.4734042553191489, - "grad_norm": 1.8013651371002197, - "learning_rate": 2.7821409621837042e-06, - "loss": 0.4251005947589874, - "mean_token_accuracy": 0.8518919348716736, - "num_tokens": 17347351.0, - "step": 1939 - }, - { - "epoch": 1.4741641337386018, - "grad_norm": 1.2902207374572754, - "learning_rate": 2.7800598487559976e-06, - "loss": 0.3640727400779724, - "mean_token_accuracy": 0.8592870235443115, - "num_tokens": 17362335.0, - "step": 1940 - }, - { - "epoch": 1.4749240121580547, - "grad_norm": 2.5427513122558594, - "learning_rate": 2.777978538771508e-06, - "loss": 0.38166797161102295, - "mean_token_accuracy": 0.8653234839439392, - "num_tokens": 17367733.0, - "step": 1941 - }, - { - "epoch": 1.4756838905775076, - "grad_norm": 1.7793641090393066, - "learning_rate": 2.7758970336909795e-06, - "loss": 0.3113783895969391, - "mean_token_accuracy": 0.8812868595123291, - "num_tokens": 17375267.0, - "step": 1942 - }, - { - "epoch": 1.4764437689969605, - "grad_norm": 3.4031741619110107, - "learning_rate": 2.7738153349752923e-06, - "loss": 0.4800986647605896, - "mean_token_accuracy": 0.8336698412895203, - "num_tokens": 17379549.0, - "step": 1943 - }, - { - "epoch": 1.4772036474164134, - "grad_norm": 1.3451651334762573, - "learning_rate": 2.7717334440854634e-06, - "loss": 0.3115345239639282, - "mean_token_accuracy": 0.908623218536377, - "num_tokens": 17394455.0, - "step": 1944 - }, - { - "epoch": 1.4779635258358663, - "grad_norm": 1.980919599533081, - "learning_rate": 2.7696513624826422e-06, - "loss": 0.391154944896698, - "mean_token_accuracy": 0.8650267720222473, - "num_tokens": 17401931.0, - "step": 1945 - }, - { - "epoch": 1.4787234042553192, - "grad_norm": 1.0118765830993652, - "learning_rate": 2.7675690916281158e-06, - "loss": 0.3157956600189209, - "mean_token_accuracy": 0.8827471733093262, - "num_tokens": 17424144.0, - "step": 1946 - }, - { - "epoch": 1.4794832826747721, - "grad_norm": 1.579654335975647, - "learning_rate": 2.7654866329833e-06, - "loss": 0.4578486382961273, - "mean_token_accuracy": 0.8361750245094299, - "num_tokens": 17435769.0, - "step": 1947 - }, - { - "epoch": 1.4802431610942248, - "grad_norm": 1.7706717252731323, - "learning_rate": 2.763403988009746e-06, - "loss": 0.3564416170120239, - "mean_token_accuracy": 0.8689201474189758, - "num_tokens": 17444088.0, - "step": 1948 - }, - { - "epoch": 1.4810030395136777, - "grad_norm": 1.2264244556427002, - "learning_rate": 2.761321158169134e-06, - "loss": 0.30763837695121765, - "mean_token_accuracy": 0.8960219621658325, - "num_tokens": 17458096.0, - "step": 1949 - }, - { - "epoch": 1.4817629179331306, - "grad_norm": 1.214431881904602, - "learning_rate": 2.759238144923274e-06, - "loss": 0.49099457263946533, - "mean_token_accuracy": 0.8279136419296265, - "num_tokens": 17481062.0, - "step": 1950 - }, - { - "epoch": 1.4825227963525835, - "grad_norm": 1.593892216682434, - "learning_rate": 2.7571549497341044e-06, - "loss": 0.3745320737361908, - "mean_token_accuracy": 0.8690779209136963, - "num_tokens": 17490874.0, - "step": 1951 - }, - { - "epoch": 1.4832826747720365, - "grad_norm": 2.409924268722534, - "learning_rate": 2.755071574063692e-06, - "loss": 0.4310247600078583, - "mean_token_accuracy": 0.8521159291267395, - "num_tokens": 17496942.0, - "step": 1952 - }, - { - "epoch": 1.4840425531914894, - "grad_norm": 1.2557463645935059, - "learning_rate": 2.7529880193742297e-06, - "loss": 0.34304720163345337, - "mean_token_accuracy": 0.8748183250427246, - "num_tokens": 17514391.0, - "step": 1953 - }, - { - "epoch": 1.4848024316109423, - "grad_norm": 1.17310631275177, - "learning_rate": 2.7509042871280373e-06, - "loss": 0.3835817277431488, - "mean_token_accuracy": 0.8853274583816528, - "num_tokens": 17533289.0, - "step": 1954 - }, - { - "epoch": 1.4855623100303952, - "grad_norm": 1.5261479616165161, - "learning_rate": 2.748820378787558e-06, - "loss": 0.4799988865852356, - "mean_token_accuracy": 0.8252149820327759, - "num_tokens": 17544118.0, - "step": 1955 - }, - { - "epoch": 1.486322188449848, - "grad_norm": 2.030930757522583, - "learning_rate": 2.7467362958153585e-06, - "loss": 0.35690805315971375, - "mean_token_accuracy": 0.8959587216377258, - "num_tokens": 17550431.0, - "step": 1956 - }, - { - "epoch": 1.4870820668693008, - "grad_norm": 2.376520872116089, - "learning_rate": 2.7446520396741293e-06, - "loss": 0.262234091758728, - "mean_token_accuracy": 0.9054547548294067, - "num_tokens": 17554853.0, - "step": 1957 - }, - { - "epoch": 1.487841945288754, - "grad_norm": 1.6944479942321777, - "learning_rate": 2.742567611826681e-06, - "loss": 0.529259979724884, - "mean_token_accuracy": 0.8195339441299438, - "num_tokens": 17568016.0, - "step": 1958 - }, - { - "epoch": 1.4886018237082066, - "grad_norm": 2.833029270172119, - "learning_rate": 2.7404830137359445e-06, - "loss": 0.30229634046554565, - "mean_token_accuracy": 0.8933001756668091, - "num_tokens": 17572587.0, - "step": 1959 - }, - { - "epoch": 1.4893617021276595, - "grad_norm": 1.7040144205093384, - "learning_rate": 2.7383982468649715e-06, - "loss": 0.3166356682777405, - "mean_token_accuracy": 0.8871906399726868, - "num_tokens": 17580966.0, - "step": 1960 - }, - { - "epoch": 1.4901215805471124, - "grad_norm": 1.7539052963256836, - "learning_rate": 2.7363133126769326e-06, - "loss": 0.4231064021587372, - "mean_token_accuracy": 0.8708304166793823, - "num_tokens": 17590907.0, - "step": 1961 - }, - { - "epoch": 1.4908814589665653, - "grad_norm": 1.6198650598526, - "learning_rate": 2.7342282126351145e-06, - "loss": 0.4198967218399048, - "mean_token_accuracy": 0.8723280429840088, - "num_tokens": 17604291.0, - "step": 1962 - }, - { - "epoch": 1.4916413373860182, - "grad_norm": 1.8437711000442505, - "learning_rate": 2.73214294820292e-06, - "loss": 0.38923323154449463, - "mean_token_accuracy": 0.8697006106376648, - "num_tokens": 17612291.0, - "step": 1963 - }, - { - "epoch": 1.4924012158054711, - "grad_norm": 1.1129369735717773, - "learning_rate": 2.7300575208438684e-06, - "loss": 0.3107512593269348, - "mean_token_accuracy": 0.878618597984314, - "num_tokens": 17630073.0, - "step": 1964 - }, - { - "epoch": 1.493161094224924, - "grad_norm": 3.0210442543029785, - "learning_rate": 2.7279719320215924e-06, - "loss": 0.4630751609802246, - "mean_token_accuracy": 0.8567075729370117, - "num_tokens": 17634758.0, - "step": 1965 - }, - { - "epoch": 1.493920972644377, - "grad_norm": 2.8825972080230713, - "learning_rate": 2.725886183199839e-06, - "loss": 0.35351765155792236, - "mean_token_accuracy": 0.8711981773376465, - "num_tokens": 17639613.0, - "step": 1966 - }, - { - "epoch": 1.4946808510638299, - "grad_norm": 2.111238718032837, - "learning_rate": 2.723800275842468e-06, - "loss": 0.3529569208621979, - "mean_token_accuracy": 0.8679244518280029, - "num_tokens": 17645308.0, - "step": 1967 - }, - { - "epoch": 1.4954407294832825, - "grad_norm": 2.080509901046753, - "learning_rate": 2.7217142114134466e-06, - "loss": 0.43321219086647034, - "mean_token_accuracy": 0.8848220109939575, - "num_tokens": 17652292.0, - "step": 1968 - }, - { - "epoch": 1.4962006079027357, - "grad_norm": 2.8686363697052, - "learning_rate": 2.7196279913768587e-06, - "loss": 0.417035311460495, - "mean_token_accuracy": 0.8724601864814758, - "num_tokens": 17656908.0, - "step": 1969 - }, - { - "epoch": 1.4969604863221884, - "grad_norm": 3.294193744659424, - "learning_rate": 2.717541617196891e-06, - "loss": 0.3551934063434601, - "mean_token_accuracy": 0.8838565349578857, - "num_tokens": 17660590.0, - "step": 1970 - }, - { - "epoch": 1.4977203647416413, - "grad_norm": 1.766292929649353, - "learning_rate": 2.7154550903378425e-06, - "loss": 0.36521971225738525, - "mean_token_accuracy": 0.8810199499130249, - "num_tokens": 17668214.0, - "step": 1971 - }, - { - "epoch": 1.4984802431610942, - "grad_norm": 1.2127676010131836, - "learning_rate": 2.713368412264118e-06, - "loss": 0.35184425115585327, - "mean_token_accuracy": 0.8672580718994141, - "num_tokens": 17684736.0, - "step": 1972 - }, - { - "epoch": 1.499240121580547, - "grad_norm": 2.268256664276123, - "learning_rate": 2.711281584440228e-06, - "loss": 0.40115267038345337, - "mean_token_accuracy": 0.8517841100692749, - "num_tokens": 17691510.0, - "step": 1973 - }, - { - "epoch": 1.5, - "grad_norm": 2.7196054458618164, - "learning_rate": 2.70919460833079e-06, - "loss": 0.3819037675857544, - "mean_token_accuracy": 0.8765411376953125, - "num_tokens": 17696179.0, - "step": 1974 - }, - { - "epoch": 1.500759878419453, - "grad_norm": 2.969406843185425, - "learning_rate": 2.7071074854005206e-06, - "loss": 0.3922455608844757, - "mean_token_accuracy": 0.8796037435531616, - "num_tokens": 17700597.0, - "step": 1975 - }, - { - "epoch": 1.5015197568389058, - "grad_norm": 2.2965853214263916, - "learning_rate": 2.705020217114248e-06, - "loss": 0.5433666110038757, - "mean_token_accuracy": 0.809639036655426, - "num_tokens": 17708895.0, - "step": 1976 - }, - { - "epoch": 1.5022796352583585, - "grad_norm": 1.5584394931793213, - "learning_rate": 2.7029328049368942e-06, - "loss": 0.4736343324184418, - "mean_token_accuracy": 0.8197190761566162, - "num_tokens": 17725202.0, - "step": 1977 - }, - { - "epoch": 1.5030395136778116, - "grad_norm": 1.3903142213821411, - "learning_rate": 2.700845250333486e-06, - "loss": 0.4471571445465088, - "mean_token_accuracy": 0.839043140411377, - "num_tokens": 17742835.0, - "step": 1978 - }, - { - "epoch": 1.5037993920972643, - "grad_norm": 3.080716609954834, - "learning_rate": 2.69875755476915e-06, - "loss": 0.45760005712509155, - "mean_token_accuracy": 0.8366328477859497, - "num_tokens": 17747324.0, - "step": 1979 - }, - { - "epoch": 1.5045592705167175, - "grad_norm": 1.0150405168533325, - "learning_rate": 2.696669719709111e-06, - "loss": 0.33638954162597656, - "mean_token_accuracy": 0.8591676354408264, - "num_tokens": 17765565.0, - "step": 1980 - }, - { - "epoch": 1.5053191489361701, - "grad_norm": 2.402927875518799, - "learning_rate": 2.694581746618691e-06, - "loss": 0.4086601436138153, - "mean_token_accuracy": 0.8769911527633667, - "num_tokens": 17771275.0, - "step": 1981 - }, - { - "epoch": 1.506079027355623, - "grad_norm": 2.030583381652832, - "learning_rate": 2.6924936369633126e-06, - "loss": 0.5115457773208618, - "mean_token_accuracy": 0.8054746389389038, - "num_tokens": 17779999.0, - "step": 1982 - }, - { - "epoch": 1.506838905775076, - "grad_norm": 2.575199604034424, - "learning_rate": 2.6904053922084893e-06, - "loss": 0.363183856010437, - "mean_token_accuracy": 0.8716042637825012, - "num_tokens": 17785473.0, - "step": 1983 - }, - { - "epoch": 1.5075987841945289, - "grad_norm": 1.8497480154037476, - "learning_rate": 2.688317013819832e-06, - "loss": 0.4254384934902191, - "mean_token_accuracy": 0.8549597263336182, - "num_tokens": 17793812.0, - "step": 1984 - }, - { - "epoch": 1.5083586626139818, - "grad_norm": 1.7786511182785034, - "learning_rate": 2.686228503263045e-06, - "loss": 0.33400774002075195, - "mean_token_accuracy": 0.9027615189552307, - "num_tokens": 17801783.0, - "step": 1985 - }, - { - "epoch": 1.5091185410334347, - "grad_norm": 1.8365367650985718, - "learning_rate": 2.684139862003927e-06, - "loss": 0.35765063762664795, - "mean_token_accuracy": 0.8663736581802368, - "num_tokens": 17809562.0, - "step": 1986 - }, - { - "epoch": 1.5098784194528876, - "grad_norm": 1.8817477226257324, - "learning_rate": 2.682051091508365e-06, - "loss": 0.4627506732940674, - "mean_token_accuracy": 0.8358862400054932, - "num_tokens": 17819094.0, - "step": 1987 - }, - { - "epoch": 1.5106382978723403, - "grad_norm": 2.221547842025757, - "learning_rate": 2.679962193242338e-06, - "loss": 0.577020525932312, - "mean_token_accuracy": 0.80013108253479, - "num_tokens": 17826666.0, - "step": 1988 - }, - { - "epoch": 1.5113981762917934, - "grad_norm": 2.6618270874023438, - "learning_rate": 2.6778731686719177e-06, - "loss": 0.44632256031036377, - "mean_token_accuracy": 0.8611289262771606, - "num_tokens": 17833172.0, - "step": 1989 - }, - { - "epoch": 1.512158054711246, - "grad_norm": 2.9495689868927, - "learning_rate": 2.67578401926326e-06, - "loss": 0.3482511043548584, - "mean_token_accuracy": 0.8703314661979675, - "num_tokens": 17837220.0, - "step": 1990 - }, - { - "epoch": 1.5129179331306992, - "grad_norm": 2.0943644046783447, - "learning_rate": 2.6736947464826107e-06, - "loss": 0.2354314625263214, - "mean_token_accuracy": 0.9137634038925171, - "num_tokens": 17842712.0, - "step": 1991 - }, - { - "epoch": 1.513677811550152, - "grad_norm": 1.1303033828735352, - "learning_rate": 2.671605351796302e-06, - "loss": 0.3624761700630188, - "mean_token_accuracy": 0.8769594430923462, - "num_tokens": 17860902.0, - "step": 1992 - }, - { - "epoch": 1.5144376899696048, - "grad_norm": 2.8921146392822266, - "learning_rate": 2.6695158366707526e-06, - "loss": 0.2517220973968506, - "mean_token_accuracy": 0.8974182605743408, - "num_tokens": 17865160.0, - "step": 1993 - }, - { - "epoch": 1.5151975683890577, - "grad_norm": 2.320587158203125, - "learning_rate": 2.667426202572463e-06, - "loss": 0.4589889943599701, - "mean_token_accuracy": 0.8379613161087036, - "num_tokens": 17871994.0, - "step": 1994 - }, - { - "epoch": 1.5159574468085106, - "grad_norm": 1.1407674551010132, - "learning_rate": 2.665336450968019e-06, - "loss": 0.34412115812301636, - "mean_token_accuracy": 0.8776306509971619, - "num_tokens": 17889941.0, - "step": 1995 - }, - { - "epoch": 1.5167173252279635, - "grad_norm": 2.069814920425415, - "learning_rate": 2.6632465833240895e-06, - "loss": 0.47524404525756836, - "mean_token_accuracy": 0.830310046672821, - "num_tokens": 17898447.0, - "step": 1996 - }, - { - "epoch": 1.5174772036474165, - "grad_norm": 1.822415828704834, - "learning_rate": 2.661156601107424e-06, - "loss": 0.4541318416595459, - "mean_token_accuracy": 0.8856616020202637, - "num_tokens": 17908729.0, - "step": 1997 - }, - { - "epoch": 1.5182370820668694, - "grad_norm": 2.851428985595703, - "learning_rate": 2.659066505784852e-06, - "loss": 0.41761666536331177, - "mean_token_accuracy": 0.8710572719573975, - "num_tokens": 17913860.0, - "step": 1998 - }, - { - "epoch": 1.518996960486322, - "grad_norm": 1.8483710289001465, - "learning_rate": 2.6569762988232838e-06, - "loss": 0.45517268776893616, - "mean_token_accuracy": 0.8411115407943726, - "num_tokens": 17923497.0, - "step": 1999 - }, - { - "epoch": 1.5197568389057752, - "grad_norm": 1.9044219255447388, - "learning_rate": 2.654885981689706e-06, - "loss": 0.42533189058303833, - "mean_token_accuracy": 0.8597894906997681, - "num_tokens": 17932670.0, - "step": 2000 - }, - { - "epoch": 1.5205167173252279, - "grad_norm": 1.8170348405838013, - "learning_rate": 2.652795555851184e-06, - "loss": 0.4009692072868347, - "mean_token_accuracy": 0.8553036451339722, - "num_tokens": 17941616.0, - "step": 2001 - }, - { - "epoch": 1.521276595744681, - "grad_norm": 1.4704090356826782, - "learning_rate": 2.6507050227748595e-06, - "loss": 0.3732764720916748, - "mean_token_accuracy": 0.8788566589355469, - "num_tokens": 17957187.0, - "step": 2002 - }, - { - "epoch": 1.5220364741641337, - "grad_norm": 1.6681534051895142, - "learning_rate": 2.648614383927949e-06, - "loss": 0.341326504945755, - "mean_token_accuracy": 0.874875545501709, - "num_tokens": 17966668.0, - "step": 2003 - }, - { - "epoch": 1.5227963525835866, - "grad_norm": 1.8578619956970215, - "learning_rate": 2.646523640777741e-06, - "loss": 0.3937399983406067, - "mean_token_accuracy": 0.8656851053237915, - "num_tokens": 17976194.0, - "step": 2004 - }, - { - "epoch": 1.5235562310030395, - "grad_norm": 1.7520431280136108, - "learning_rate": 2.6444327947916037e-06, - "loss": 0.3392767906188965, - "mean_token_accuracy": 0.8799679279327393, - "num_tokens": 17984492.0, - "step": 2005 - }, - { - "epoch": 1.5243161094224924, - "grad_norm": 3.4649906158447266, - "learning_rate": 2.6423418474369707e-06, - "loss": 0.3451516032218933, - "mean_token_accuracy": 0.8753262758255005, - "num_tokens": 17988240.0, - "step": 2006 - }, - { - "epoch": 1.5250759878419453, - "grad_norm": 1.8037052154541016, - "learning_rate": 2.64025080018135e-06, - "loss": 0.34428173303604126, - "mean_token_accuracy": 0.8719067573547363, - "num_tokens": 17996644.0, - "step": 2007 - }, - { - "epoch": 1.5258358662613982, - "grad_norm": 1.743722677230835, - "learning_rate": 2.6381596544923184e-06, - "loss": 0.4446655213832855, - "mean_token_accuracy": 0.8612518906593323, - "num_tokens": 18005109.0, - "step": 2008 - }, - { - "epoch": 1.5265957446808511, - "grad_norm": 1.3357981443405151, - "learning_rate": 2.636068411837523e-06, - "loss": 0.38647788763046265, - "mean_token_accuracy": 0.858294665813446, - "num_tokens": 18018193.0, - "step": 2009 - }, - { - "epoch": 1.5273556231003038, - "grad_norm": 1.4848440885543823, - "learning_rate": 2.6339770736846794e-06, - "loss": 0.3597261607646942, - "mean_token_accuracy": 0.8760983943939209, - "num_tokens": 18028959.0, - "step": 2010 - }, - { - "epoch": 1.528115501519757, - "grad_norm": 2.356933832168579, - "learning_rate": 2.6318856415015664e-06, - "loss": 0.2697138488292694, - "mean_token_accuracy": 0.9078473448753357, - "num_tokens": 18033946.0, - "step": 2011 - }, - { - "epoch": 1.5288753799392096, - "grad_norm": 1.964368224143982, - "learning_rate": 2.629794116756035e-06, - "loss": 0.41349685192108154, - "mean_token_accuracy": 0.8567900657653809, - "num_tokens": 18042724.0, - "step": 2012 - }, - { - "epoch": 1.5296352583586628, - "grad_norm": 1.5630402565002441, - "learning_rate": 2.627702500915995e-06, - "loss": 0.49310681223869324, - "mean_token_accuracy": 0.8229681253433228, - "num_tokens": 18054396.0, - "step": 2013 - }, - { - "epoch": 1.5303951367781155, - "grad_norm": 1.6657718420028687, - "learning_rate": 2.625610795449424e-06, - "loss": 0.4263935387134552, - "mean_token_accuracy": 0.8634918332099915, - "num_tokens": 18064347.0, - "step": 2014 - }, - { - "epoch": 1.5311550151975684, - "grad_norm": 1.3684180974960327, - "learning_rate": 2.6235190018243623e-06, - "loss": 0.2903984487056732, - "mean_token_accuracy": 0.8930408358573914, - "num_tokens": 18076826.0, - "step": 2015 - }, - { - "epoch": 1.5319148936170213, - "grad_norm": 1.635044813156128, - "learning_rate": 2.6214271215089106e-06, - "loss": 0.3066539168357849, - "mean_token_accuracy": 0.8912158012390137, - "num_tokens": 18085761.0, - "step": 2016 - }, - { - "epoch": 1.5326747720364742, - "grad_norm": 2.431518316268921, - "learning_rate": 2.6193351559712294e-06, - "loss": 0.31123271584510803, - "mean_token_accuracy": 0.8865828514099121, - "num_tokens": 18091715.0, - "step": 2017 - }, - { - "epoch": 1.533434650455927, - "grad_norm": 1.8317419290542603, - "learning_rate": 2.6172431066795428e-06, - "loss": 0.5042020082473755, - "mean_token_accuracy": 0.8245081901550293, - "num_tokens": 18102095.0, - "step": 2018 - }, - { - "epoch": 1.53419452887538, - "grad_norm": 3.4221980571746826, - "learning_rate": 2.6151509751021307e-06, - "loss": 0.2885819971561432, - "mean_token_accuracy": 0.8997149467468262, - "num_tokens": 18105456.0, - "step": 2019 - }, - { - "epoch": 1.534954407294833, - "grad_norm": 1.4435855150222778, - "learning_rate": 2.6130587627073315e-06, - "loss": 0.45573529601097107, - "mean_token_accuracy": 0.837191104888916, - "num_tokens": 18119039.0, - "step": 2020 - }, - { - "epoch": 1.5357142857142856, - "grad_norm": 1.5748237371444702, - "learning_rate": 2.6109664709635413e-06, - "loss": 0.4561889171600342, - "mean_token_accuracy": 0.8334558010101318, - "num_tokens": 18132150.0, - "step": 2021 - }, - { - "epoch": 1.5364741641337387, - "grad_norm": 2.8278751373291016, - "learning_rate": 2.60887410133921e-06, - "loss": 0.3495104908943176, - "mean_token_accuracy": 0.8926796913146973, - "num_tokens": 18136528.0, - "step": 2022 - }, - { - "epoch": 1.5372340425531914, - "grad_norm": 2.5045573711395264, - "learning_rate": 2.606781655302843e-06, - "loss": 0.45362481474876404, - "mean_token_accuracy": 0.8379551768302917, - "num_tokens": 18142581.0, - "step": 2023 - }, - { - "epoch": 1.5379939209726445, - "grad_norm": 2.5984106063842773, - "learning_rate": 2.604689134322999e-06, - "loss": 0.4210243821144104, - "mean_token_accuracy": 0.8571645021438599, - "num_tokens": 18148152.0, - "step": 2024 - }, - { - "epoch": 1.5387537993920972, - "grad_norm": 1.7180702686309814, - "learning_rate": 2.602596539868292e-06, - "loss": 0.2478562295436859, - "mean_token_accuracy": 0.9227135181427002, - "num_tokens": 18155435.0, - "step": 2025 - }, - { - "epoch": 1.5395136778115501, - "grad_norm": 2.3721933364868164, - "learning_rate": 2.6005038734073833e-06, - "loss": 0.3820664584636688, - "mean_token_accuracy": 0.8788443803787231, - "num_tokens": 18161403.0, - "step": 2026 - }, - { - "epoch": 1.540273556231003, - "grad_norm": 1.4967509508132935, - "learning_rate": 2.5984111364089875e-06, - "loss": 0.34247124195098877, - "mean_token_accuracy": 0.8809049129486084, - "num_tokens": 18173724.0, - "step": 2027 - }, - { - "epoch": 1.541033434650456, - "grad_norm": 2.5226845741271973, - "learning_rate": 2.5963183303418682e-06, - "loss": 0.2647642493247986, - "mean_token_accuracy": 0.8988642692565918, - "num_tokens": 18178927.0, - "step": 2028 - }, - { - "epoch": 1.5417933130699089, - "grad_norm": 2.217228412628174, - "learning_rate": 2.594225456674837e-06, - "loss": 0.37754058837890625, - "mean_token_accuracy": 0.8660204410552979, - "num_tokens": 18185268.0, - "step": 2029 - }, - { - "epoch": 1.5425531914893615, - "grad_norm": 2.336409091949463, - "learning_rate": 2.592132516876753e-06, - "loss": 0.45098528265953064, - "mean_token_accuracy": 0.842115044593811, - "num_tokens": 18192372.0, - "step": 2030 - }, - { - "epoch": 1.5433130699088147, - "grad_norm": 3.5437142848968506, - "learning_rate": 2.5900395124165216e-06, - "loss": 0.5326460003852844, - "mean_token_accuracy": 0.8125103712081909, - "num_tokens": 18199182.0, - "step": 2031 - }, - { - "epoch": 1.5440729483282674, - "grad_norm": 1.5785651206970215, - "learning_rate": 2.5879464447630947e-06, - "loss": 0.3714991509914398, - "mean_token_accuracy": 0.8711390495300293, - "num_tokens": 18209045.0, - "step": 2032 - }, - { - "epoch": 1.5448328267477205, - "grad_norm": 2.3616182804107666, - "learning_rate": 2.5858533153854676e-06, - "loss": 0.4548399746417999, - "mean_token_accuracy": 0.8411449193954468, - "num_tokens": 18215487.0, - "step": 2033 - }, - { - "epoch": 1.5455927051671732, - "grad_norm": 2.0750479698181152, - "learning_rate": 2.583760125752679e-06, - "loss": 0.3980535566806793, - "mean_token_accuracy": 0.8603327870368958, - "num_tokens": 18222606.0, - "step": 2034 - }, - { - "epoch": 1.5463525835866263, - "grad_norm": 2.609295129776001, - "learning_rate": 2.58166687733381e-06, - "loss": 0.40177756547927856, - "mean_token_accuracy": 0.8652099370956421, - "num_tokens": 18227341.0, - "step": 2035 - }, - { - "epoch": 1.547112462006079, - "grad_norm": 2.1621339321136475, - "learning_rate": 2.5795735715979826e-06, - "loss": 0.45104342699050903, - "mean_token_accuracy": 0.8481369018554688, - "num_tokens": 18235820.0, - "step": 2036 - }, - { - "epoch": 1.547872340425532, - "grad_norm": 1.0381370782852173, - "learning_rate": 2.577480210014359e-06, - "loss": 0.32621103525161743, - "mean_token_accuracy": 0.8867391347885132, - "num_tokens": 18258307.0, - "step": 2037 - }, - { - "epoch": 1.5486322188449848, - "grad_norm": 1.7634375095367432, - "learning_rate": 2.575386794052142e-06, - "loss": 0.5115169882774353, - "mean_token_accuracy": 0.818779468536377, - "num_tokens": 18272782.0, - "step": 2038 - }, - { - "epoch": 1.5493920972644377, - "grad_norm": 1.874875545501709, - "learning_rate": 2.5732933251805716e-06, - "loss": 0.4381459951400757, - "mean_token_accuracy": 0.8594684600830078, - "num_tokens": 18282618.0, - "step": 2039 - }, - { - "epoch": 1.5501519756838906, - "grad_norm": 2.1316351890563965, - "learning_rate": 2.571199804868923e-06, - "loss": 0.5410124063491821, - "mean_token_accuracy": 0.8247587084770203, - "num_tokens": 18289750.0, - "step": 2040 - }, - { - "epoch": 1.5509118541033433, - "grad_norm": 1.7574573755264282, - "learning_rate": 2.569106234586511e-06, - "loss": 0.29967373609542847, - "mean_token_accuracy": 0.8913218975067139, - "num_tokens": 18298110.0, - "step": 2041 - }, - { - "epoch": 1.5516717325227964, - "grad_norm": 1.929626703262329, - "learning_rate": 2.5670126158026843e-06, - "loss": 0.3287760019302368, - "mean_token_accuracy": 0.8870488405227661, - "num_tokens": 18305702.0, - "step": 2042 - }, - { - "epoch": 1.5524316109422491, - "grad_norm": 3.020153284072876, - "learning_rate": 2.5649189499868233e-06, - "loss": 0.38523542881011963, - "mean_token_accuracy": 0.854824960231781, - "num_tokens": 18309830.0, - "step": 2043 - }, - { - "epoch": 1.5531914893617023, - "grad_norm": 1.6378421783447266, - "learning_rate": 2.5628252386083443e-06, - "loss": 0.47371378540992737, - "mean_token_accuracy": 0.8627713918685913, - "num_tokens": 18322820.0, - "step": 2044 - }, - { - "epoch": 1.553951367781155, - "grad_norm": 1.3711130619049072, - "learning_rate": 2.560731483136694e-06, - "loss": 0.3319293260574341, - "mean_token_accuracy": 0.8704103231430054, - "num_tokens": 18335074.0, - "step": 2045 - }, - { - "epoch": 1.5547112462006079, - "grad_norm": 1.7589185237884521, - "learning_rate": 2.558637685041352e-06, - "loss": 0.4446021020412445, - "mean_token_accuracy": 0.8446722626686096, - "num_tokens": 18344115.0, - "step": 2046 - }, - { - "epoch": 1.5554711246200608, - "grad_norm": 2.5249195098876953, - "learning_rate": 2.5565438457918247e-06, - "loss": 0.4625541865825653, - "mean_token_accuracy": 0.8451195359230042, - "num_tokens": 18349235.0, - "step": 2047 - }, - { - "epoch": 1.5562310030395137, - "grad_norm": 1.0562543869018555, - "learning_rate": 2.5544499668576508e-06, - "loss": 0.33747735619544983, - "mean_token_accuracy": 0.8503615856170654, - "num_tokens": 18368253.0, - "step": 2048 - }, - { - "epoch": 1.5569908814589666, - "grad_norm": 2.9451215267181396, - "learning_rate": 2.5523560497083927e-06, - "loss": 0.3958815932273865, - "mean_token_accuracy": 0.8393744826316833, - "num_tokens": 18372887.0, - "step": 2049 - }, - { - "epoch": 1.5577507598784195, - "grad_norm": 1.3597660064697266, - "learning_rate": 2.5502620958136444e-06, - "loss": 0.46281275153160095, - "mean_token_accuracy": 0.8269470930099487, - "num_tokens": 18388074.0, - "step": 2050 - }, - { - "epoch": 1.5585106382978724, - "grad_norm": 3.269068717956543, - "learning_rate": 2.548168106643022e-06, - "loss": 0.2309008538722992, - "mean_token_accuracy": 0.9178205728530884, - "num_tokens": 18391406.0, - "step": 2051 - }, - { - "epoch": 1.559270516717325, - "grad_norm": 2.1459391117095947, - "learning_rate": 2.546074083666169e-06, - "loss": 0.4006733298301697, - "mean_token_accuracy": 0.8631902933120728, - "num_tokens": 18397497.0, - "step": 2052 - }, - { - "epoch": 1.5600303951367782, - "grad_norm": 1.4614566564559937, - "learning_rate": 2.5439800283527495e-06, - "loss": 0.40810418128967285, - "mean_token_accuracy": 0.8473483920097351, - "num_tokens": 18409474.0, - "step": 2053 - }, - { - "epoch": 1.560790273556231, - "grad_norm": 2.084808826446533, - "learning_rate": 2.541885942172454e-06, - "loss": 0.34967708587646484, - "mean_token_accuracy": 0.8707003593444824, - "num_tokens": 18416400.0, - "step": 2054 - }, - { - "epoch": 1.561550151975684, - "grad_norm": 1.90664541721344, - "learning_rate": 2.539791826594991e-06, - "loss": 0.37694251537323, - "mean_token_accuracy": 0.8704941272735596, - "num_tokens": 18424206.0, - "step": 2055 - }, - { - "epoch": 1.5623100303951367, - "grad_norm": 1.880176305770874, - "learning_rate": 2.537697683090093e-06, - "loss": 0.32510411739349365, - "mean_token_accuracy": 0.8848961591720581, - "num_tokens": 18431676.0, - "step": 2056 - }, - { - "epoch": 1.5630699088145896, - "grad_norm": 2.133375406265259, - "learning_rate": 2.5356035131275096e-06, - "loss": 0.30538493394851685, - "mean_token_accuracy": 0.8890067338943481, - "num_tokens": 18438014.0, - "step": 2057 - }, - { - "epoch": 1.5638297872340425, - "grad_norm": 2.3495655059814453, - "learning_rate": 2.5335093181770105e-06, - "loss": 0.3126775324344635, - "mean_token_accuracy": 0.8865689039230347, - "num_tokens": 18443604.0, - "step": 2058 - }, - { - "epoch": 1.5645896656534954, - "grad_norm": 2.37949538230896, - "learning_rate": 2.531415099708382e-06, - "loss": 0.3257793188095093, - "mean_token_accuracy": 0.8809669017791748, - "num_tokens": 18448654.0, - "step": 2059 - }, - { - "epoch": 1.5653495440729484, - "grad_norm": 1.8285472393035889, - "learning_rate": 2.5293208591914265e-06, - "loss": 0.32376936078071594, - "mean_token_accuracy": 0.8816431760787964, - "num_tokens": 18456619.0, - "step": 2060 - }, - { - "epoch": 1.5661094224924013, - "grad_norm": 2.3238534927368164, - "learning_rate": 2.5272265980959644e-06, - "loss": 0.40366506576538086, - "mean_token_accuracy": 0.8496750593185425, - "num_tokens": 18462788.0, - "step": 2061 - }, - { - "epoch": 1.5668693009118542, - "grad_norm": 1.8954942226409912, - "learning_rate": 2.525132317891827e-06, - "loss": 0.3405473828315735, - "mean_token_accuracy": 0.8849360942840576, - "num_tokens": 18470719.0, - "step": 2062 - }, - { - "epoch": 1.5676291793313069, - "grad_norm": 1.6268190145492554, - "learning_rate": 2.523038020048861e-06, - "loss": 0.3662685751914978, - "mean_token_accuracy": 0.8865662813186646, - "num_tokens": 18482095.0, - "step": 2063 - }, - { - "epoch": 1.56838905775076, - "grad_norm": 2.5198733806610107, - "learning_rate": 2.5209437060369266e-06, - "loss": 0.3968311548233032, - "mean_token_accuracy": 0.8643308281898499, - "num_tokens": 18488069.0, - "step": 2064 - }, - { - "epoch": 1.5691489361702127, - "grad_norm": 2.9197335243225098, - "learning_rate": 2.518849377325893e-06, - "loss": 0.24738386273384094, - "mean_token_accuracy": 0.91959547996521, - "num_tokens": 18491762.0, - "step": 2065 - }, - { - "epoch": 1.5699088145896658, - "grad_norm": 1.5914254188537598, - "learning_rate": 2.51675503538564e-06, - "loss": 0.33473581075668335, - "mean_token_accuracy": 0.8794662952423096, - "num_tokens": 18501316.0, - "step": 2066 - }, - { - "epoch": 1.5706686930091185, - "grad_norm": 2.5130460262298584, - "learning_rate": 2.5146606816860597e-06, - "loss": 0.4067240357398987, - "mean_token_accuracy": 0.8564209342002869, - "num_tokens": 18507169.0, - "step": 2067 - }, - { - "epoch": 1.5714285714285714, - "grad_norm": 2.093353509902954, - "learning_rate": 2.5125663176970475e-06, - "loss": 0.4312136769294739, - "mean_token_accuracy": 0.8540225028991699, - "num_tokens": 18514536.0, - "step": 2068 - }, - { - "epoch": 1.5721884498480243, - "grad_norm": 1.284495234489441, - "learning_rate": 2.5104719448885103e-06, - "loss": 0.3813856542110443, - "mean_token_accuracy": 0.8435653448104858, - "num_tokens": 18529947.0, - "step": 2069 - }, - { - "epoch": 1.5729483282674772, - "grad_norm": 2.0383973121643066, - "learning_rate": 2.5083775647303583e-06, - "loss": 0.4428079426288605, - "mean_token_accuracy": 0.8841741681098938, - "num_tokens": 18537109.0, - "step": 2070 - }, - { - "epoch": 1.5737082066869301, - "grad_norm": 1.7991697788238525, - "learning_rate": 2.5062831786925102e-06, - "loss": 0.460052490234375, - "mean_token_accuracy": 0.8459943532943726, - "num_tokens": 18547108.0, - "step": 2071 - }, - { - "epoch": 1.574468085106383, - "grad_norm": 2.2168822288513184, - "learning_rate": 2.5041887882448845e-06, - "loss": 0.2863885462284088, - "mean_token_accuracy": 0.906816840171814, - "num_tokens": 18552357.0, - "step": 2072 - }, - { - "epoch": 1.575227963525836, - "grad_norm": 3.918499708175659, - "learning_rate": 2.5020943948574056e-06, - "loss": 0.3439999222755432, - "mean_token_accuracy": 0.8742123246192932, - "num_tokens": 18555272.0, - "step": 2073 - }, - { - "epoch": 1.5759878419452886, - "grad_norm": 1.773869514465332, - "learning_rate": 2.5e-06, - "loss": 0.2815646827220917, - "mean_token_accuracy": 0.8939872980117798, - "num_tokens": 18562989.0, - "step": 2074 - }, - { - "epoch": 1.5767477203647418, - "grad_norm": 1.8675572872161865, - "learning_rate": 2.497905605142595e-06, - "loss": 0.5005829930305481, - "mean_token_accuracy": 0.8242729902267456, - "num_tokens": 18575587.0, - "step": 2075 - }, - { - "epoch": 1.5775075987841944, - "grad_norm": 2.3143508434295654, - "learning_rate": 2.4958112117551163e-06, - "loss": 0.42472895979881287, - "mean_token_accuracy": 0.8540043830871582, - "num_tokens": 18581666.0, - "step": 2076 - }, - { - "epoch": 1.5782674772036476, - "grad_norm": 2.529740333557129, - "learning_rate": 2.4937168213074906e-06, - "loss": 0.24539905786514282, - "mean_token_accuracy": 0.9041235446929932, - "num_tokens": 18585773.0, - "step": 2077 - }, - { - "epoch": 1.5790273556231003, - "grad_norm": 2.5188395977020264, - "learning_rate": 2.491622435269642e-06, - "loss": 0.23059265315532684, - "mean_token_accuracy": 0.9204603433609009, - "num_tokens": 18589915.0, - "step": 2078 - }, - { - "epoch": 1.5797872340425532, - "grad_norm": 2.7752444744110107, - "learning_rate": 2.489528055111491e-06, - "loss": 0.452225923538208, - "mean_token_accuracy": 0.8444918990135193, - "num_tokens": 18595488.0, - "step": 2079 - }, - { - "epoch": 1.580547112462006, - "grad_norm": 1.174774408340454, - "learning_rate": 2.487433682302953e-06, - "loss": 0.3399246633052826, - "mean_token_accuracy": 0.8608446717262268, - "num_tokens": 18613756.0, - "step": 2080 - }, - { - "epoch": 1.581306990881459, - "grad_norm": 1.515575647354126, - "learning_rate": 2.485339318313941e-06, - "loss": 0.45886170864105225, - "mean_token_accuracy": 0.8479131460189819, - "num_tokens": 18629610.0, - "step": 2081 - }, - { - "epoch": 1.582066869300912, - "grad_norm": 1.7039403915405273, - "learning_rate": 2.4832449646143605e-06, - "loss": 0.349803626537323, - "mean_token_accuracy": 0.8721815347671509, - "num_tokens": 18637523.0, - "step": 2082 - }, - { - "epoch": 1.5828267477203646, - "grad_norm": 3.2289421558380127, - "learning_rate": 2.4811506226741077e-06, - "loss": 0.4967171549797058, - "mean_token_accuracy": 0.8303675651550293, - "num_tokens": 18641826.0, - "step": 2083 - }, - { - "epoch": 1.5835866261398177, - "grad_norm": 1.71235990524292, - "learning_rate": 2.4790562939630738e-06, - "loss": 0.4202485680580139, - "mean_token_accuracy": 0.8581224679946899, - "num_tokens": 18653146.0, - "step": 2084 - }, - { - "epoch": 1.5843465045592704, - "grad_norm": 1.710036277770996, - "learning_rate": 2.4769619799511392e-06, - "loss": 0.3942421078681946, - "mean_token_accuracy": 0.8553562164306641, - "num_tokens": 18663826.0, - "step": 2085 - }, - { - "epoch": 1.5851063829787235, - "grad_norm": 1.464859127998352, - "learning_rate": 2.474867682108174e-06, - "loss": 0.4093329906463623, - "mean_token_accuracy": 0.8598780632019043, - "num_tokens": 18675325.0, - "step": 2086 - }, - { - "epoch": 1.5858662613981762, - "grad_norm": 2.083707809448242, - "learning_rate": 2.472773401904037e-06, - "loss": 0.4252093434333801, - "mean_token_accuracy": 0.8433356881141663, - "num_tokens": 18682416.0, - "step": 2087 - }, - { - "epoch": 1.5866261398176293, - "grad_norm": 1.5577973127365112, - "learning_rate": 2.470679140808574e-06, - "loss": 0.3680085241794586, - "mean_token_accuracy": 0.8609116077423096, - "num_tokens": 18694445.0, - "step": 2088 - }, - { - "epoch": 1.587386018237082, - "grad_norm": 2.1617276668548584, - "learning_rate": 2.4685849002916184e-06, - "loss": 0.40488749742507935, - "mean_token_accuracy": 0.8429721593856812, - "num_tokens": 18701204.0, - "step": 2089 - }, - { - "epoch": 1.588145896656535, - "grad_norm": 2.046678304672241, - "learning_rate": 2.4664906818229903e-06, - "loss": 0.329141229391098, - "mean_token_accuracy": 0.8830771446228027, - "num_tokens": 18708354.0, - "step": 2090 - }, - { - "epoch": 1.5889057750759878, - "grad_norm": 2.7741200923919678, - "learning_rate": 2.4643964868724916e-06, - "loss": 0.42294493317604065, - "mean_token_accuracy": 0.8612706065177917, - "num_tokens": 18713017.0, - "step": 2091 - }, - { - "epoch": 1.5896656534954408, - "grad_norm": 2.085151433944702, - "learning_rate": 2.4623023169099074e-06, - "loss": 0.39038220047950745, - "mean_token_accuracy": 0.861169695854187, - "num_tokens": 18721423.0, - "step": 2092 - }, - { - "epoch": 1.5904255319148937, - "grad_norm": 2.8721165657043457, - "learning_rate": 2.4602081734050093e-06, - "loss": 0.27753859758377075, - "mean_token_accuracy": 0.8959167003631592, - "num_tokens": 18725044.0, - "step": 2093 - }, - { - "epoch": 1.5911854103343464, - "grad_norm": 1.7388207912445068, - "learning_rate": 2.4581140578275473e-06, - "loss": 0.3570033311843872, - "mean_token_accuracy": 0.8715590238571167, - "num_tokens": 18733891.0, - "step": 2094 - }, - { - "epoch": 1.5919452887537995, - "grad_norm": 2.3645241260528564, - "learning_rate": 2.456019971647251e-06, - "loss": 0.38982006907463074, - "mean_token_accuracy": 0.8734139800071716, - "num_tokens": 18740464.0, - "step": 2095 - }, - { - "epoch": 1.5927051671732522, - "grad_norm": 3.674072027206421, - "learning_rate": 2.4539259163338317e-06, - "loss": 0.4068281650543213, - "mean_token_accuracy": 0.8397839069366455, - "num_tokens": 18744857.0, - "step": 2096 - }, - { - "epoch": 1.5934650455927053, - "grad_norm": 1.8209186792373657, - "learning_rate": 2.4518318933569786e-06, - "loss": 0.3471015691757202, - "mean_token_accuracy": 0.8709044456481934, - "num_tokens": 18752414.0, - "step": 2097 - }, - { - "epoch": 1.594224924012158, - "grad_norm": 1.8138704299926758, - "learning_rate": 2.449737904186357e-06, - "loss": 0.3438487648963928, - "mean_token_accuracy": 0.8766711950302124, - "num_tokens": 18760587.0, - "step": 2098 - }, - { - "epoch": 1.594984802431611, - "grad_norm": 1.7893842458724976, - "learning_rate": 2.447643950291608e-06, - "loss": 0.43519508838653564, - "mean_token_accuracy": 0.8682907819747925, - "num_tokens": 18770293.0, - "step": 2099 - }, - { - "epoch": 1.5957446808510638, - "grad_norm": 1.4305094480514526, - "learning_rate": 2.4455500331423505e-06, - "loss": 0.37106508016586304, - "mean_token_accuracy": 0.8611354827880859, - "num_tokens": 18782456.0, - "step": 2100 - }, - { - "epoch": 1.5965045592705167, - "grad_norm": 2.0797057151794434, - "learning_rate": 2.4434561542081765e-06, - "loss": 0.43942689895629883, - "mean_token_accuracy": 0.8477288484573364, - "num_tokens": 18789547.0, - "step": 2101 - }, - { - "epoch": 1.5972644376899696, - "grad_norm": 1.2983288764953613, - "learning_rate": 2.441362314958649e-06, - "loss": 0.46385765075683594, - "mean_token_accuracy": 0.8340978622436523, - "num_tokens": 18809456.0, - "step": 2102 - }, - { - "epoch": 1.5980243161094225, - "grad_norm": 2.60866641998291, - "learning_rate": 2.439268516863306e-06, - "loss": 0.3106239140033722, - "mean_token_accuracy": 0.8859497308731079, - "num_tokens": 18813781.0, - "step": 2103 - }, - { - "epoch": 1.5987841945288754, - "grad_norm": 3.389376163482666, - "learning_rate": 2.4371747613916566e-06, - "loss": 0.44926169514656067, - "mean_token_accuracy": 0.8664819002151489, - "num_tokens": 18817666.0, - "step": 2104 - }, - { - "epoch": 1.5995440729483281, - "grad_norm": 3.3417351245880127, - "learning_rate": 2.4350810500131776e-06, - "loss": 0.4786076545715332, - "mean_token_accuracy": 0.8357523679733276, - "num_tokens": 18823717.0, - "step": 2105 - }, - { - "epoch": 1.6003039513677813, - "grad_norm": 1.5215197801589966, - "learning_rate": 2.4329873841973174e-06, - "loss": 0.4123923182487488, - "mean_token_accuracy": 0.853337287902832, - "num_tokens": 18835163.0, - "step": 2106 - }, - { - "epoch": 1.601063829787234, - "grad_norm": 1.8798415660858154, - "learning_rate": 2.4308937654134893e-06, - "loss": 0.45594000816345215, - "mean_token_accuracy": 0.8553717732429504, - "num_tokens": 18843923.0, - "step": 2107 - }, - { - "epoch": 1.601823708206687, - "grad_norm": 2.1012487411499023, - "learning_rate": 2.428800195131078e-06, - "loss": 0.4340161085128784, - "mean_token_accuracy": 0.8448120355606079, - "num_tokens": 18851852.0, - "step": 2108 - }, - { - "epoch": 1.6025835866261398, - "grad_norm": 2.827080726623535, - "learning_rate": 2.4267066748194297e-06, - "loss": 0.25922513008117676, - "mean_token_accuracy": 0.9024698734283447, - "num_tokens": 18856113.0, - "step": 2109 - }, - { - "epoch": 1.6033434650455927, - "grad_norm": 1.641032338142395, - "learning_rate": 2.4246132059478582e-06, - "loss": 0.591558575630188, - "mean_token_accuracy": 0.7960667610168457, - "num_tokens": 18870618.0, - "step": 2110 - }, - { - "epoch": 1.6041033434650456, - "grad_norm": 2.600771188735962, - "learning_rate": 2.4225197899856416e-06, - "loss": 0.382815957069397, - "mean_token_accuracy": 0.8654585480690002, - "num_tokens": 18875456.0, - "step": 2111 - }, - { - "epoch": 1.6048632218844985, - "grad_norm": 1.5125449895858765, - "learning_rate": 2.4204264284020182e-06, - "loss": 0.4643454849720001, - "mean_token_accuracy": 0.837038516998291, - "num_tokens": 18887979.0, - "step": 2112 - }, - { - "epoch": 1.6056231003039514, - "grad_norm": 1.7571941614151, - "learning_rate": 2.4183331226661913e-06, - "loss": 0.30713701248168945, - "mean_token_accuracy": 0.8856921195983887, - "num_tokens": 18896143.0, - "step": 2113 - }, - { - "epoch": 1.6063829787234043, - "grad_norm": 2.124593496322632, - "learning_rate": 2.4162398742473216e-06, - "loss": 0.2873607575893402, - "mean_token_accuracy": 0.8986717462539673, - "num_tokens": 18902364.0, - "step": 2114 - }, - { - "epoch": 1.6071428571428572, - "grad_norm": 2.3496272563934326, - "learning_rate": 2.4141466846145332e-06, - "loss": 0.33715200424194336, - "mean_token_accuracy": 0.8816461563110352, - "num_tokens": 18908038.0, - "step": 2115 - }, - { - "epoch": 1.60790273556231, - "grad_norm": 1.2783573865890503, - "learning_rate": 2.4120535552369057e-06, - "loss": 0.45153388381004333, - "mean_token_accuracy": 0.8345640897750854, - "num_tokens": 18926687.0, - "step": 2116 - }, - { - "epoch": 1.608662613981763, - "grad_norm": 2.1481080055236816, - "learning_rate": 2.4099604875834796e-06, - "loss": 0.43976694345474243, - "mean_token_accuracy": 0.847899317741394, - "num_tokens": 18932974.0, - "step": 2117 - }, - { - "epoch": 1.6094224924012157, - "grad_norm": 1.8669065237045288, - "learning_rate": 2.407867483123248e-06, - "loss": 0.4649358093738556, - "mean_token_accuracy": 0.8310785293579102, - "num_tokens": 18942551.0, - "step": 2118 - }, - { - "epoch": 1.6101823708206688, - "grad_norm": 2.7667746543884277, - "learning_rate": 2.4057745433251637e-06, - "loss": 0.4542210102081299, - "mean_token_accuracy": 0.8450086116790771, - "num_tokens": 18947525.0, - "step": 2119 - }, - { - "epoch": 1.6109422492401215, - "grad_norm": 2.2865076065063477, - "learning_rate": 2.4036816696581326e-06, - "loss": 0.34291431307792664, - "mean_token_accuracy": 0.8741394281387329, - "num_tokens": 18952967.0, - "step": 2120 - }, - { - "epoch": 1.6117021276595744, - "grad_norm": 3.055197238922119, - "learning_rate": 2.401588863591013e-06, - "loss": 0.4686807692050934, - "mean_token_accuracy": 0.8440030217170715, - "num_tokens": 18958257.0, - "step": 2121 - }, - { - "epoch": 1.6124620060790273, - "grad_norm": 2.268456220626831, - "learning_rate": 2.3994961265926166e-06, - "loss": 0.440069317817688, - "mean_token_accuracy": 0.8534891605377197, - "num_tokens": 18964745.0, - "step": 2122 - }, - { - "epoch": 1.6132218844984803, - "grad_norm": 2.061185359954834, - "learning_rate": 2.3974034601317085e-06, - "loss": 0.4383159279823303, - "mean_token_accuracy": 0.8484808802604675, - "num_tokens": 18972136.0, - "step": 2123 - }, - { - "epoch": 1.6139817629179332, - "grad_norm": 1.5121275186538696, - "learning_rate": 2.3953108656770018e-06, - "loss": 0.42403632402420044, - "mean_token_accuracy": 0.8467602133750916, - "num_tokens": 18985353.0, - "step": 2124 - }, - { - "epoch": 1.614741641337386, - "grad_norm": 1.9965397119522095, - "learning_rate": 2.3932183446971584e-06, - "loss": 0.3915751576423645, - "mean_token_accuracy": 0.8622956275939941, - "num_tokens": 18992017.0, - "step": 2125 - }, - { - "epoch": 1.615501519756839, - "grad_norm": 1.6688618659973145, - "learning_rate": 2.3911258986607907e-06, - "loss": 0.468288391828537, - "mean_token_accuracy": 0.8372251987457275, - "num_tokens": 19001930.0, - "step": 2126 - }, - { - "epoch": 1.6162613981762917, - "grad_norm": 1.8984699249267578, - "learning_rate": 2.3890335290364596e-06, - "loss": 0.3082895278930664, - "mean_token_accuracy": 0.8815990686416626, - "num_tokens": 19009712.0, - "step": 2127 - }, - { - "epoch": 1.6170212765957448, - "grad_norm": 2.6934773921966553, - "learning_rate": 2.386941237292669e-06, - "loss": 0.48406022787094116, - "mean_token_accuracy": 0.8300775289535522, - "num_tokens": 19015212.0, - "step": 2128 - }, - { - "epoch": 1.6177811550151975, - "grad_norm": 1.6615487337112427, - "learning_rate": 2.3848490248978693e-06, - "loss": 0.45227736234664917, - "mean_token_accuracy": 0.8421006798744202, - "num_tokens": 19027115.0, - "step": 2129 - }, - { - "epoch": 1.6185410334346506, - "grad_norm": 1.4625248908996582, - "learning_rate": 2.3827568933204576e-06, - "loss": 0.4141014814376831, - "mean_token_accuracy": 0.8479453325271606, - "num_tokens": 19041103.0, - "step": 2130 - }, - { - "epoch": 1.6193009118541033, - "grad_norm": 1.856701135635376, - "learning_rate": 2.3806648440287715e-06, - "loss": 0.3440483808517456, - "mean_token_accuracy": 0.8978210687637329, - "num_tokens": 19048124.0, - "step": 2131 - }, - { - "epoch": 1.6200607902735562, - "grad_norm": 1.7056550979614258, - "learning_rate": 2.378572878491091e-06, - "loss": 0.4136195182800293, - "mean_token_accuracy": 0.8579289317131042, - "num_tokens": 19057113.0, - "step": 2132 - }, - { - "epoch": 1.6208206686930091, - "grad_norm": 1.4673033952713013, - "learning_rate": 2.376480998175638e-06, - "loss": 0.40176504850387573, - "mean_token_accuracy": 0.8677150011062622, - "num_tokens": 19068258.0, - "step": 2133 - }, - { - "epoch": 1.621580547112462, - "grad_norm": 2.12859845161438, - "learning_rate": 2.3743892045505764e-06, - "loss": 0.39754825830459595, - "mean_token_accuracy": 0.8486959934234619, - "num_tokens": 19075469.0, - "step": 2134 - }, - { - "epoch": 1.622340425531915, - "grad_norm": 1.474247694015503, - "learning_rate": 2.372297499084006e-06, - "loss": 0.3546760678291321, - "mean_token_accuracy": 0.8767229318618774, - "num_tokens": 19086744.0, - "step": 2135 - }, - { - "epoch": 1.6231003039513676, - "grad_norm": 1.9945709705352783, - "learning_rate": 2.3702058832439667e-06, - "loss": 0.4200798273086548, - "mean_token_accuracy": 0.8435655832290649, - "num_tokens": 19095903.0, - "step": 2136 - }, - { - "epoch": 1.6238601823708207, - "grad_norm": 2.71991229057312, - "learning_rate": 2.368114358498434e-06, - "loss": 0.44925457239151, - "mean_token_accuracy": 0.8348450660705566, - "num_tokens": 19100864.0, - "step": 2137 - }, - { - "epoch": 1.6246200607902734, - "grad_norm": 2.817664623260498, - "learning_rate": 2.366022926315322e-06, - "loss": 0.44386279582977295, - "mean_token_accuracy": 0.8739628791809082, - "num_tokens": 19105355.0, - "step": 2138 - }, - { - "epoch": 1.6253799392097266, - "grad_norm": 1.3673229217529297, - "learning_rate": 2.3639315881624776e-06, - "loss": 0.3693230152130127, - "mean_token_accuracy": 0.8698620796203613, - "num_tokens": 19116748.0, - "step": 2139 - }, - { - "epoch": 1.6261398176291793, - "grad_norm": 2.712531805038452, - "learning_rate": 2.361840345507683e-06, - "loss": 0.4442938268184662, - "mean_token_accuracy": 0.8433241844177246, - "num_tokens": 19121437.0, - "step": 2140 - }, - { - "epoch": 1.6268996960486324, - "grad_norm": 2.2885231971740723, - "learning_rate": 2.359749199818651e-06, - "loss": 0.4021872878074646, - "mean_token_accuracy": 0.8605252504348755, - "num_tokens": 19127633.0, - "step": 2141 - }, - { - "epoch": 1.627659574468085, - "grad_norm": 1.9257299900054932, - "learning_rate": 2.3576581525630297e-06, - "loss": 0.3577788472175598, - "mean_token_accuracy": 0.8691596388816833, - "num_tokens": 19134450.0, - "step": 2142 - }, - { - "epoch": 1.628419452887538, - "grad_norm": 1.5035467147827148, - "learning_rate": 2.355567205208397e-06, - "loss": 0.3800235986709595, - "mean_token_accuracy": 0.867794394493103, - "num_tokens": 19146149.0, - "step": 2143 - }, - { - "epoch": 1.6291793313069909, - "grad_norm": 2.110445737838745, - "learning_rate": 2.353476359222259e-06, - "loss": 0.34394145011901855, - "mean_token_accuracy": 0.8777303695678711, - "num_tokens": 19152017.0, - "step": 2144 - }, - { - "epoch": 1.6299392097264438, - "grad_norm": 1.1713787317276, - "learning_rate": 2.351385616072052e-06, - "loss": 0.4060516357421875, - "mean_token_accuracy": 0.8411345481872559, - "num_tokens": 19172089.0, - "step": 2145 - }, - { - "epoch": 1.6306990881458967, - "grad_norm": 1.7600529193878174, - "learning_rate": 2.3492949772251418e-06, - "loss": 0.5299694538116455, - "mean_token_accuracy": 0.8218191862106323, - "num_tokens": 19184041.0, - "step": 2146 - }, - { - "epoch": 1.6314589665653494, - "grad_norm": 1.7126617431640625, - "learning_rate": 2.3472044441488175e-06, - "loss": 0.38628721237182617, - "mean_token_accuracy": 0.8526935577392578, - "num_tokens": 19193101.0, - "step": 2147 - }, - { - "epoch": 1.6322188449848025, - "grad_norm": 1.210344672203064, - "learning_rate": 2.345114018310295e-06, - "loss": 0.2732373774051666, - "mean_token_accuracy": 0.8903822898864746, - "num_tokens": 19206697.0, - "step": 2148 - }, - { - "epoch": 1.6329787234042552, - "grad_norm": 1.6693075895309448, - "learning_rate": 2.3430237011767166e-06, - "loss": 0.3472709655761719, - "mean_token_accuracy": 0.8767187595367432, - "num_tokens": 19217008.0, - "step": 2149 - }, - { - "epoch": 1.6337386018237083, - "grad_norm": 1.5242515802383423, - "learning_rate": 2.3409334942151485e-06, - "loss": 0.4345507025718689, - "mean_token_accuracy": 0.8481311202049255, - "num_tokens": 19231573.0, - "step": 2150 - }, - { - "epoch": 1.634498480243161, - "grad_norm": 2.470122814178467, - "learning_rate": 2.3388433988925767e-06, - "loss": 0.4453052878379822, - "mean_token_accuracy": 0.8411355018615723, - "num_tokens": 19237076.0, - "step": 2151 - }, - { - "epoch": 1.635258358662614, - "grad_norm": 2.4177467823028564, - "learning_rate": 2.3367534166759105e-06, - "loss": 0.454534113407135, - "mean_token_accuracy": 0.8635509014129639, - "num_tokens": 19242890.0, - "step": 2152 - }, - { - "epoch": 1.6360182370820668, - "grad_norm": 2.8036744594573975, - "learning_rate": 2.3346635490319815e-06, - "loss": 0.4396413564682007, - "mean_token_accuracy": 0.8491836786270142, - "num_tokens": 19247492.0, - "step": 2153 - }, - { - "epoch": 1.6367781155015197, - "grad_norm": 1.9286335706710815, - "learning_rate": 2.3325737974275382e-06, - "loss": 0.34988659620285034, - "mean_token_accuracy": 0.8704243898391724, - "num_tokens": 19254966.0, - "step": 2154 - }, - { - "epoch": 1.6375379939209727, - "grad_norm": 1.8929904699325562, - "learning_rate": 2.3304841633292487e-06, - "loss": 0.4195491671562195, - "mean_token_accuracy": 0.857181966304779, - "num_tokens": 19263324.0, - "step": 2155 - }, - { - "epoch": 1.6382978723404256, - "grad_norm": 2.2598466873168945, - "learning_rate": 2.328394648203698e-06, - "loss": 0.37977826595306396, - "mean_token_accuracy": 0.8626722097396851, - "num_tokens": 19269363.0, - "step": 2156 - }, - { - "epoch": 1.6390577507598785, - "grad_norm": 1.8118126392364502, - "learning_rate": 2.32630525351739e-06, - "loss": 0.3532063364982605, - "mean_token_accuracy": 0.8677854537963867, - "num_tokens": 19277360.0, - "step": 2157 - }, - { - "epoch": 1.6398176291793312, - "grad_norm": 1.5216798782348633, - "learning_rate": 2.324215980736741e-06, - "loss": 0.38609349727630615, - "mean_token_accuracy": 0.8685325980186462, - "num_tokens": 19292159.0, - "step": 2158 - }, - { - "epoch": 1.6405775075987843, - "grad_norm": 3.0511462688446045, - "learning_rate": 2.3221268313280836e-06, - "loss": 0.21988365054130554, - "mean_token_accuracy": 0.9172534942626953, - "num_tokens": 19295735.0, - "step": 2159 - }, - { - "epoch": 1.641337386018237, - "grad_norm": 1.957828164100647, - "learning_rate": 2.320037806757662e-06, - "loss": 0.3868909478187561, - "mean_token_accuracy": 0.8605331182479858, - "num_tokens": 19303287.0, - "step": 2160 - }, - { - "epoch": 1.64209726443769, - "grad_norm": 2.590040922164917, - "learning_rate": 2.317948908491636e-06, - "loss": 0.3940129578113556, - "mean_token_accuracy": 0.8814224004745483, - "num_tokens": 19308101.0, - "step": 2161 - }, - { - "epoch": 1.6428571428571428, - "grad_norm": 2.859248161315918, - "learning_rate": 2.315860137996074e-06, - "loss": 0.3437344431877136, - "mean_token_accuracy": 0.8789017200469971, - "num_tokens": 19313026.0, - "step": 2162 - }, - { - "epoch": 1.6436170212765957, - "grad_norm": 1.1788666248321533, - "learning_rate": 2.3137714967369544e-06, - "loss": 0.3976179361343384, - "mean_token_accuracy": 0.8383771181106567, - "num_tokens": 19331103.0, - "step": 2163 - }, - { - "epoch": 1.6443768996960486, - "grad_norm": 1.8409802913665771, - "learning_rate": 2.3116829861801687e-06, - "loss": 0.41898879408836365, - "mean_token_accuracy": 0.8575010299682617, - "num_tokens": 19340866.0, - "step": 2164 - }, - { - "epoch": 1.6451367781155015, - "grad_norm": 1.4124691486358643, - "learning_rate": 2.3095946077915115e-06, - "loss": 0.333813339471817, - "mean_token_accuracy": 0.8766071796417236, - "num_tokens": 19353673.0, - "step": 2165 - }, - { - "epoch": 1.6458966565349544, - "grad_norm": 1.76325261592865, - "learning_rate": 2.307506363036688e-06, - "loss": 0.4158991575241089, - "mean_token_accuracy": 0.8522704839706421, - "num_tokens": 19363635.0, - "step": 2166 - }, - { - "epoch": 1.6466565349544073, - "grad_norm": 1.758833885192871, - "learning_rate": 2.305418253381309e-06, - "loss": 0.298480749130249, - "mean_token_accuracy": 0.888424277305603, - "num_tokens": 19372291.0, - "step": 2167 - }, - { - "epoch": 1.6474164133738602, - "grad_norm": 1.6387488842010498, - "learning_rate": 2.3033302802908895e-06, - "loss": 0.4309447109699249, - "mean_token_accuracy": 0.8672212362289429, - "num_tokens": 19383480.0, - "step": 2168 - }, - { - "epoch": 1.648176291793313, - "grad_norm": 1.5251084566116333, - "learning_rate": 2.301242445230851e-06, - "loss": 0.44890880584716797, - "mean_token_accuracy": 0.847392737865448, - "num_tokens": 19394810.0, - "step": 2169 - }, - { - "epoch": 1.648936170212766, - "grad_norm": 1.6106950044631958, - "learning_rate": 2.299154749666515e-06, - "loss": 0.4403916597366333, - "mean_token_accuracy": 0.8379756212234497, - "num_tokens": 19405551.0, - "step": 2170 - }, - { - "epoch": 1.6496960486322187, - "grad_norm": 1.4238437414169312, - "learning_rate": 2.2970671950631066e-06, - "loss": 0.4015567898750305, - "mean_token_accuracy": 0.851482629776001, - "num_tokens": 19418621.0, - "step": 2171 - }, - { - "epoch": 1.6504559270516719, - "grad_norm": 1.3026156425476074, - "learning_rate": 2.2949797828857527e-06, - "loss": 0.3680947422981262, - "mean_token_accuracy": 0.8641397953033447, - "num_tokens": 19432118.0, - "step": 2172 - }, - { - "epoch": 1.6512158054711246, - "grad_norm": 2.1265358924865723, - "learning_rate": 2.2928925145994798e-06, - "loss": 0.43980664014816284, - "mean_token_accuracy": 0.8358430862426758, - "num_tokens": 19439069.0, - "step": 2173 - }, - { - "epoch": 1.6519756838905775, - "grad_norm": 1.8399443626403809, - "learning_rate": 2.290805391669212e-06, - "loss": 0.29801061749458313, - "mean_token_accuracy": 0.8773187398910522, - "num_tokens": 19446745.0, - "step": 2174 - }, - { - "epoch": 1.6527355623100304, - "grad_norm": 1.8680047988891602, - "learning_rate": 2.2887184155597725e-06, - "loss": 0.3235543966293335, - "mean_token_accuracy": 0.8754611015319824, - "num_tokens": 19455266.0, - "step": 2175 - }, - { - "epoch": 1.6534954407294833, - "grad_norm": 2.3048481941223145, - "learning_rate": 2.286631587735883e-06, - "loss": 0.4011988043785095, - "mean_token_accuracy": 0.8531811237335205, - "num_tokens": 19461049.0, - "step": 2176 - }, - { - "epoch": 1.6542553191489362, - "grad_norm": 2.6067066192626953, - "learning_rate": 2.2845449096621583e-06, - "loss": 0.4957500696182251, - "mean_token_accuracy": 0.8255549073219299, - "num_tokens": 19466884.0, - "step": 2177 - }, - { - "epoch": 1.655015197568389, - "grad_norm": 1.5211488008499146, - "learning_rate": 2.282458382803109e-06, - "loss": 0.32245099544525146, - "mean_token_accuracy": 0.8865629434585571, - "num_tokens": 19477294.0, - "step": 2178 - }, - { - "epoch": 1.655775075987842, - "grad_norm": 2.245542526245117, - "learning_rate": 2.280372008623142e-06, - "loss": 0.3790864944458008, - "mean_token_accuracy": 0.8766552209854126, - "num_tokens": 19483385.0, - "step": 2179 - }, - { - "epoch": 1.6565349544072947, - "grad_norm": 2.1158151626586914, - "learning_rate": 2.2782857885865538e-06, - "loss": 0.4726812243461609, - "mean_token_accuracy": 0.8384029865264893, - "num_tokens": 19491367.0, - "step": 2180 - }, - { - "epoch": 1.6572948328267478, - "grad_norm": 3.301389694213867, - "learning_rate": 2.2761997241575335e-06, - "loss": 0.37664809823036194, - "mean_token_accuracy": 0.8913813829421997, - "num_tokens": 19494876.0, - "step": 2181 - }, - { - "epoch": 1.6580547112462005, - "grad_norm": 2.2964162826538086, - "learning_rate": 2.274113816800161e-06, - "loss": 0.4110721945762634, - "mean_token_accuracy": 0.8551756143569946, - "num_tokens": 19500546.0, - "step": 2182 - }, - { - "epoch": 1.6588145896656536, - "grad_norm": 3.368161916732788, - "learning_rate": 2.272028067978408e-06, - "loss": 0.39089250564575195, - "mean_token_accuracy": 0.8786845207214355, - "num_tokens": 19504142.0, - "step": 2183 - }, - { - "epoch": 1.6595744680851063, - "grad_norm": 1.7299834489822388, - "learning_rate": 2.2699424791561324e-06, - "loss": 0.5205090641975403, - "mean_token_accuracy": 0.8394201993942261, - "num_tokens": 19514523.0, - "step": 2184 - }, - { - "epoch": 1.6603343465045592, - "grad_norm": 2.045919418334961, - "learning_rate": 2.267857051797081e-06, - "loss": 0.49093255400657654, - "mean_token_accuracy": 0.8338311910629272, - "num_tokens": 19522439.0, - "step": 2185 - }, - { - "epoch": 1.6610942249240122, - "grad_norm": 1.2035714387893677, - "learning_rate": 2.265771787364886e-06, - "loss": 0.37247753143310547, - "mean_token_accuracy": 0.8873692750930786, - "num_tokens": 19536717.0, - "step": 2186 - }, - { - "epoch": 1.661854103343465, - "grad_norm": 2.6186633110046387, - "learning_rate": 2.263686687323068e-06, - "loss": 0.3318040370941162, - "mean_token_accuracy": 0.8720577955245972, - "num_tokens": 19541966.0, - "step": 2187 - }, - { - "epoch": 1.662613981762918, - "grad_norm": 2.6845929622650146, - "learning_rate": 2.261601753135029e-06, - "loss": 0.32441991567611694, - "mean_token_accuracy": 0.8700553178787231, - "num_tokens": 19546644.0, - "step": 2188 - }, - { - "epoch": 1.6633738601823707, - "grad_norm": 2.078998327255249, - "learning_rate": 2.259516986264057e-06, - "loss": 0.3424156904220581, - "mean_token_accuracy": 0.8707810044288635, - "num_tokens": 19553472.0, - "step": 2189 - }, - { - "epoch": 1.6641337386018238, - "grad_norm": 2.380747079849243, - "learning_rate": 2.2574323881733202e-06, - "loss": 0.4994799494743347, - "mean_token_accuracy": 0.817003607749939, - "num_tokens": 19560502.0, - "step": 2190 - }, - { - "epoch": 1.6648936170212765, - "grad_norm": 1.2984378337860107, - "learning_rate": 2.255347960325871e-06, - "loss": 0.33139657974243164, - "mean_token_accuracy": 0.8763977289199829, - "num_tokens": 19575624.0, - "step": 2191 - }, - { - "epoch": 1.6656534954407296, - "grad_norm": 1.3232799768447876, - "learning_rate": 2.2532637041846423e-06, - "loss": 0.32994017004966736, - "mean_token_accuracy": 0.8790634274482727, - "num_tokens": 19588636.0, - "step": 2192 - }, - { - "epoch": 1.6664133738601823, - "grad_norm": 2.11212158203125, - "learning_rate": 2.2511796212124424e-06, - "loss": 0.3140082359313965, - "mean_token_accuracy": 0.8946622014045715, - "num_tokens": 19594917.0, - "step": 2193 - }, - { - "epoch": 1.6671732522796354, - "grad_norm": 2.7206521034240723, - "learning_rate": 2.2490957128719627e-06, - "loss": 0.3723612427711487, - "mean_token_accuracy": 0.8781955242156982, - "num_tokens": 19599310.0, - "step": 2194 - }, - { - "epoch": 1.667933130699088, - "grad_norm": 2.6681952476501465, - "learning_rate": 2.247011980625771e-06, - "loss": 0.3740317225456238, - "mean_token_accuracy": 0.8780536651611328, - "num_tokens": 19604172.0, - "step": 2195 - }, - { - "epoch": 1.668693009118541, - "grad_norm": 1.8933384418487549, - "learning_rate": 2.2449284259363093e-06, - "loss": 0.3359421491622925, - "mean_token_accuracy": 0.8785334825515747, - "num_tokens": 19612030.0, - "step": 2196 - }, - { - "epoch": 1.669452887537994, - "grad_norm": 2.4779889583587646, - "learning_rate": 2.2428450502658964e-06, - "loss": 0.3724144399166107, - "mean_token_accuracy": 0.8739810585975647, - "num_tokens": 19617800.0, - "step": 2197 - }, - { - "epoch": 1.6702127659574468, - "grad_norm": 3.0661120414733887, - "learning_rate": 2.240761855076727e-06, - "loss": 0.3627531826496124, - "mean_token_accuracy": 0.865296483039856, - "num_tokens": 19621885.0, - "step": 2198 - }, - { - "epoch": 1.6709726443768997, - "grad_norm": 2.431708574295044, - "learning_rate": 2.238678841830867e-06, - "loss": 0.31396129727363586, - "mean_token_accuracy": 0.9026765823364258, - "num_tokens": 19627122.0, - "step": 2199 - }, - { - "epoch": 1.6717325227963524, - "grad_norm": 2.5498745441436768, - "learning_rate": 2.2365960119902543e-06, - "loss": 0.3193191885948181, - "mean_token_accuracy": 0.8750600218772888, - "num_tokens": 19631771.0, - "step": 2200 - }, - { - "epoch": 1.6724924012158056, - "grad_norm": 2.0419046878814697, - "learning_rate": 2.2345133670167e-06, - "loss": 0.32747960090637207, - "mean_token_accuracy": 0.8603148460388184, - "num_tokens": 19638972.0, - "step": 2201 - }, - { - "epoch": 1.6732522796352582, - "grad_norm": 2.0412306785583496, - "learning_rate": 2.232430908371885e-06, - "loss": 0.4701780676841736, - "mean_token_accuracy": 0.8318476676940918, - "num_tokens": 19647968.0, - "step": 2202 - }, - { - "epoch": 1.6740121580547114, - "grad_norm": 2.054070472717285, - "learning_rate": 2.2303486375173586e-06, - "loss": 0.33284813165664673, - "mean_token_accuracy": 0.8760920763015747, - "num_tokens": 19654032.0, - "step": 2203 - }, - { - "epoch": 1.674772036474164, - "grad_norm": 1.6053217649459839, - "learning_rate": 2.228266555914538e-06, - "loss": 0.34431374073028564, - "mean_token_accuracy": 0.8764770030975342, - "num_tokens": 19663785.0, - "step": 2204 - }, - { - "epoch": 1.675531914893617, - "grad_norm": 1.474494457244873, - "learning_rate": 2.2261846650247077e-06, - "loss": 0.3541037440299988, - "mean_token_accuracy": 0.8782497644424438, - "num_tokens": 19675498.0, - "step": 2205 - }, - { - "epoch": 1.6762917933130699, - "grad_norm": 1.9318026304244995, - "learning_rate": 2.224102966309021e-06, - "loss": 0.4291660189628601, - "mean_token_accuracy": 0.8424201607704163, - "num_tokens": 19684576.0, - "step": 2206 - }, - { - "epoch": 1.6770516717325228, - "grad_norm": 2.2150020599365234, - "learning_rate": 2.2220214612284925e-06, - "loss": 0.46187907457351685, - "mean_token_accuracy": 0.840459942817688, - "num_tokens": 19690412.0, - "step": 2207 - }, - { - "epoch": 1.6778115501519757, - "grad_norm": 1.667281150817871, - "learning_rate": 2.2199401512440037e-06, - "loss": 0.37440744042396545, - "mean_token_accuracy": 0.8694081902503967, - "num_tokens": 19699600.0, - "step": 2208 - }, - { - "epoch": 1.6785714285714286, - "grad_norm": 2.6446619033813477, - "learning_rate": 2.2178590378162957e-06, - "loss": 0.3301953077316284, - "mean_token_accuracy": 0.8992182016372681, - "num_tokens": 19704162.0, - "step": 2209 - }, - { - "epoch": 1.6793313069908815, - "grad_norm": 1.4266780614852905, - "learning_rate": 2.215778122405977e-06, - "loss": 0.3811204135417938, - "mean_token_accuracy": 0.861638069152832, - "num_tokens": 19716511.0, - "step": 2210 - }, - { - "epoch": 1.6800911854103342, - "grad_norm": 1.826087474822998, - "learning_rate": 2.2136974064735132e-06, - "loss": 0.4790012836456299, - "mean_token_accuracy": 0.8404909372329712, - "num_tokens": 19726645.0, - "step": 2211 - }, - { - "epoch": 1.6808510638297873, - "grad_norm": 1.8551808595657349, - "learning_rate": 2.2116168914792293e-06, - "loss": 0.40999075770378113, - "mean_token_accuracy": 0.8419463634490967, - "num_tokens": 19735601.0, - "step": 2212 - }, - { - "epoch": 1.68161094224924, - "grad_norm": 2.560124158859253, - "learning_rate": 2.209536578883313e-06, - "loss": 0.43428558111190796, - "mean_token_accuracy": 0.8689159750938416, - "num_tokens": 19741138.0, - "step": 2213 - }, - { - "epoch": 1.6823708206686931, - "grad_norm": 2.0154869556427, - "learning_rate": 2.207456470145807e-06, - "loss": 0.43633338809013367, - "mean_token_accuracy": 0.8646916151046753, - "num_tokens": 19751929.0, - "step": 2214 - }, - { - "epoch": 1.6831306990881458, - "grad_norm": 1.3583155870437622, - "learning_rate": 2.205376566726611e-06, - "loss": 0.3050280511379242, - "mean_token_accuracy": 0.8998798727989197, - "num_tokens": 19764012.0, - "step": 2215 - }, - { - "epoch": 1.6838905775075987, - "grad_norm": 1.266262173652649, - "learning_rate": 2.2032968700854813e-06, - "loss": 0.4039713144302368, - "mean_token_accuracy": 0.8571382164955139, - "num_tokens": 19780683.0, - "step": 2216 - }, - { - "epoch": 1.6846504559270516, - "grad_norm": 1.864356517791748, - "learning_rate": 2.2012173816820297e-06, - "loss": 0.361503541469574, - "mean_token_accuracy": 0.868161678314209, - "num_tokens": 19788907.0, - "step": 2217 - }, - { - "epoch": 1.6854103343465046, - "grad_norm": 1.320155382156372, - "learning_rate": 2.1991381029757216e-06, - "loss": 0.28228244185447693, - "mean_token_accuracy": 0.8945217132568359, - "num_tokens": 19800354.0, - "step": 2218 - }, - { - "epoch": 1.6861702127659575, - "grad_norm": 1.9706367254257202, - "learning_rate": 2.1970590354258745e-06, - "loss": 0.2849377989768982, - "mean_token_accuracy": 0.9065699577331543, - "num_tokens": 19806735.0, - "step": 2219 - }, - { - "epoch": 1.6869300911854104, - "grad_norm": 1.9150370359420776, - "learning_rate": 2.1949801804916563e-06, - "loss": 0.4125257730484009, - "mean_token_accuracy": 0.8642163872718811, - "num_tokens": 19814056.0, - "step": 2220 - }, - { - "epoch": 1.6876899696048633, - "grad_norm": 2.062589645385742, - "learning_rate": 2.19290153963209e-06, - "loss": 0.451707124710083, - "mean_token_accuracy": 0.8311163187026978, - "num_tokens": 19821263.0, - "step": 2221 - }, - { - "epoch": 1.688449848024316, - "grad_norm": 1.3959208726882935, - "learning_rate": 2.190823114306045e-06, - "loss": 0.3326707184314728, - "mean_token_accuracy": 0.9037837982177734, - "num_tokens": 19835163.0, - "step": 2222 - }, - { - "epoch": 1.689209726443769, - "grad_norm": 2.09995698928833, - "learning_rate": 2.188744905972239e-06, - "loss": 0.4144105315208435, - "mean_token_accuracy": 0.8512029051780701, - "num_tokens": 19843164.0, - "step": 2223 - }, - { - "epoch": 1.6899696048632218, - "grad_norm": 1.4759427309036255, - "learning_rate": 2.186666916089239e-06, - "loss": 0.4707002639770508, - "mean_token_accuracy": 0.8371601104736328, - "num_tokens": 19858551.0, - "step": 2224 - }, - { - "epoch": 1.690729483282675, - "grad_norm": 2.3398702144622803, - "learning_rate": 2.1845891461154604e-06, - "loss": 0.34672820568084717, - "mean_token_accuracy": 0.879936695098877, - "num_tokens": 19864348.0, - "step": 2225 - }, - { - "epoch": 1.6914893617021276, - "grad_norm": 1.6283963918685913, - "learning_rate": 2.1825115975091594e-06, - "loss": 0.31835079193115234, - "mean_token_accuracy": 0.8695961833000183, - "num_tokens": 19873560.0, - "step": 2226 - }, - { - "epoch": 1.6922492401215805, - "grad_norm": 2.035759687423706, - "learning_rate": 2.1804342717284414e-06, - "loss": 0.43110257387161255, - "mean_token_accuracy": 0.8593922853469849, - "num_tokens": 19880796.0, - "step": 2227 - }, - { - "epoch": 1.6930091185410334, - "grad_norm": 2.1340725421905518, - "learning_rate": 2.1783571702312523e-06, - "loss": 0.46967440843582153, - "mean_token_accuracy": 0.8839266300201416, - "num_tokens": 19887911.0, - "step": 2228 - }, - { - "epoch": 1.6937689969604863, - "grad_norm": 1.710340142250061, - "learning_rate": 2.176280294475383e-06, - "loss": 0.4167519807815552, - "mean_token_accuracy": 0.8526116609573364, - "num_tokens": 19896674.0, - "step": 2229 - }, - { - "epoch": 1.6945288753799392, - "grad_norm": 1.7793304920196533, - "learning_rate": 2.174203645918464e-06, - "loss": 0.3875434994697571, - "mean_token_accuracy": 0.8637192249298096, - "num_tokens": 19904825.0, - "step": 2230 - }, - { - "epoch": 1.6952887537993921, - "grad_norm": 1.7908778190612793, - "learning_rate": 2.172127226017967e-06, - "loss": 0.42065349221229553, - "mean_token_accuracy": 0.850834846496582, - "num_tokens": 19914377.0, - "step": 2231 - }, - { - "epoch": 1.696048632218845, - "grad_norm": 3.0943970680236816, - "learning_rate": 2.1700510362312053e-06, - "loss": 0.44845050573349, - "mean_token_accuracy": 0.8460367918014526, - "num_tokens": 19918929.0, - "step": 2232 - }, - { - "epoch": 1.6968085106382977, - "grad_norm": 1.5586018562316895, - "learning_rate": 2.1679750780153265e-06, - "loss": 0.4723482131958008, - "mean_token_accuracy": 0.871384859085083, - "num_tokens": 19932738.0, - "step": 2233 - }, - { - "epoch": 1.6975683890577509, - "grad_norm": 2.014230728149414, - "learning_rate": 2.1658993528273196e-06, - "loss": 0.43307146430015564, - "mean_token_accuracy": 0.8677935600280762, - "num_tokens": 19940246.0, - "step": 2234 - }, - { - "epoch": 1.6983282674772036, - "grad_norm": 1.528979778289795, - "learning_rate": 2.163823862124007e-06, - "loss": 0.3897377550601959, - "mean_token_accuracy": 0.8737689256668091, - "num_tokens": 19951187.0, - "step": 2235 - }, - { - "epoch": 1.6990881458966567, - "grad_norm": 1.9856207370758057, - "learning_rate": 2.1617486073620496e-06, - "loss": 0.4285745620727539, - "mean_token_accuracy": 0.8744081258773804, - "num_tokens": 19957768.0, - "step": 2236 - }, - { - "epoch": 1.6998480243161094, - "grad_norm": 2.130525827407837, - "learning_rate": 2.15967358999794e-06, - "loss": 0.405293732881546, - "mean_token_accuracy": 0.8588452935218811, - "num_tokens": 19965354.0, - "step": 2237 - }, - { - "epoch": 1.7006079027355623, - "grad_norm": 1.665329098701477, - "learning_rate": 2.1575988114880057e-06, - "loss": 0.42987754940986633, - "mean_token_accuracy": 0.846322238445282, - "num_tokens": 19975780.0, - "step": 2238 - }, - { - "epoch": 1.7013677811550152, - "grad_norm": 1.0725677013397217, - "learning_rate": 2.155524273288405e-06, - "loss": 0.31892159581184387, - "mean_token_accuracy": 0.8692483305931091, - "num_tokens": 19995875.0, - "step": 2239 - }, - { - "epoch": 1.702127659574468, - "grad_norm": 2.282604455947876, - "learning_rate": 2.15344997685513e-06, - "loss": 0.4460654556751251, - "mean_token_accuracy": 0.8623759746551514, - "num_tokens": 20001466.0, - "step": 2240 - }, - { - "epoch": 1.702887537993921, - "grad_norm": 1.1385949850082397, - "learning_rate": 2.1513759236440024e-06, - "loss": 0.37046104669570923, - "mean_token_accuracy": 0.8637164831161499, - "num_tokens": 20020998.0, - "step": 2241 - }, - { - "epoch": 1.7036474164133737, - "grad_norm": 1.5521315336227417, - "learning_rate": 2.1493021151106704e-06, - "loss": 0.4526556134223938, - "mean_token_accuracy": 0.8675785064697266, - "num_tokens": 20032750.0, - "step": 2242 - }, - { - "epoch": 1.7044072948328268, - "grad_norm": 1.7777446508407593, - "learning_rate": 2.147228552710614e-06, - "loss": 0.41294580698013306, - "mean_token_accuracy": 0.8597785234451294, - "num_tokens": 20041901.0, - "step": 2243 - }, - { - "epoch": 1.7051671732522795, - "grad_norm": 1.5157700777053833, - "learning_rate": 2.145155237899139e-06, - "loss": 0.4158926010131836, - "mean_token_accuracy": 0.8512611985206604, - "num_tokens": 20053705.0, - "step": 2244 - }, - { - "epoch": 1.7059270516717326, - "grad_norm": 1.5116809606552124, - "learning_rate": 2.143082172131378e-06, - "loss": 0.43943172693252563, - "mean_token_accuracy": 0.8429899215698242, - "num_tokens": 20069468.0, - "step": 2245 - }, - { - "epoch": 1.7066869300911853, - "grad_norm": 1.6095285415649414, - "learning_rate": 2.141009356862288e-06, - "loss": 0.41325604915618896, - "mean_token_accuracy": 0.8832963705062866, - "num_tokens": 20080596.0, - "step": 2246 - }, - { - "epoch": 1.7074468085106385, - "grad_norm": 1.39210844039917, - "learning_rate": 2.138936793546649e-06, - "loss": 0.3945302963256836, - "mean_token_accuracy": 0.8698325753211975, - "num_tokens": 20094158.0, - "step": 2247 - }, - { - "epoch": 1.7082066869300911, - "grad_norm": 2.9576594829559326, - "learning_rate": 2.1368644836390684e-06, - "loss": 0.16507276892662048, - "mean_token_accuracy": 0.9410445690155029, - "num_tokens": 20097002.0, - "step": 2248 - }, - { - "epoch": 1.708966565349544, - "grad_norm": 1.7631266117095947, - "learning_rate": 2.134792428593971e-06, - "loss": 0.519780695438385, - "mean_token_accuracy": 0.8276066780090332, - "num_tokens": 20107947.0, - "step": 2249 - }, - { - "epoch": 1.709726443768997, - "grad_norm": 2.144636869430542, - "learning_rate": 2.1327206298656055e-06, - "loss": 0.32923734188079834, - "mean_token_accuracy": 0.8766019344329834, - "num_tokens": 20113676.0, - "step": 2250 - }, - { - "epoch": 1.7104863221884499, - "grad_norm": 1.9511034488677979, - "learning_rate": 2.130649088908041e-06, - "loss": 0.4043842554092407, - "mean_token_accuracy": 0.8525843620300293, - "num_tokens": 20120787.0, - "step": 2251 - }, - { - "epoch": 1.7112462006079028, - "grad_norm": 1.5001336336135864, - "learning_rate": 2.1285778071751638e-06, - "loss": 0.4800187051296234, - "mean_token_accuracy": 0.8398486375808716, - "num_tokens": 20133534.0, - "step": 2252 - }, - { - "epoch": 1.7120060790273555, - "grad_norm": 1.435195803642273, - "learning_rate": 2.126506786120678e-06, - "loss": 0.44489604234695435, - "mean_token_accuracy": 0.8444881439208984, - "num_tokens": 20151787.0, - "step": 2253 - }, - { - "epoch": 1.7127659574468086, - "grad_norm": 1.3056137561798096, - "learning_rate": 2.1244360271981073e-06, - "loss": 0.300567090511322, - "mean_token_accuracy": 0.8903113007545471, - "num_tokens": 20163390.0, - "step": 2254 - }, - { - "epoch": 1.7135258358662613, - "grad_norm": 1.7347925901412964, - "learning_rate": 2.1223655318607907e-06, - "loss": 0.30601179599761963, - "mean_token_accuracy": 0.8845717906951904, - "num_tokens": 20171354.0, - "step": 2255 - }, - { - "epoch": 1.7142857142857144, - "grad_norm": 1.316306471824646, - "learning_rate": 2.1202953015618794e-06, - "loss": 0.3972984552383423, - "mean_token_accuracy": 0.845410943031311, - "num_tokens": 20184464.0, - "step": 2256 - }, - { - "epoch": 1.715045592705167, - "grad_norm": 2.1052892208099365, - "learning_rate": 2.1182253377543428e-06, - "loss": 0.3357020616531372, - "mean_token_accuracy": 0.8853542804718018, - "num_tokens": 20190539.0, - "step": 2257 - }, - { - "epoch": 1.71580547112462, - "grad_norm": 1.4192553758621216, - "learning_rate": 2.116155641890959e-06, - "loss": 0.3881692588329315, - "mean_token_accuracy": 0.8442144989967346, - "num_tokens": 20204570.0, - "step": 2258 - }, - { - "epoch": 1.716565349544073, - "grad_norm": 2.134113311767578, - "learning_rate": 2.1140862154243223e-06, - "loss": 0.37803274393081665, - "mean_token_accuracy": 0.8703107237815857, - "num_tokens": 20210535.0, - "step": 2259 - }, - { - "epoch": 1.7173252279635258, - "grad_norm": 2.9149155616760254, - "learning_rate": 2.1120170598068353e-06, - "loss": 0.34860676527023315, - "mean_token_accuracy": 0.8734345436096191, - "num_tokens": 20214375.0, - "step": 2260 - }, - { - "epoch": 1.7180851063829787, - "grad_norm": 1.6855589151382446, - "learning_rate": 2.109948176490711e-06, - "loss": 0.3676984906196594, - "mean_token_accuracy": 0.8531560301780701, - "num_tokens": 20223791.0, - "step": 2261 - }, - { - "epoch": 1.7188449848024316, - "grad_norm": 2.09671950340271, - "learning_rate": 2.10787956692797e-06, - "loss": 0.41744115948677063, - "mean_token_accuracy": 0.8570001125335693, - "num_tokens": 20231254.0, - "step": 2262 - }, - { - "epoch": 1.7196048632218845, - "grad_norm": 3.148813009262085, - "learning_rate": 2.1058112325704436e-06, - "loss": 0.20556189119815826, - "mean_token_accuracy": 0.926898717880249, - "num_tokens": 20234470.0, - "step": 2263 - }, - { - "epoch": 1.7203647416413372, - "grad_norm": 1.9707107543945312, - "learning_rate": 2.103743174869769e-06, - "loss": 0.40733110904693604, - "mean_token_accuracy": 0.8740406036376953, - "num_tokens": 20242286.0, - "step": 2264 - }, - { - "epoch": 1.7211246200607904, - "grad_norm": 1.2756069898605347, - "learning_rate": 2.1016753952773867e-06, - "loss": 0.3940718173980713, - "mean_token_accuracy": 0.860906720161438, - "num_tokens": 20260382.0, - "step": 2265 - }, - { - "epoch": 1.721884498480243, - "grad_norm": 1.5074653625488281, - "learning_rate": 2.0996078952445453e-06, - "loss": 0.3353617191314697, - "mean_token_accuracy": 0.8809853792190552, - "num_tokens": 20271665.0, - "step": 2266 - }, - { - "epoch": 1.7226443768996962, - "grad_norm": 1.4331210851669312, - "learning_rate": 2.0975406762222966e-06, - "loss": 0.32260069251060486, - "mean_token_accuracy": 0.901330828666687, - "num_tokens": 20283122.0, - "step": 2267 - }, - { - "epoch": 1.7234042553191489, - "grad_norm": 2.2378969192504883, - "learning_rate": 2.095473739661494e-06, - "loss": 0.39086243510246277, - "mean_token_accuracy": 0.8681687116622925, - "num_tokens": 20289243.0, - "step": 2268 - }, - { - "epoch": 1.7241641337386018, - "grad_norm": 2.754582405090332, - "learning_rate": 2.093407087012791e-06, - "loss": 0.42927244305610657, - "mean_token_accuracy": 0.8594136834144592, - "num_tokens": 20294537.0, - "step": 2269 - }, - { - "epoch": 1.7249240121580547, - "grad_norm": 2.2721824645996094, - "learning_rate": 2.091340719726647e-06, - "loss": 0.42479783296585083, - "mean_token_accuracy": 0.8411722183227539, - "num_tokens": 20301502.0, - "step": 2270 - }, - { - "epoch": 1.7256838905775076, - "grad_norm": 2.3230299949645996, - "learning_rate": 2.089274639253317e-06, - "loss": 0.4218963384628296, - "mean_token_accuracy": 0.8498032093048096, - "num_tokens": 20307710.0, - "step": 2271 - }, - { - "epoch": 1.7264437689969605, - "grad_norm": 2.3499748706817627, - "learning_rate": 2.0872088470428553e-06, - "loss": 0.4472277760505676, - "mean_token_accuracy": 0.8487255573272705, - "num_tokens": 20313945.0, - "step": 2272 - }, - { - "epoch": 1.7272036474164134, - "grad_norm": 1.3709690570831299, - "learning_rate": 2.0851433445451142e-06, - "loss": 0.38701117038726807, - "mean_token_accuracy": 0.8592075109481812, - "num_tokens": 20328023.0, - "step": 2273 - }, - { - "epoch": 1.7279635258358663, - "grad_norm": 1.1293425559997559, - "learning_rate": 2.0830781332097446e-06, - "loss": 0.34000539779663086, - "mean_token_accuracy": 0.8779317140579224, - "num_tokens": 20346767.0, - "step": 2274 - }, - { - "epoch": 1.728723404255319, - "grad_norm": 2.9770123958587646, - "learning_rate": 2.08101321448619e-06, - "loss": 0.4437636733055115, - "mean_token_accuracy": 0.8398602604866028, - "num_tokens": 20352306.0, - "step": 2275 - }, - { - "epoch": 1.7294832826747721, - "grad_norm": 3.510955572128296, - "learning_rate": 2.0789485898236897e-06, - "loss": 0.3359706401824951, - "mean_token_accuracy": 0.8872498273849487, - "num_tokens": 20355560.0, - "step": 2276 - }, - { - "epoch": 1.7302431610942248, - "grad_norm": 2.0873279571533203, - "learning_rate": 2.076884260671276e-06, - "loss": 0.38720619678497314, - "mean_token_accuracy": 0.865881621837616, - "num_tokens": 20362802.0, - "step": 2277 - }, - { - "epoch": 1.731003039513678, - "grad_norm": 2.4871230125427246, - "learning_rate": 2.0748202284777775e-06, - "loss": 0.3250775933265686, - "mean_token_accuracy": 0.8867610692977905, - "num_tokens": 20367080.0, - "step": 2278 - }, - { - "epoch": 1.7317629179331306, - "grad_norm": 3.5603582859039307, - "learning_rate": 2.072756494691809e-06, - "loss": 0.35600754618644714, - "mean_token_accuracy": 0.8781189918518066, - "num_tokens": 20370625.0, - "step": 2279 - }, - { - "epoch": 1.7325227963525835, - "grad_norm": 2.0948755741119385, - "learning_rate": 2.070693060761779e-06, - "loss": 0.3558604419231415, - "mean_token_accuracy": 0.902066707611084, - "num_tokens": 20376835.0, - "step": 2280 - }, - { - "epoch": 1.7332826747720365, - "grad_norm": 2.391188859939575, - "learning_rate": 2.0686299281358837e-06, - "loss": 0.36596938967704773, - "mean_token_accuracy": 0.8741272687911987, - "num_tokens": 20382282.0, - "step": 2281 - }, - { - "epoch": 1.7340425531914894, - "grad_norm": 1.6906369924545288, - "learning_rate": 2.0665670982621107e-06, - "loss": 0.5241266489028931, - "mean_token_accuracy": 0.8091107606887817, - "num_tokens": 20393736.0, - "step": 2282 - }, - { - "epoch": 1.7348024316109423, - "grad_norm": 1.7578394412994385, - "learning_rate": 2.0645045725882334e-06, - "loss": 0.37041786313056946, - "mean_token_accuracy": 0.8907113075256348, - "num_tokens": 20402715.0, - "step": 2283 - }, - { - "epoch": 1.7355623100303952, - "grad_norm": 2.191727638244629, - "learning_rate": 2.0624423525618097e-06, - "loss": 0.43301627039909363, - "mean_token_accuracy": 0.8706433773040771, - "num_tokens": 20409976.0, - "step": 2284 - }, - { - "epoch": 1.736322188449848, - "grad_norm": 1.958005666732788, - "learning_rate": 2.0603804396301875e-06, - "loss": 0.29002684354782104, - "mean_token_accuracy": 0.8914110660552979, - "num_tokens": 20417099.0, - "step": 2285 - }, - { - "epoch": 1.7370820668693008, - "grad_norm": 2.477837085723877, - "learning_rate": 2.058318835240495e-06, - "loss": 0.2953898310661316, - "mean_token_accuracy": 0.8975275754928589, - "num_tokens": 20422251.0, - "step": 2286 - }, - { - "epoch": 1.737841945288754, - "grad_norm": 2.156764268875122, - "learning_rate": 2.0562575408396475e-06, - "loss": 0.4063698649406433, - "mean_token_accuracy": 0.8497642278671265, - "num_tokens": 20429338.0, - "step": 2287 - }, - { - "epoch": 1.7386018237082066, - "grad_norm": 1.6748939752578735, - "learning_rate": 2.0541965578743373e-06, - "loss": 0.3272587060928345, - "mean_token_accuracy": 0.8646700382232666, - "num_tokens": 20439680.0, - "step": 2288 - }, - { - "epoch": 1.7393617021276597, - "grad_norm": 1.9948776960372925, - "learning_rate": 2.0521358877910446e-06, - "loss": 0.36843347549438477, - "mean_token_accuracy": 0.8613901138305664, - "num_tokens": 20448492.0, - "step": 2289 - }, - { - "epoch": 1.7401215805471124, - "grad_norm": 2.231428623199463, - "learning_rate": 2.0500755320360263e-06, - "loss": 0.3905152380466461, - "mean_token_accuracy": 0.8980990052223206, - "num_tokens": 20453945.0, - "step": 2290 - }, - { - "epoch": 1.7408814589665653, - "grad_norm": 2.2187650203704834, - "learning_rate": 2.048015492055319e-06, - "loss": 0.45920854806900024, - "mean_token_accuracy": 0.8282852172851562, - "num_tokens": 20462378.0, - "step": 2291 - }, - { - "epoch": 1.7416413373860182, - "grad_norm": 2.0668466091156006, - "learning_rate": 2.045955769294737e-06, - "loss": 0.3227751553058624, - "mean_token_accuracy": 0.8805934190750122, - "num_tokens": 20469822.0, - "step": 2292 - }, - { - "epoch": 1.7424012158054711, - "grad_norm": 1.9162774085998535, - "learning_rate": 2.0438963651998747e-06, - "loss": 0.4604800343513489, - "mean_token_accuracy": 0.8441175818443298, - "num_tokens": 20479099.0, - "step": 2293 - }, - { - "epoch": 1.743161094224924, - "grad_norm": 2.645329713821411, - "learning_rate": 2.0418372812161015e-06, - "loss": 0.3239654004573822, - "mean_token_accuracy": 0.8888648748397827, - "num_tokens": 20483926.0, - "step": 2294 - }, - { - "epoch": 1.743920972644377, - "grad_norm": 1.39468514919281, - "learning_rate": 2.03977851878856e-06, - "loss": 0.4003690183162689, - "mean_token_accuracy": 0.8769714832305908, - "num_tokens": 20496501.0, - "step": 2295 - }, - { - "epoch": 1.7446808510638299, - "grad_norm": 3.509174346923828, - "learning_rate": 2.0377200793621694e-06, - "loss": 0.2948213517665863, - "mean_token_accuracy": 0.8972329497337341, - "num_tokens": 20500000.0, - "step": 2296 - }, - { - "epoch": 1.7454407294832825, - "grad_norm": 1.5033894777297974, - "learning_rate": 2.0356619643816234e-06, - "loss": 0.40694737434387207, - "mean_token_accuracy": 0.8607243895530701, - "num_tokens": 20513473.0, - "step": 2297 - }, - { - "epoch": 1.7462006079027357, - "grad_norm": 1.4324895143508911, - "learning_rate": 2.0336041752913843e-06, - "loss": 0.3899157643318176, - "mean_token_accuracy": 0.858935534954071, - "num_tokens": 20524516.0, - "step": 2298 - }, - { - "epoch": 1.7469604863221884, - "grad_norm": 2.359544277191162, - "learning_rate": 2.031546713535688e-06, - "loss": 0.369213342666626, - "mean_token_accuracy": 0.8741403818130493, - "num_tokens": 20530421.0, - "step": 2299 - }, - { - "epoch": 1.7477203647416415, - "grad_norm": 2.282637357711792, - "learning_rate": 2.029489580558542e-06, - "loss": 0.3255441188812256, - "mean_token_accuracy": 0.9045462608337402, - "num_tokens": 20535954.0, - "step": 2300 - }, - { - "epoch": 1.7484802431610942, - "grad_norm": 1.7367198467254639, - "learning_rate": 2.0274327778037204e-06, - "loss": 0.43890488147735596, - "mean_token_accuracy": 0.8494667410850525, - "num_tokens": 20548638.0, - "step": 2301 - }, - { - "epoch": 1.749240121580547, - "grad_norm": 1.6236488819122314, - "learning_rate": 2.0253763067147657e-06, - "loss": 0.4440777897834778, - "mean_token_accuracy": 0.8414230942726135, - "num_tokens": 20559263.0, - "step": 2302 - }, - { - "epoch": 1.75, - "grad_norm": 1.3755455017089844, - "learning_rate": 2.0233201687349888e-06, - "loss": 0.3473797142505646, - "mean_token_accuracy": 0.8742472529411316, - "num_tokens": 20573109.0, - "step": 2303 - }, - { - "epoch": 1.750759878419453, - "grad_norm": 3.271153688430786, - "learning_rate": 2.0212643653074677e-06, - "loss": 0.4965784549713135, - "mean_token_accuracy": 0.8596988916397095, - "num_tokens": 20578525.0, - "step": 2304 - }, - { - "epoch": 1.7515197568389058, - "grad_norm": 2.6341168880462646, - "learning_rate": 2.019208897875043e-06, - "loss": 0.37775442004203796, - "mean_token_accuracy": 0.8721816539764404, - "num_tokens": 20583641.0, - "step": 2305 - }, - { - "epoch": 1.7522796352583585, - "grad_norm": 1.8308569192886353, - "learning_rate": 2.0171537678803222e-06, - "loss": 0.3243415355682373, - "mean_token_accuracy": 0.8837124109268188, - "num_tokens": 20591725.0, - "step": 2306 - }, - { - "epoch": 1.7530395136778116, - "grad_norm": 2.4362998008728027, - "learning_rate": 2.015098976765673e-06, - "loss": 0.3738787770271301, - "mean_token_accuracy": 0.8974303007125854, - "num_tokens": 20596587.0, - "step": 2307 - }, - { - "epoch": 1.7537993920972643, - "grad_norm": 3.2920920848846436, - "learning_rate": 2.0130445259732282e-06, - "loss": 0.33901530504226685, - "mean_token_accuracy": 0.9019063115119934, - "num_tokens": 20600379.0, - "step": 2308 - }, - { - "epoch": 1.7545592705167175, - "grad_norm": 1.290475606918335, - "learning_rate": 2.01099041694488e-06, - "loss": 0.37150678038597107, - "mean_token_accuracy": 0.8542044758796692, - "num_tokens": 20614340.0, - "step": 2309 - }, - { - "epoch": 1.7553191489361701, - "grad_norm": 2.7794933319091797, - "learning_rate": 2.0089366511222815e-06, - "loss": 0.3746095895767212, - "mean_token_accuracy": 0.8653185367584229, - "num_tokens": 20622056.0, - "step": 2310 - }, - { - "epoch": 1.756079027355623, - "grad_norm": 2.2112278938293457, - "learning_rate": 2.006883229946843e-06, - "loss": 0.35793858766555786, - "mean_token_accuracy": 0.875727653503418, - "num_tokens": 20628930.0, - "step": 2311 - }, - { - "epoch": 1.756838905775076, - "grad_norm": 1.5240603685379028, - "learning_rate": 2.0048301548597365e-06, - "loss": 0.512831449508667, - "mean_token_accuracy": 0.8139172792434692, - "num_tokens": 20643159.0, - "step": 2312 - }, - { - "epoch": 1.7575987841945289, - "grad_norm": 1.810485601425171, - "learning_rate": 2.0027774273018894e-06, - "loss": 0.43870818614959717, - "mean_token_accuracy": 0.8313089609146118, - "num_tokens": 20651914.0, - "step": 2313 - }, - { - "epoch": 1.7583586626139818, - "grad_norm": 1.748178243637085, - "learning_rate": 2.0007250487139827e-06, - "loss": 0.42277514934539795, - "mean_token_accuracy": 0.8463197946548462, - "num_tokens": 20660054.0, - "step": 2314 - }, - { - "epoch": 1.7591185410334347, - "grad_norm": 1.511717677116394, - "learning_rate": 1.998673020536456e-06, - "loss": 0.38304439187049866, - "mean_token_accuracy": 0.8508470058441162, - "num_tokens": 20673371.0, - "step": 2315 - }, - { - "epoch": 1.7598784194528876, - "grad_norm": 1.7790700197219849, - "learning_rate": 1.996621344209503e-06, - "loss": 0.3838311433792114, - "mean_token_accuracy": 0.8676829934120178, - "num_tokens": 20682072.0, - "step": 2316 - }, - { - "epoch": 1.7606382978723403, - "grad_norm": 1.9128468036651611, - "learning_rate": 1.994570021173067e-06, - "loss": 0.40384364128112793, - "mean_token_accuracy": 0.8747294545173645, - "num_tokens": 20689000.0, - "step": 2317 - }, - { - "epoch": 1.7613981762917934, - "grad_norm": 3.286569118499756, - "learning_rate": 1.9925190528668455e-06, - "loss": 0.38019680976867676, - "mean_token_accuracy": 0.8678069114685059, - "num_tokens": 20692763.0, - "step": 2318 - }, - { - "epoch": 1.762158054711246, - "grad_norm": 1.6108927726745605, - "learning_rate": 1.990468440730288e-06, - "loss": 0.3144170045852661, - "mean_token_accuracy": 0.8695170879364014, - "num_tokens": 20702620.0, - "step": 2319 - }, - { - "epoch": 1.7629179331306992, - "grad_norm": 3.185225009918213, - "learning_rate": 1.9884181862025938e-06, - "loss": 0.41619348526000977, - "mean_token_accuracy": 0.8543670177459717, - "num_tokens": 20706857.0, - "step": 2320 - }, - { - "epoch": 1.763677811550152, - "grad_norm": 2.3699469566345215, - "learning_rate": 1.986368290722709e-06, - "loss": 0.5115842819213867, - "mean_token_accuracy": 0.8141909837722778, - "num_tokens": 20713997.0, - "step": 2321 - }, - { - "epoch": 1.7644376899696048, - "grad_norm": 1.4449706077575684, - "learning_rate": 1.9843187557293286e-06, - "loss": 0.419655442237854, - "mean_token_accuracy": 0.8545533418655396, - "num_tokens": 20726548.0, - "step": 2322 - }, - { - "epoch": 1.7651975683890577, - "grad_norm": 2.127614974975586, - "learning_rate": 1.9822695826608975e-06, - "loss": 0.43722522258758545, - "mean_token_accuracy": 0.8542283773422241, - "num_tokens": 20733469.0, - "step": 2323 - }, - { - "epoch": 1.7659574468085106, - "grad_norm": 3.3081557750701904, - "learning_rate": 1.9802207729556023e-06, - "loss": 0.30904972553253174, - "mean_token_accuracy": 0.8896352648735046, - "num_tokens": 20737190.0, - "step": 2324 - }, - { - "epoch": 1.7667173252279635, - "grad_norm": 2.603506326675415, - "learning_rate": 1.978172328051377e-06, - "loss": 0.30952537059783936, - "mean_token_accuracy": 0.8868587017059326, - "num_tokens": 20741780.0, - "step": 2325 - }, - { - "epoch": 1.7674772036474165, - "grad_norm": 2.576824903488159, - "learning_rate": 1.9761242493858987e-06, - "loss": 0.29593953490257263, - "mean_token_accuracy": 0.888198733329773, - "num_tokens": 20746324.0, - "step": 2326 - }, - { - "epoch": 1.7682370820668694, - "grad_norm": 1.6168320178985596, - "learning_rate": 1.9740765383965894e-06, - "loss": 0.5093998908996582, - "mean_token_accuracy": 0.8301646709442139, - "num_tokens": 20760140.0, - "step": 2327 - }, - { - "epoch": 1.768996960486322, - "grad_norm": 2.1162400245666504, - "learning_rate": 1.9720291965206097e-06, - "loss": 0.36714404821395874, - "mean_token_accuracy": 0.8699671626091003, - "num_tokens": 20766961.0, - "step": 2328 - }, - { - "epoch": 1.7697568389057752, - "grad_norm": 1.046911597251892, - "learning_rate": 1.969982225194864e-06, - "loss": 0.40783989429473877, - "mean_token_accuracy": 0.8474892377853394, - "num_tokens": 20786737.0, - "step": 2329 - }, - { - "epoch": 1.7705167173252279, - "grad_norm": 1.7059568166732788, - "learning_rate": 1.9679356258559943e-06, - "loss": 0.44083845615386963, - "mean_token_accuracy": 0.841221034526825, - "num_tokens": 20798907.0, - "step": 2330 - }, - { - "epoch": 1.771276595744681, - "grad_norm": 1.5157767534255981, - "learning_rate": 1.9658893999403847e-06, - "loss": 0.4671107828617096, - "mean_token_accuracy": 0.8252813816070557, - "num_tokens": 20814304.0, - "step": 2331 - }, - { - "epoch": 1.7720364741641337, - "grad_norm": 2.1340525150299072, - "learning_rate": 1.9638435488841543e-06, - "loss": 0.4088709354400635, - "mean_token_accuracy": 0.8595127463340759, - "num_tokens": 20821827.0, - "step": 2332 - }, - { - "epoch": 1.7727963525835866, - "grad_norm": 1.948072910308838, - "learning_rate": 1.96179807412316e-06, - "loss": 0.3692860007286072, - "mean_token_accuracy": 0.8678920269012451, - "num_tokens": 20828612.0, - "step": 2333 - }, - { - "epoch": 1.7735562310030395, - "grad_norm": 1.5731977224349976, - "learning_rate": 1.959752977092995e-06, - "loss": 0.3743135929107666, - "mean_token_accuracy": 0.8723479509353638, - "num_tokens": 20838497.0, - "step": 2334 - }, - { - "epoch": 1.7743161094224924, - "grad_norm": 1.5506012439727783, - "learning_rate": 1.957708259228987e-06, - "loss": 0.4403391182422638, - "mean_token_accuracy": 0.854604959487915, - "num_tokens": 20851603.0, - "step": 2335 - }, - { - "epoch": 1.7750759878419453, - "grad_norm": 1.154336929321289, - "learning_rate": 1.9556639219661983e-06, - "loss": 0.5281188488006592, - "mean_token_accuracy": 0.8101300001144409, - "num_tokens": 20875661.0, - "step": 2336 - }, - { - "epoch": 1.7758358662613982, - "grad_norm": 4.720771312713623, - "learning_rate": 1.9536199667394217e-06, - "loss": 0.44419822096824646, - "mean_token_accuracy": 0.8740090131759644, - "num_tokens": 20886971.0, - "step": 2337 - }, - { - "epoch": 1.7765957446808511, - "grad_norm": 1.5492230653762817, - "learning_rate": 1.9515763949831852e-06, - "loss": 0.4538637697696686, - "mean_token_accuracy": 0.8362185955047607, - "num_tokens": 20899212.0, - "step": 2338 - }, - { - "epoch": 1.7773556231003038, - "grad_norm": 1.354101538658142, - "learning_rate": 1.9495332081317466e-06, - "loss": 0.4341534376144409, - "mean_token_accuracy": 0.8380170464515686, - "num_tokens": 20913065.0, - "step": 2339 - }, - { - "epoch": 1.778115501519757, - "grad_norm": 1.5805599689483643, - "learning_rate": 1.947490407619092e-06, - "loss": 0.40928739309310913, - "mean_token_accuracy": 0.8524469137191772, - "num_tokens": 20922919.0, - "step": 2340 - }, - { - "epoch": 1.7788753799392096, - "grad_norm": 2.097221851348877, - "learning_rate": 1.945447994878937e-06, - "loss": 0.4816104769706726, - "mean_token_accuracy": 0.888654351234436, - "num_tokens": 20931350.0, - "step": 2341 - }, - { - "epoch": 1.7796352583586628, - "grad_norm": 1.7193297147750854, - "learning_rate": 1.9434059713447264e-06, - "loss": 0.44925639033317566, - "mean_token_accuracy": 0.8500319123268127, - "num_tokens": 20940546.0, - "step": 2342 - }, - { - "epoch": 1.7803951367781155, - "grad_norm": 1.5971747636795044, - "learning_rate": 1.9413643384496315e-06, - "loss": 0.29559412598609924, - "mean_token_accuracy": 0.8871279954910278, - "num_tokens": 20950604.0, - "step": 2343 - }, - { - "epoch": 1.7811550151975684, - "grad_norm": 2.788029670715332, - "learning_rate": 1.9393230976265478e-06, - "loss": 0.31713539361953735, - "mean_token_accuracy": 0.8866176605224609, - "num_tokens": 20955296.0, - "step": 2344 - }, - { - "epoch": 1.7819148936170213, - "grad_norm": 1.5747952461242676, - "learning_rate": 1.937282250308096e-06, - "loss": 0.41813358664512634, - "mean_token_accuracy": 0.8418053984642029, - "num_tokens": 20967664.0, - "step": 2345 - }, - { - "epoch": 1.7826747720364742, - "grad_norm": 2.0813145637512207, - "learning_rate": 1.935241797926623e-06, - "loss": 0.39056286215782166, - "mean_token_accuracy": 0.8601781129837036, - "num_tokens": 20975895.0, - "step": 2346 - }, - { - "epoch": 1.783434650455927, - "grad_norm": 2.143022298812866, - "learning_rate": 1.933201741914196e-06, - "loss": 0.40797823667526245, - "mean_token_accuracy": 0.8846398591995239, - "num_tokens": 20983683.0, - "step": 2347 - }, - { - "epoch": 1.78419452887538, - "grad_norm": 1.8451775312423706, - "learning_rate": 1.931162083702606e-06, - "loss": 0.34083136916160583, - "mean_token_accuracy": 0.8643462657928467, - "num_tokens": 20992621.0, - "step": 2348 - }, - { - "epoch": 1.784954407294833, - "grad_norm": 1.8603935241699219, - "learning_rate": 1.9291228247233607e-06, - "loss": 0.4860231280326843, - "mean_token_accuracy": 0.8391251564025879, - "num_tokens": 21002427.0, - "step": 2349 - }, - { - "epoch": 1.7857142857142856, - "grad_norm": 2.751711845397949, - "learning_rate": 1.9270839664076937e-06, - "loss": 0.30588358640670776, - "mean_token_accuracy": 0.8836315274238586, - "num_tokens": 21006898.0, - "step": 2350 - }, - { - "epoch": 1.7864741641337387, - "grad_norm": 1.0335345268249512, - "learning_rate": 1.9250455101865526e-06, - "loss": 0.3119634985923767, - "mean_token_accuracy": 0.8912283182144165, - "num_tokens": 21024930.0, - "step": 2351 - }, - { - "epoch": 1.7872340425531914, - "grad_norm": 2.4693806171417236, - "learning_rate": 1.9230074574906043e-06, - "loss": 0.1976669877767563, - "mean_token_accuracy": 0.928974986076355, - "num_tokens": 21029027.0, - "step": 2352 - }, - { - "epoch": 1.7879939209726445, - "grad_norm": 1.2892690896987915, - "learning_rate": 1.920969809750234e-06, - "loss": 0.46008217334747314, - "mean_token_accuracy": 0.8299605846405029, - "num_tokens": 21047671.0, - "step": 2353 - }, - { - "epoch": 1.7887537993920972, - "grad_norm": 3.162534713745117, - "learning_rate": 1.91893256839554e-06, - "loss": 0.2916071116924286, - "mean_token_accuracy": 0.8932807445526123, - "num_tokens": 21051555.0, - "step": 2354 - }, - { - "epoch": 1.7895136778115501, - "grad_norm": 1.7627713680267334, - "learning_rate": 1.916895734856338e-06, - "loss": 0.3223535120487213, - "mean_token_accuracy": 0.8852578401565552, - "num_tokens": 21060056.0, - "step": 2355 - }, - { - "epoch": 1.790273556231003, - "grad_norm": 1.9448071718215942, - "learning_rate": 1.9148593105621542e-06, - "loss": 0.3650452196598053, - "mean_token_accuracy": 0.8709862232208252, - "num_tokens": 21067190.0, - "step": 2356 - }, - { - "epoch": 1.791033434650456, - "grad_norm": 2.026644229888916, - "learning_rate": 1.9128232969422318e-06, - "loss": 0.3620566427707672, - "mean_token_accuracy": 0.865707516670227, - "num_tokens": 21075197.0, - "step": 2357 - }, - { - "epoch": 1.7917933130699089, - "grad_norm": 2.2628564834594727, - "learning_rate": 1.9107876954255217e-06, - "loss": 0.353444367647171, - "mean_token_accuracy": 0.8590385913848877, - "num_tokens": 21080823.0, - "step": 2358 - }, - { - "epoch": 1.7925531914893615, - "grad_norm": 2.5959067344665527, - "learning_rate": 1.908752507440689e-06, - "loss": 0.43711763620376587, - "mean_token_accuracy": 0.8539710640907288, - "num_tokens": 21086016.0, - "step": 2359 - }, - { - "epoch": 1.7933130699088147, - "grad_norm": 1.6228864192962646, - "learning_rate": 1.906717734416105e-06, - "loss": 0.38630396127700806, - "mean_token_accuracy": 0.8611987829208374, - "num_tokens": 21096573.0, - "step": 2360 - }, - { - "epoch": 1.7940729483282674, - "grad_norm": 1.8471404314041138, - "learning_rate": 1.9046833777798534e-06, - "loss": 0.46608641743659973, - "mean_token_accuracy": 0.8782031536102295, - "num_tokens": 21105817.0, - "step": 2361 - }, - { - "epoch": 1.7948328267477205, - "grad_norm": 2.6532235145568848, - "learning_rate": 1.9026494389597239e-06, - "loss": 0.3310372829437256, - "mean_token_accuracy": 0.8781720399856567, - "num_tokens": 21111192.0, - "step": 2362 - }, - { - "epoch": 1.7955927051671732, - "grad_norm": 2.172534942626953, - "learning_rate": 1.9006159193832124e-06, - "loss": 0.49921661615371704, - "mean_token_accuracy": 0.8215196132659912, - "num_tokens": 21117878.0, - "step": 2363 - }, - { - "epoch": 1.7963525835866263, - "grad_norm": 1.6507720947265625, - "learning_rate": 1.8985828204775206e-06, - "loss": 0.4189162850379944, - "mean_token_accuracy": 0.8520572185516357, - "num_tokens": 21128287.0, - "step": 2364 - }, - { - "epoch": 1.797112462006079, - "grad_norm": 1.5932034254074097, - "learning_rate": 1.8965501436695578e-06, - "loss": 0.45531854033470154, - "mean_token_accuracy": 0.8391242027282715, - "num_tokens": 21140605.0, - "step": 2365 - }, - { - "epoch": 1.797872340425532, - "grad_norm": 2.4680638313293457, - "learning_rate": 1.894517890385933e-06, - "loss": 0.41174983978271484, - "mean_token_accuracy": 0.8616886138916016, - "num_tokens": 21147045.0, - "step": 2366 - }, - { - "epoch": 1.7986322188449848, - "grad_norm": 1.61875319480896, - "learning_rate": 1.8924860620529594e-06, - "loss": 0.47573935985565186, - "mean_token_accuracy": 0.8347671031951904, - "num_tokens": 21157253.0, - "step": 2367 - }, - { - "epoch": 1.7993920972644377, - "grad_norm": 3.4389333724975586, - "learning_rate": 1.8904546600966539e-06, - "loss": 0.34975939989089966, - "mean_token_accuracy": 0.8915865421295166, - "num_tokens": 21160486.0, - "step": 2368 - }, - { - "epoch": 1.8001519756838906, - "grad_norm": 2.0069527626037598, - "learning_rate": 1.888423685942732e-06, - "loss": 0.379585325717926, - "mean_token_accuracy": 0.8605983257293701, - "num_tokens": 21168016.0, - "step": 2369 - }, - { - "epoch": 1.8009118541033433, - "grad_norm": 3.0740530490875244, - "learning_rate": 1.886393141016609e-06, - "loss": 0.5244829058647156, - "mean_token_accuracy": 0.8282772302627563, - "num_tokens": 21172851.0, - "step": 2370 - }, - { - "epoch": 1.8016717325227964, - "grad_norm": 1.5724968910217285, - "learning_rate": 1.8843630267434e-06, - "loss": 0.2020694762468338, - "mean_token_accuracy": 0.8882503509521484, - "num_tokens": 21179866.0, - "step": 2371 - }, - { - "epoch": 1.8024316109422491, - "grad_norm": 2.1539509296417236, - "learning_rate": 1.8823333445479175e-06, - "loss": 0.37903186678886414, - "mean_token_accuracy": 0.8525497317314148, - "num_tokens": 21186941.0, - "step": 2372 - }, - { - "epoch": 1.8031914893617023, - "grad_norm": 2.0247764587402344, - "learning_rate": 1.8803040958546708e-06, - "loss": 0.293364018201828, - "mean_token_accuracy": 0.8954306244850159, - "num_tokens": 21193659.0, - "step": 2373 - }, - { - "epoch": 1.803951367781155, - "grad_norm": 1.7034926414489746, - "learning_rate": 1.8782752820878636e-06, - "loss": 0.33828210830688477, - "mean_token_accuracy": 0.9032940864562988, - "num_tokens": 21201399.0, - "step": 2374 - }, - { - "epoch": 1.8047112462006079, - "grad_norm": 1.7864601612091064, - "learning_rate": 1.8762469046713954e-06, - "loss": 0.3165147006511688, - "mean_token_accuracy": 0.8997465372085571, - "num_tokens": 21209105.0, - "step": 2375 - }, - { - "epoch": 1.8054711246200608, - "grad_norm": 2.3371729850769043, - "learning_rate": 1.8742189650288617e-06, - "loss": 0.4036901593208313, - "mean_token_accuracy": 0.8549420833587646, - "num_tokens": 21215429.0, - "step": 2376 - }, - { - "epoch": 1.8062310030395137, - "grad_norm": 1.7922348976135254, - "learning_rate": 1.872191464583547e-06, - "loss": 0.4366671144962311, - "mean_token_accuracy": 0.8614166975021362, - "num_tokens": 21226823.0, - "step": 2377 - }, - { - "epoch": 1.8069908814589666, - "grad_norm": 2.1667943000793457, - "learning_rate": 1.8701644047584294e-06, - "loss": 0.3543647825717926, - "mean_token_accuracy": 0.9031318426132202, - "num_tokens": 21232823.0, - "step": 2378 - }, - { - "epoch": 1.8077507598784195, - "grad_norm": 1.7554421424865723, - "learning_rate": 1.868137786976177e-06, - "loss": 0.32704365253448486, - "mean_token_accuracy": 0.8990532755851746, - "num_tokens": 21242036.0, - "step": 2379 - }, - { - "epoch": 1.8085106382978724, - "grad_norm": 1.6723839044570923, - "learning_rate": 1.8661116126591492e-06, - "loss": 0.3665752410888672, - "mean_token_accuracy": 0.8828305006027222, - "num_tokens": 21251290.0, - "step": 2380 - }, - { - "epoch": 1.809270516717325, - "grad_norm": 1.5078409910202026, - "learning_rate": 1.8640858832293924e-06, - "loss": 0.368108332157135, - "mean_token_accuracy": 0.8720884323120117, - "num_tokens": 21263510.0, - "step": 2381 - }, - { - "epoch": 1.8100303951367782, - "grad_norm": 2.245553493499756, - "learning_rate": 1.8620606001086423e-06, - "loss": 0.3189915716648102, - "mean_token_accuracy": 0.9015103578567505, - "num_tokens": 21269690.0, - "step": 2382 - }, - { - "epoch": 1.810790273556231, - "grad_norm": 1.780027151107788, - "learning_rate": 1.8600357647183188e-06, - "loss": 0.40369710326194763, - "mean_token_accuracy": 0.8539618253707886, - "num_tokens": 21278523.0, - "step": 2383 - }, - { - "epoch": 1.811550151975684, - "grad_norm": 2.1727912425994873, - "learning_rate": 1.8580113784795306e-06, - "loss": 0.29285651445388794, - "mean_token_accuracy": 0.8954071998596191, - "num_tokens": 21284717.0, - "step": 2384 - }, - { - "epoch": 1.8123100303951367, - "grad_norm": 2.310225248336792, - "learning_rate": 1.8559874428130708e-06, - "loss": 0.3090948760509491, - "mean_token_accuracy": 0.8853784203529358, - "num_tokens": 21290484.0, - "step": 2385 - }, - { - "epoch": 1.8130699088145896, - "grad_norm": 1.6556873321533203, - "learning_rate": 1.8539639591394131e-06, - "loss": 0.4425269663333893, - "mean_token_accuracy": 0.8488757610321045, - "num_tokens": 21302588.0, - "step": 2386 - }, - { - "epoch": 1.8138297872340425, - "grad_norm": 1.9238256216049194, - "learning_rate": 1.8519409288787182e-06, - "loss": 0.4781329929828644, - "mean_token_accuracy": 0.8392970561981201, - "num_tokens": 21310598.0, - "step": 2387 - }, - { - "epoch": 1.8145896656534954, - "grad_norm": 1.4976142644882202, - "learning_rate": 1.8499183534508263e-06, - "loss": 0.36829859018325806, - "mean_token_accuracy": 0.8687542676925659, - "num_tokens": 21322668.0, - "step": 2388 - }, - { - "epoch": 1.8153495440729484, - "grad_norm": 2.0216941833496094, - "learning_rate": 1.8478962342752584e-06, - "loss": 0.385962575674057, - "mean_token_accuracy": 0.8908089399337769, - "num_tokens": 21330378.0, - "step": 2389 - }, - { - "epoch": 1.8161094224924013, - "grad_norm": 1.647863507270813, - "learning_rate": 1.8458745727712142e-06, - "loss": 0.30903705954551697, - "mean_token_accuracy": 0.8914397954940796, - "num_tokens": 21339932.0, - "step": 2390 - }, - { - "epoch": 1.8168693009118542, - "grad_norm": 1.5832399129867554, - "learning_rate": 1.8438533703575757e-06, - "loss": 0.3636384606361389, - "mean_token_accuracy": 0.8611595630645752, - "num_tokens": 21351557.0, - "step": 2391 - }, - { - "epoch": 1.8176291793313069, - "grad_norm": 3.0069241523742676, - "learning_rate": 1.8418326284528997e-06, - "loss": 0.37970617413520813, - "mean_token_accuracy": 0.8620643615722656, - "num_tokens": 21355704.0, - "step": 2392 - }, - { - "epoch": 1.81838905775076, - "grad_norm": 2.004526376724243, - "learning_rate": 1.8398123484754204e-06, - "loss": 0.5333225131034851, - "mean_token_accuracy": 0.8062554597854614, - "num_tokens": 21364640.0, - "step": 2393 - }, - { - "epoch": 1.8191489361702127, - "grad_norm": 1.449981689453125, - "learning_rate": 1.8377925318430478e-06, - "loss": 0.3736325800418854, - "mean_token_accuracy": 0.858788251876831, - "num_tokens": 21377025.0, - "step": 2394 - }, - { - "epoch": 1.8199088145896658, - "grad_norm": 1.1959524154663086, - "learning_rate": 1.8357731799733686e-06, - "loss": 0.3272058963775635, - "mean_token_accuracy": 0.8840590715408325, - "num_tokens": 21395378.0, - "step": 2395 - }, - { - "epoch": 1.8206686930091185, - "grad_norm": 2.134742498397827, - "learning_rate": 1.8337542942836406e-06, - "loss": 0.3737856149673462, - "mean_token_accuracy": 0.8674061298370361, - "num_tokens": 21402106.0, - "step": 2396 - }, - { - "epoch": 1.8214285714285714, - "grad_norm": 2.2179460525512695, - "learning_rate": 1.8317358761907945e-06, - "loss": 0.37301796674728394, - "mean_token_accuracy": 0.8605623245239258, - "num_tokens": 21408367.0, - "step": 2397 - }, - { - "epoch": 1.8221884498480243, - "grad_norm": 2.1718010902404785, - "learning_rate": 1.8297179271114345e-06, - "loss": 0.2772231101989746, - "mean_token_accuracy": 0.8997501730918884, - "num_tokens": 21414274.0, - "step": 2398 - }, - { - "epoch": 1.8229483282674772, - "grad_norm": 1.410933494567871, - "learning_rate": 1.827700448461836e-06, - "loss": 0.4834601581096649, - "mean_token_accuracy": 0.8382522463798523, - "num_tokens": 21429120.0, - "step": 2399 - }, - { - "epoch": 1.8237082066869301, - "grad_norm": 3.4779679775238037, - "learning_rate": 1.8256834416579423e-06, - "loss": 0.44643428921699524, - "mean_token_accuracy": 0.8308249711990356, - "num_tokens": 21432437.0, - "step": 2400 - }, - { - "epoch": 1.824468085106383, - "grad_norm": 1.374484658241272, - "learning_rate": 1.8236669081153657e-06, - "loss": 0.3947869837284088, - "mean_token_accuracy": 0.8605848550796509, - "num_tokens": 21445656.0, - "step": 2401 - }, - { - "epoch": 1.825227963525836, - "grad_norm": 1.9599316120147705, - "learning_rate": 1.8216508492493887e-06, - "loss": 0.49040719866752625, - "mean_token_accuracy": 0.839459240436554, - "num_tokens": 21452889.0, - "step": 2402 - }, - { - "epoch": 1.8259878419452886, - "grad_norm": 2.1267881393432617, - "learning_rate": 1.8196352664749578e-06, - "loss": 0.3233179450035095, - "mean_token_accuracy": 0.8841243386268616, - "num_tokens": 21458788.0, - "step": 2403 - }, - { - "epoch": 1.8267477203647418, - "grad_norm": 2.6356115341186523, - "learning_rate": 1.8176201612066874e-06, - "loss": 0.43436336517333984, - "mean_token_accuracy": 0.850265622138977, - "num_tokens": 21464305.0, - "step": 2404 - }, - { - "epoch": 1.8275075987841944, - "grad_norm": 2.0232386589050293, - "learning_rate": 1.8156055348588548e-06, - "loss": 0.37281763553619385, - "mean_token_accuracy": 0.8616300821304321, - "num_tokens": 21471722.0, - "step": 2405 - }, - { - "epoch": 1.8282674772036476, - "grad_norm": 3.2616260051727295, - "learning_rate": 1.8135913888454034e-06, - "loss": 0.2882898151874542, - "mean_token_accuracy": 0.9001147747039795, - "num_tokens": 21475400.0, - "step": 2406 - }, - { - "epoch": 1.8290273556231003, - "grad_norm": 2.1665611267089844, - "learning_rate": 1.8115777245799383e-06, - "loss": 0.45269185304641724, - "mean_token_accuracy": 0.8420798778533936, - "num_tokens": 21481827.0, - "step": 2407 - }, - { - "epoch": 1.8297872340425532, - "grad_norm": 1.4406569004058838, - "learning_rate": 1.8095645434757261e-06, - "loss": 0.43665701150894165, - "mean_token_accuracy": 0.8401381969451904, - "num_tokens": 21496441.0, - "step": 2408 - }, - { - "epoch": 1.830547112462006, - "grad_norm": 1.6756342649459839, - "learning_rate": 1.8075518469456944e-06, - "loss": 0.3521783947944641, - "mean_token_accuracy": 0.8737466335296631, - "num_tokens": 21505568.0, - "step": 2409 - }, - { - "epoch": 1.831306990881459, - "grad_norm": 1.6623140573501587, - "learning_rate": 1.8055396364024318e-06, - "loss": 0.344537615776062, - "mean_token_accuracy": 0.886972188949585, - "num_tokens": 21513252.0, - "step": 2410 - }, - { - "epoch": 1.832066869300912, - "grad_norm": 2.064835548400879, - "learning_rate": 1.803527913258186e-06, - "loss": 0.3252706229686737, - "mean_token_accuracy": 0.885245680809021, - "num_tokens": 21520242.0, - "step": 2411 - }, - { - "epoch": 1.8328267477203646, - "grad_norm": 1.9969112873077393, - "learning_rate": 1.8015166789248606e-06, - "loss": 0.34694376587867737, - "mean_token_accuracy": 0.8818766474723816, - "num_tokens": 21527524.0, - "step": 2412 - }, - { - "epoch": 1.8335866261398177, - "grad_norm": 2.086148977279663, - "learning_rate": 1.7995059348140165e-06, - "loss": 0.23109188675880432, - "mean_token_accuracy": 0.912773609161377, - "num_tokens": 21532829.0, - "step": 2413 - }, - { - "epoch": 1.8343465045592704, - "grad_norm": 1.80828058719635, - "learning_rate": 1.7974956823368728e-06, - "loss": 0.5422223210334778, - "mean_token_accuracy": 0.8058640956878662, - "num_tokens": 21544440.0, - "step": 2414 - }, - { - "epoch": 1.8351063829787235, - "grad_norm": 1.8121788501739502, - "learning_rate": 1.7954859229043017e-06, - "loss": 0.3674035668373108, - "mean_token_accuracy": 0.8628277778625488, - "num_tokens": 21553160.0, - "step": 2415 - }, - { - "epoch": 1.8358662613981762, - "grad_norm": 1.9307979345321655, - "learning_rate": 1.7934766579268292e-06, - "loss": 0.4528796672821045, - "mean_token_accuracy": 0.8328302502632141, - "num_tokens": 21563485.0, - "step": 2416 - }, - { - "epoch": 1.8366261398176293, - "grad_norm": 1.2312756776809692, - "learning_rate": 1.7914678888146347e-06, - "loss": 0.40424543619155884, - "mean_token_accuracy": 0.8571025133132935, - "num_tokens": 21582662.0, - "step": 2417 - }, - { - "epoch": 1.837386018237082, - "grad_norm": 1.6305770874023438, - "learning_rate": 1.7894596169775514e-06, - "loss": 0.36575305461883545, - "mean_token_accuracy": 0.8768579959869385, - "num_tokens": 21592930.0, - "step": 2418 - }, - { - "epoch": 1.838145896656535, - "grad_norm": 1.8107178211212158, - "learning_rate": 1.7874518438250598e-06, - "loss": 0.3260963261127472, - "mean_token_accuracy": 0.896018385887146, - "num_tokens": 21600509.0, - "step": 2419 - }, - { - "epoch": 1.8389057750759878, - "grad_norm": 2.7195847034454346, - "learning_rate": 1.785444570766293e-06, - "loss": 0.2728347182273865, - "mean_token_accuracy": 0.9178709983825684, - "num_tokens": 21604489.0, - "step": 2420 - }, - { - "epoch": 1.8396656534954408, - "grad_norm": 1.9783591032028198, - "learning_rate": 1.7834377992100332e-06, - "loss": 0.3136378526687622, - "mean_token_accuracy": 0.8844017386436462, - "num_tokens": 21612060.0, - "step": 2421 - }, - { - "epoch": 1.8404255319148937, - "grad_norm": 2.1911418437957764, - "learning_rate": 1.7814315305647095e-06, - "loss": 0.39013993740081787, - "mean_token_accuracy": 0.8688976764678955, - "num_tokens": 21618778.0, - "step": 2422 - }, - { - "epoch": 1.8411854103343464, - "grad_norm": 1.9143604040145874, - "learning_rate": 1.779425766238398e-06, - "loss": 0.5113036632537842, - "mean_token_accuracy": 0.8329141139984131, - "num_tokens": 21628976.0, - "step": 2423 - }, - { - "epoch": 1.8419452887537995, - "grad_norm": 1.4184197187423706, - "learning_rate": 1.7774205076388207e-06, - "loss": 0.3821067810058594, - "mean_token_accuracy": 0.8604007959365845, - "num_tokens": 21643145.0, - "step": 2424 - }, - { - "epoch": 1.8427051671732522, - "grad_norm": 2.45896577835083, - "learning_rate": 1.7754157561733476e-06, - "loss": 0.3004961311817169, - "mean_token_accuracy": 0.89884352684021, - "num_tokens": 21647441.0, - "step": 2425 - }, - { - "epoch": 1.8434650455927053, - "grad_norm": 1.7999277114868164, - "learning_rate": 1.7734115132489887e-06, - "loss": 0.42533132433891296, - "mean_token_accuracy": 0.8838746547698975, - "num_tokens": 21657445.0, - "step": 2426 - }, - { - "epoch": 1.844224924012158, - "grad_norm": 2.099728584289551, - "learning_rate": 1.7714077802723994e-06, - "loss": 0.36200380325317383, - "mean_token_accuracy": 0.86548912525177, - "num_tokens": 21663966.0, - "step": 2427 - }, - { - "epoch": 1.844984802431611, - "grad_norm": 2.1970369815826416, - "learning_rate": 1.7694045586498754e-06, - "loss": 0.34944331645965576, - "mean_token_accuracy": 0.8670865297317505, - "num_tokens": 21670051.0, - "step": 2428 - }, - { - "epoch": 1.8457446808510638, - "grad_norm": 2.2928519248962402, - "learning_rate": 1.7674018497873568e-06, - "loss": 0.39500880241394043, - "mean_token_accuracy": 0.8744652271270752, - "num_tokens": 21676054.0, - "step": 2429 - }, - { - "epoch": 1.8465045592705167, - "grad_norm": 1.7598960399627686, - "learning_rate": 1.7653996550904208e-06, - "loss": 0.40113672614097595, - "mean_token_accuracy": 0.8552819490432739, - "num_tokens": 21685514.0, - "step": 2430 - }, - { - "epoch": 1.8472644376899696, - "grad_norm": 2.0529749393463135, - "learning_rate": 1.7633979759642844e-06, - "loss": 0.47586584091186523, - "mean_token_accuracy": 0.8412872552871704, - "num_tokens": 21693282.0, - "step": 2431 - }, - { - "epoch": 1.8480243161094225, - "grad_norm": 2.2423181533813477, - "learning_rate": 1.7613968138138027e-06, - "loss": 0.2757381796836853, - "mean_token_accuracy": 0.8992017507553101, - "num_tokens": 21698439.0, - "step": 2432 - }, - { - "epoch": 1.8487841945288754, - "grad_norm": 1.3280467987060547, - "learning_rate": 1.7593961700434692e-06, - "loss": 0.29535043239593506, - "mean_token_accuracy": 0.8943840861320496, - "num_tokens": 21711823.0, - "step": 2433 - }, - { - "epoch": 1.8495440729483281, - "grad_norm": 2.589221715927124, - "learning_rate": 1.7573960460574133e-06, - "loss": 0.46775516867637634, - "mean_token_accuracy": 0.8654797673225403, - "num_tokens": 21717180.0, - "step": 2434 - }, - { - "epoch": 1.8503039513677813, - "grad_norm": 2.1137642860412598, - "learning_rate": 1.7553964432593976e-06, - "loss": 0.3808780610561371, - "mean_token_accuracy": 0.8759565353393555, - "num_tokens": 21723980.0, - "step": 2435 - }, - { - "epoch": 1.851063829787234, - "grad_norm": 2.386967182159424, - "learning_rate": 1.75339736305282e-06, - "loss": 0.42688336968421936, - "mean_token_accuracy": 0.8488960266113281, - "num_tokens": 21730411.0, - "step": 2436 - }, - { - "epoch": 1.851823708206687, - "grad_norm": 1.586552619934082, - "learning_rate": 1.7513988068407145e-06, - "loss": 0.33497530221939087, - "mean_token_accuracy": 0.8809621334075928, - "num_tokens": 21740228.0, - "step": 2437 - }, - { - "epoch": 1.8525835866261398, - "grad_norm": 2.107167959213257, - "learning_rate": 1.7494007760257428e-06, - "loss": 0.3801528513431549, - "mean_token_accuracy": 0.8666986227035522, - "num_tokens": 21746718.0, - "step": 2438 - }, - { - "epoch": 1.8533434650455927, - "grad_norm": 2.514514684677124, - "learning_rate": 1.7474032720101991e-06, - "loss": 0.285498708486557, - "mean_token_accuracy": 0.901540219783783, - "num_tokens": 21751009.0, - "step": 2439 - }, - { - "epoch": 1.8541033434650456, - "grad_norm": 1.8152034282684326, - "learning_rate": 1.7454062961960102e-06, - "loss": 0.3704795241355896, - "mean_token_accuracy": 0.8630262613296509, - "num_tokens": 21760164.0, - "step": 2440 - }, - { - "epoch": 1.8548632218844985, - "grad_norm": 2.714531183242798, - "learning_rate": 1.7434098499847308e-06, - "loss": 0.5070809125900269, - "mean_token_accuracy": 0.8408594131469727, - "num_tokens": 21765602.0, - "step": 2441 - }, - { - "epoch": 1.8556231003039514, - "grad_norm": 2.173832893371582, - "learning_rate": 1.7414139347775423e-06, - "loss": 0.3500945568084717, - "mean_token_accuracy": 0.8733699321746826, - "num_tokens": 21772029.0, - "step": 2442 - }, - { - "epoch": 1.8563829787234043, - "grad_norm": 1.580376148223877, - "learning_rate": 1.7394185519752546e-06, - "loss": 0.5137908458709717, - "mean_token_accuracy": 0.8141944408416748, - "num_tokens": 21784531.0, - "step": 2443 - }, - { - "epoch": 1.8571428571428572, - "grad_norm": 2.079318046569824, - "learning_rate": 1.7374237029783064e-06, - "loss": 0.41820770502090454, - "mean_token_accuracy": 0.8513275384902954, - "num_tokens": 21792047.0, - "step": 2444 - }, - { - "epoch": 1.85790273556231, - "grad_norm": 2.6890387535095215, - "learning_rate": 1.7354293891867582e-06, - "loss": 0.3810037672519684, - "mean_token_accuracy": 0.8790096044540405, - "num_tokens": 21796634.0, - "step": 2445 - }, - { - "epoch": 1.858662613981763, - "grad_norm": 2.161081552505493, - "learning_rate": 1.7334356120002956e-06, - "loss": 0.48064762353897095, - "mean_token_accuracy": 0.8329977989196777, - "num_tokens": 21803509.0, - "step": 2446 - }, - { - "epoch": 1.8594224924012157, - "grad_norm": 1.9201551675796509, - "learning_rate": 1.7314423728182283e-06, - "loss": 0.36369895935058594, - "mean_token_accuracy": 0.8713955879211426, - "num_tokens": 21810528.0, - "step": 2447 - }, - { - "epoch": 1.8601823708206688, - "grad_norm": 1.8095223903656006, - "learning_rate": 1.7294496730394897e-06, - "loss": 0.41493499279022217, - "mean_token_accuracy": 0.855312705039978, - "num_tokens": 21821176.0, - "step": 2448 - }, - { - "epoch": 1.8609422492401215, - "grad_norm": 2.172389507293701, - "learning_rate": 1.7274575140626318e-06, - "loss": 0.3467463552951813, - "mean_token_accuracy": 0.8801594972610474, - "num_tokens": 21827486.0, - "step": 2449 - }, - { - "epoch": 1.8617021276595744, - "grad_norm": 2.8139185905456543, - "learning_rate": 1.7254658972858293e-06, - "loss": 0.35121995210647583, - "mean_token_accuracy": 0.8741901516914368, - "num_tokens": 21831915.0, - "step": 2450 - }, - { - "epoch": 1.8624620060790273, - "grad_norm": 1.2572762966156006, - "learning_rate": 1.7234748241068742e-06, - "loss": 0.3775328993797302, - "mean_token_accuracy": 0.8547425866127014, - "num_tokens": 21849623.0, - "step": 2451 - }, - { - "epoch": 1.8632218844984803, - "grad_norm": 1.2357900142669678, - "learning_rate": 1.7214842959231796e-06, - "loss": 0.28715917468070984, - "mean_token_accuracy": 0.9034290313720703, - "num_tokens": 21864507.0, - "step": 2452 - }, - { - "epoch": 1.8639817629179332, - "grad_norm": 1.2349165678024292, - "learning_rate": 1.719494314131775e-06, - "loss": 0.27918580174446106, - "mean_token_accuracy": 0.9073119759559631, - "num_tokens": 21878519.0, - "step": 2453 - }, - { - "epoch": 1.864741641337386, - "grad_norm": 1.960353136062622, - "learning_rate": 1.7175048801293042e-06, - "loss": 0.49304282665252686, - "mean_token_accuracy": 0.8193954229354858, - "num_tokens": 21886861.0, - "step": 2454 - }, - { - "epoch": 1.865501519756839, - "grad_norm": 1.480118751525879, - "learning_rate": 1.7155159953120315e-06, - "loss": 0.39433127641677856, - "mean_token_accuracy": 0.8674266338348389, - "num_tokens": 21899131.0, - "step": 2455 - }, - { - "epoch": 1.8662613981762917, - "grad_norm": 2.3136367797851562, - "learning_rate": 1.7135276610758309e-06, - "loss": 0.40943437814712524, - "mean_token_accuracy": 0.8511340022087097, - "num_tokens": 21905550.0, - "step": 2456 - }, - { - "epoch": 1.8670212765957448, - "grad_norm": 1.3622872829437256, - "learning_rate": 1.7115398788161923e-06, - "loss": 0.4255254566669464, - "mean_token_accuracy": 0.8457357883453369, - "num_tokens": 21919943.0, - "step": 2457 - }, - { - "epoch": 1.8677811550151975, - "grad_norm": 1.8197853565216064, - "learning_rate": 1.7095526499282172e-06, - "loss": 0.33384573459625244, - "mean_token_accuracy": 0.8757365942001343, - "num_tokens": 21928368.0, - "step": 2458 - }, - { - "epoch": 1.8685410334346506, - "grad_norm": 1.8771090507507324, - "learning_rate": 1.7075659758066207e-06, - "loss": 0.38854318857192993, - "mean_token_accuracy": 0.8565001487731934, - "num_tokens": 21936624.0, - "step": 2459 - }, - { - "epoch": 1.8693009118541033, - "grad_norm": 1.449811577796936, - "learning_rate": 1.7055798578457267e-06, - "loss": 0.45504286885261536, - "mean_token_accuracy": 0.8338158130645752, - "num_tokens": 21952192.0, - "step": 2460 - }, - { - "epoch": 1.8700607902735562, - "grad_norm": 2.253678321838379, - "learning_rate": 1.703594297439469e-06, - "loss": 0.44300752878189087, - "mean_token_accuracy": 0.8451106548309326, - "num_tokens": 21959107.0, - "step": 2461 - }, - { - "epoch": 1.8708206686930091, - "grad_norm": 2.5431747436523438, - "learning_rate": 1.7016092959813892e-06, - "loss": 0.34692925214767456, - "mean_token_accuracy": 0.8823766708374023, - "num_tokens": 21964543.0, - "step": 2462 - }, - { - "epoch": 1.871580547112462, - "grad_norm": 2.7001953125, - "learning_rate": 1.6996248548646393e-06, - "loss": 0.5270686745643616, - "mean_token_accuracy": 0.8366886377334595, - "num_tokens": 21970157.0, - "step": 2463 - }, - { - "epoch": 1.872340425531915, - "grad_norm": 2.3855581283569336, - "learning_rate": 1.6976409754819767e-06, - "loss": 0.40109893679618835, - "mean_token_accuracy": 0.8477234840393066, - "num_tokens": 21976046.0, - "step": 2464 - }, - { - "epoch": 1.8731003039513676, - "grad_norm": 1.6014364957809448, - "learning_rate": 1.6956576592257635e-06, - "loss": 0.4344262480735779, - "mean_token_accuracy": 0.8464433550834656, - "num_tokens": 21986299.0, - "step": 2465 - }, - { - "epoch": 1.8738601823708207, - "grad_norm": 2.221372127532959, - "learning_rate": 1.6936749074879663e-06, - "loss": 0.24239015579223633, - "mean_token_accuracy": 0.9185566306114197, - "num_tokens": 21991541.0, - "step": 2466 - }, - { - "epoch": 1.8746200607902734, - "grad_norm": 1.6672178506851196, - "learning_rate": 1.6916927216601593e-06, - "loss": 0.35219496488571167, - "mean_token_accuracy": 0.8668237328529358, - "num_tokens": 22000797.0, - "step": 2467 - }, - { - "epoch": 1.8753799392097266, - "grad_norm": 1.364131212234497, - "learning_rate": 1.6897111031335145e-06, - "loss": 0.4456409513950348, - "mean_token_accuracy": 0.8350487947463989, - "num_tokens": 22018297.0, - "step": 2468 - }, - { - "epoch": 1.8761398176291793, - "grad_norm": 1.4535794258117676, - "learning_rate": 1.6877300532988095e-06, - "loss": 0.395782470703125, - "mean_token_accuracy": 0.8482908010482788, - "num_tokens": 22030096.0, - "step": 2469 - }, - { - "epoch": 1.8768996960486324, - "grad_norm": 2.0192270278930664, - "learning_rate": 1.6857495735464196e-06, - "loss": 0.31406813859939575, - "mean_token_accuracy": 0.889453649520874, - "num_tokens": 22036082.0, - "step": 2470 - }, - { - "epoch": 1.877659574468085, - "grad_norm": 2.159257173538208, - "learning_rate": 1.6837696652663244e-06, - "loss": 0.43942126631736755, - "mean_token_accuracy": 0.8518660068511963, - "num_tokens": 22043413.0, - "step": 2471 - }, - { - "epoch": 1.878419452887538, - "grad_norm": 1.9774882793426514, - "learning_rate": 1.681790329848097e-06, - "loss": 0.42464935779571533, - "mean_token_accuracy": 0.8545591831207275, - "num_tokens": 22050290.0, - "step": 2472 - }, - { - "epoch": 1.8791793313069909, - "grad_norm": 1.0219167470932007, - "learning_rate": 1.6798115686809125e-06, - "loss": 0.36917346715927124, - "mean_token_accuracy": 0.8650286197662354, - "num_tokens": 22070408.0, - "step": 2473 - }, - { - "epoch": 1.8799392097264438, - "grad_norm": 1.2943378686904907, - "learning_rate": 1.677833383153542e-06, - "loss": 0.3434808850288391, - "mean_token_accuracy": 0.878541111946106, - "num_tokens": 22083567.0, - "step": 2474 - }, - { - "epoch": 1.8806990881458967, - "grad_norm": 3.582855224609375, - "learning_rate": 1.6758557746543518e-06, - "loss": 0.39738911390304565, - "mean_token_accuracy": 0.8951535224914551, - "num_tokens": 22086886.0, - "step": 2475 - }, - { - "epoch": 1.8814589665653494, - "grad_norm": 1.680220365524292, - "learning_rate": 1.673878744571304e-06, - "loss": 0.38146206736564636, - "mean_token_accuracy": 0.8596681356430054, - "num_tokens": 22095564.0, - "step": 2476 - }, - { - "epoch": 1.8822188449848025, - "grad_norm": 1.448194146156311, - "learning_rate": 1.6719022942919527e-06, - "loss": 0.43309977650642395, - "mean_token_accuracy": 0.8669528961181641, - "num_tokens": 22109333.0, - "step": 2477 - }, - { - "epoch": 1.8829787234042552, - "grad_norm": 1.5353537797927856, - "learning_rate": 1.6699264252034498e-06, - "loss": 0.4479079842567444, - "mean_token_accuracy": 0.8379873037338257, - "num_tokens": 22124735.0, - "step": 2478 - }, - { - "epoch": 1.8837386018237083, - "grad_norm": 1.1744320392608643, - "learning_rate": 1.6679511386925337e-06, - "loss": 0.31951260566711426, - "mean_token_accuracy": 0.8792685270309448, - "num_tokens": 22140882.0, - "step": 2479 - }, - { - "epoch": 1.884498480243161, - "grad_norm": 2.1996841430664062, - "learning_rate": 1.6659764361455383e-06, - "loss": 0.39045992493629456, - "mean_token_accuracy": 0.8587675094604492, - "num_tokens": 22146843.0, - "step": 2480 - }, - { - "epoch": 1.885258358662614, - "grad_norm": 3.494931697845459, - "learning_rate": 1.6640023189483836e-06, - "loss": 0.44756871461868286, - "mean_token_accuracy": 0.8643628358840942, - "num_tokens": 22150504.0, - "step": 2481 - }, - { - "epoch": 1.8860182370820668, - "grad_norm": 2.2455973625183105, - "learning_rate": 1.6620287884865831e-06, - "loss": 0.3308878540992737, - "mean_token_accuracy": 0.8748078942298889, - "num_tokens": 22156537.0, - "step": 2482 - }, - { - "epoch": 1.8867781155015197, - "grad_norm": 2.31868314743042, - "learning_rate": 1.6600558461452368e-06, - "loss": 0.46583569049835205, - "mean_token_accuracy": 0.8438903093338013, - "num_tokens": 22163501.0, - "step": 2483 - }, - { - "epoch": 1.8875379939209727, - "grad_norm": 1.5695412158966064, - "learning_rate": 1.65808349330903e-06, - "loss": 0.351986825466156, - "mean_token_accuracy": 0.8707568645477295, - "num_tokens": 22173880.0, - "step": 2484 - }, - { - "epoch": 1.8882978723404256, - "grad_norm": 1.4109563827514648, - "learning_rate": 1.656111731362236e-06, - "loss": 0.36058586835861206, - "mean_token_accuracy": 0.8606001138687134, - "num_tokens": 22189000.0, - "step": 2485 - }, - { - "epoch": 1.8890577507598785, - "grad_norm": 1.0398776531219482, - "learning_rate": 1.6541405616887138e-06, - "loss": 0.36524999141693115, - "mean_token_accuracy": 0.8690586090087891, - "num_tokens": 22209187.0, - "step": 2486 - }, - { - "epoch": 1.8898176291793312, - "grad_norm": 2.1050004959106445, - "learning_rate": 1.6521699856719065e-06, - "loss": 0.2988269329071045, - "mean_token_accuracy": 0.8887280225753784, - "num_tokens": 22215539.0, - "step": 2487 - }, - { - "epoch": 1.8905775075987843, - "grad_norm": 2.5606791973114014, - "learning_rate": 1.650200004694839e-06, - "loss": 0.41077330708503723, - "mean_token_accuracy": 0.8436049818992615, - "num_tokens": 22221133.0, - "step": 2488 - }, - { - "epoch": 1.891337386018237, - "grad_norm": 1.5786094665527344, - "learning_rate": 1.6482306201401211e-06, - "loss": 0.4217292368412018, - "mean_token_accuracy": 0.859939455986023, - "num_tokens": 22231578.0, - "step": 2489 - }, - { - "epoch": 1.89209726443769, - "grad_norm": 1.7131884098052979, - "learning_rate": 1.6462618333899422e-06, - "loss": 0.3945464789867401, - "mean_token_accuracy": 0.8679244518280029, - "num_tokens": 22241252.0, - "step": 2490 - }, - { - "epoch": 1.8928571428571428, - "grad_norm": 2.8350300788879395, - "learning_rate": 1.6442936458260723e-06, - "loss": 0.3992699384689331, - "mean_token_accuracy": 0.8717275857925415, - "num_tokens": 22246226.0, - "step": 2491 - }, - { - "epoch": 1.8936170212765957, - "grad_norm": 2.2180120944976807, - "learning_rate": 1.6423260588298608e-06, - "loss": 0.3381099998950958, - "mean_token_accuracy": 0.8968075513839722, - "num_tokens": 22252355.0, - "step": 2492 - }, - { - "epoch": 1.8943768996960486, - "grad_norm": 2.6498866081237793, - "learning_rate": 1.6403590737822378e-06, - "loss": 0.36339250206947327, - "mean_token_accuracy": 0.8633373379707336, - "num_tokens": 22257407.0, - "step": 2493 - }, - { - "epoch": 1.8951367781155015, - "grad_norm": 2.634241819381714, - "learning_rate": 1.6383926920637077e-06, - "loss": 0.2562698721885681, - "mean_token_accuracy": 0.8999600410461426, - "num_tokens": 22261858.0, - "step": 2494 - }, - { - "epoch": 1.8958966565349544, - "grad_norm": 2.0163333415985107, - "learning_rate": 1.6364269150543533e-06, - "loss": 0.3413389027118683, - "mean_token_accuracy": 0.8718398809432983, - "num_tokens": 22268517.0, - "step": 2495 - }, - { - "epoch": 1.8966565349544073, - "grad_norm": 2.8333005905151367, - "learning_rate": 1.6344617441338311e-06, - "loss": 0.4354540705680847, - "mean_token_accuracy": 0.8491238355636597, - "num_tokens": 22273648.0, - "step": 2496 - }, - { - "epoch": 1.8974164133738602, - "grad_norm": 1.6280957460403442, - "learning_rate": 1.6324971806813766e-06, - "loss": 0.3015792965888977, - "mean_token_accuracy": 0.8937206268310547, - "num_tokens": 22282521.0, - "step": 2497 - }, - { - "epoch": 1.898176291793313, - "grad_norm": 1.2246302366256714, - "learning_rate": 1.6305332260757937e-06, - "loss": 0.26619502902030945, - "mean_token_accuracy": 0.8886681199073792, - "num_tokens": 22295179.0, - "step": 2498 - }, - { - "epoch": 1.898936170212766, - "grad_norm": 2.4014432430267334, - "learning_rate": 1.6285698816954626e-06, - "loss": 0.3735058903694153, - "mean_token_accuracy": 0.8693109750747681, - "num_tokens": 22300681.0, - "step": 2499 - }, - { - "epoch": 1.8996960486322187, - "grad_norm": 1.4447300434112549, - "learning_rate": 1.6266071489183327e-06, - "loss": 0.40768876671791077, - "mean_token_accuracy": 0.8556059002876282, - "num_tokens": 22312442.0, - "step": 2500 - }, - { - "epoch": 1.9004559270516719, - "grad_norm": 2.1339821815490723, - "learning_rate": 1.6246450291219268e-06, - "loss": 0.33442017436027527, - "mean_token_accuracy": 0.8837105631828308, - "num_tokens": 22318779.0, - "step": 2501 - }, - { - "epoch": 1.9012158054711246, - "grad_norm": 2.8564913272857666, - "learning_rate": 1.6226835236833356e-06, - "loss": 0.36013197898864746, - "mean_token_accuracy": 0.8810569047927856, - "num_tokens": 22323390.0, - "step": 2502 - }, - { - "epoch": 1.9019756838905775, - "grad_norm": 2.1201915740966797, - "learning_rate": 1.620722633979219e-06, - "loss": 0.4587489664554596, - "mean_token_accuracy": 0.8517274856567383, - "num_tokens": 22330275.0, - "step": 2503 - }, - { - "epoch": 1.9027355623100304, - "grad_norm": 2.211402177810669, - "learning_rate": 1.6187623613858038e-06, - "loss": 0.3698349595069885, - "mean_token_accuracy": 0.8768182992935181, - "num_tokens": 22336041.0, - "step": 2504 - }, - { - "epoch": 1.9034954407294833, - "grad_norm": 1.421604871749878, - "learning_rate": 1.6168027072788868e-06, - "loss": 0.38086453080177307, - "mean_token_accuracy": 0.8622198104858398, - "num_tokens": 22349310.0, - "step": 2505 - }, - { - "epoch": 1.9042553191489362, - "grad_norm": 2.4304113388061523, - "learning_rate": 1.6148436730338279e-06, - "loss": 0.34694477915763855, - "mean_token_accuracy": 0.8833136558532715, - "num_tokens": 22355069.0, - "step": 2506 - }, - { - "epoch": 1.905015197568389, - "grad_norm": 2.1076772212982178, - "learning_rate": 1.6128852600255518e-06, - "loss": 0.4973800778388977, - "mean_token_accuracy": 0.851190984249115, - "num_tokens": 22362402.0, - "step": 2507 - }, - { - "epoch": 1.905775075987842, - "grad_norm": 3.0934200286865234, - "learning_rate": 1.6109274696285496e-06, - "loss": 0.46498024463653564, - "mean_token_accuracy": 0.8436626195907593, - "num_tokens": 22367390.0, - "step": 2508 - }, - { - "epoch": 1.9065349544072947, - "grad_norm": 2.0114359855651855, - "learning_rate": 1.6089703032168736e-06, - "loss": 0.45143815875053406, - "mean_token_accuracy": 0.852748692035675, - "num_tokens": 22377032.0, - "step": 2509 - }, - { - "epoch": 1.9072948328267478, - "grad_norm": 1.8780893087387085, - "learning_rate": 1.6070137621641382e-06, - "loss": 0.3977179527282715, - "mean_token_accuracy": 0.8556262850761414, - "num_tokens": 22386880.0, - "step": 2510 - }, - { - "epoch": 1.9080547112462005, - "grad_norm": 1.6748069524765015, - "learning_rate": 1.6050578478435184e-06, - "loss": 0.35590440034866333, - "mean_token_accuracy": 0.8702141046524048, - "num_tokens": 22396616.0, - "step": 2511 - }, - { - "epoch": 1.9088145896656536, - "grad_norm": 0.9799401760101318, - "learning_rate": 1.6031025616277512e-06, - "loss": 0.3325427770614624, - "mean_token_accuracy": 0.8771291971206665, - "num_tokens": 22419580.0, - "step": 2512 - }, - { - "epoch": 1.9095744680851063, - "grad_norm": 1.5084866285324097, - "learning_rate": 1.6011479048891323e-06, - "loss": 0.44336390495300293, - "mean_token_accuracy": 0.8786209225654602, - "num_tokens": 22434235.0, - "step": 2513 - }, - { - "epoch": 1.9103343465045592, - "grad_norm": 1.8544305562973022, - "learning_rate": 1.5991938789995138e-06, - "loss": 0.3055306375026703, - "mean_token_accuracy": 0.9043174982070923, - "num_tokens": 22442003.0, - "step": 2514 - }, - { - "epoch": 1.9110942249240122, - "grad_norm": 4.29932165145874, - "learning_rate": 1.5972404853303061e-06, - "loss": 0.386760413646698, - "mean_token_accuracy": 0.8914207220077515, - "num_tokens": 22444787.0, - "step": 2515 - }, - { - "epoch": 1.911854103343465, - "grad_norm": 1.7560505867004395, - "learning_rate": 1.595287725252478e-06, - "loss": 0.4141422510147095, - "mean_token_accuracy": 0.862310528755188, - "num_tokens": 22453625.0, - "step": 2516 - }, - { - "epoch": 1.912613981762918, - "grad_norm": 2.685443878173828, - "learning_rate": 1.5933356001365502e-06, - "loss": 0.36217260360717773, - "mean_token_accuracy": 0.868883490562439, - "num_tokens": 22458597.0, - "step": 2517 - }, - { - "epoch": 1.9133738601823707, - "grad_norm": 2.2587239742279053, - "learning_rate": 1.591384111352599e-06, - "loss": 0.5298880934715271, - "mean_token_accuracy": 0.821168839931488, - "num_tokens": 22466091.0, - "step": 2518 - }, - { - "epoch": 1.9141337386018238, - "grad_norm": 2.273380756378174, - "learning_rate": 1.5894332602702545e-06, - "loss": 0.3194117546081543, - "mean_token_accuracy": 0.8849239945411682, - "num_tokens": 22471785.0, - "step": 2519 - }, - { - "epoch": 1.9148936170212765, - "grad_norm": 2.314634084701538, - "learning_rate": 1.5874830482587003e-06, - "loss": 0.457550585269928, - "mean_token_accuracy": 0.8367670774459839, - "num_tokens": 22479091.0, - "step": 2520 - }, - { - "epoch": 1.9156534954407296, - "grad_norm": 2.16206693649292, - "learning_rate": 1.585533476686669e-06, - "loss": 0.43055859208106995, - "mean_token_accuracy": 0.8659856915473938, - "num_tokens": 22487379.0, - "step": 2521 - }, - { - "epoch": 1.9164133738601823, - "grad_norm": 2.2091798782348633, - "learning_rate": 1.5835845469224447e-06, - "loss": 0.45421302318573, - "mean_token_accuracy": 0.8418087959289551, - "num_tokens": 22493755.0, - "step": 2522 - }, - { - "epoch": 1.9171732522796354, - "grad_norm": 1.6166985034942627, - "learning_rate": 1.5816362603338632e-06, - "loss": 0.5211667418479919, - "mean_token_accuracy": 0.809440016746521, - "num_tokens": 22506648.0, - "step": 2523 - }, - { - "epoch": 1.917933130699088, - "grad_norm": 2.4998703002929688, - "learning_rate": 1.5796886182883053e-06, - "loss": 0.45915648341178894, - "mean_token_accuracy": 0.833067774772644, - "num_tokens": 22513216.0, - "step": 2524 - }, - { - "epoch": 1.918693009118541, - "grad_norm": 1.492928147315979, - "learning_rate": 1.577741622152702e-06, - "loss": 0.45581498742103577, - "mean_token_accuracy": 0.8531479835510254, - "num_tokens": 22524908.0, - "step": 2525 - }, - { - "epoch": 1.919452887537994, - "grad_norm": 2.0502207279205322, - "learning_rate": 1.5757952732935288e-06, - "loss": 0.4156759977340698, - "mean_token_accuracy": 0.8677599430084229, - "num_tokens": 22532275.0, - "step": 2526 - }, - { - "epoch": 1.9202127659574468, - "grad_norm": 2.4572031497955322, - "learning_rate": 1.5738495730768104e-06, - "loss": 0.43373313546180725, - "mean_token_accuracy": 0.8435516357421875, - "num_tokens": 22538272.0, - "step": 2527 - }, - { - "epoch": 1.9209726443768997, - "grad_norm": 2.071903705596924, - "learning_rate": 1.5719045228681127e-06, - "loss": 0.3211413621902466, - "mean_token_accuracy": 0.87841796875, - "num_tokens": 22545487.0, - "step": 2528 - }, - { - "epoch": 1.9217325227963524, - "grad_norm": 1.6742064952850342, - "learning_rate": 1.5699601240325474e-06, - "loss": 0.3704240322113037, - "mean_token_accuracy": 0.8646563291549683, - "num_tokens": 22554840.0, - "step": 2529 - }, - { - "epoch": 1.9224924012158056, - "grad_norm": 1.0941399335861206, - "learning_rate": 1.5680163779347668e-06, - "loss": 0.3595704436302185, - "mean_token_accuracy": 0.8680597543716431, - "num_tokens": 22572627.0, - "step": 2530 - }, - { - "epoch": 1.9232522796352582, - "grad_norm": 2.9815237522125244, - "learning_rate": 1.5660732859389687e-06, - "loss": 0.2941335141658783, - "mean_token_accuracy": 0.8847303986549377, - "num_tokens": 22576851.0, - "step": 2531 - }, - { - "epoch": 1.9240121580547114, - "grad_norm": 2.898106813430786, - "learning_rate": 1.5641308494088903e-06, - "loss": 0.4066317081451416, - "mean_token_accuracy": 0.8469538688659668, - "num_tokens": 22581431.0, - "step": 2532 - }, - { - "epoch": 1.924772036474164, - "grad_norm": 1.6757515668869019, - "learning_rate": 1.5621890697078069e-06, - "loss": 0.33923569321632385, - "mean_token_accuracy": 0.8790708184242249, - "num_tokens": 22590648.0, - "step": 2533 - }, - { - "epoch": 1.925531914893617, - "grad_norm": 1.747314214706421, - "learning_rate": 1.5602479481985333e-06, - "loss": 0.4865703582763672, - "mean_token_accuracy": 0.8314566612243652, - "num_tokens": 22600153.0, - "step": 2534 - }, - { - "epoch": 1.9262917933130699, - "grad_norm": 2.7927849292755127, - "learning_rate": 1.5583074862434254e-06, - "loss": 0.335658460855484, - "mean_token_accuracy": 0.8769067525863647, - "num_tokens": 22604864.0, - "step": 2535 - }, - { - "epoch": 1.9270516717325228, - "grad_norm": 2.2553000450134277, - "learning_rate": 1.5563676852043738e-06, - "loss": 0.4442562460899353, - "mean_token_accuracy": 0.8381515145301819, - "num_tokens": 22611102.0, - "step": 2536 - }, - { - "epoch": 1.9278115501519757, - "grad_norm": 1.1937638521194458, - "learning_rate": 1.5544285464428044e-06, - "loss": 0.38608425855636597, - "mean_token_accuracy": 0.8589644432067871, - "num_tokens": 22627781.0, - "step": 2537 - }, - { - "epoch": 1.9285714285714286, - "grad_norm": 3.282639980316162, - "learning_rate": 1.55249007131968e-06, - "loss": 0.31231993436813354, - "mean_token_accuracy": 0.8917703032493591, - "num_tokens": 22632341.0, - "step": 2538 - }, - { - "epoch": 1.9293313069908815, - "grad_norm": 2.3212976455688477, - "learning_rate": 1.5505522611954977e-06, - "loss": 0.34952571988105774, - "mean_token_accuracy": 0.8752106428146362, - "num_tokens": 22638572.0, - "step": 2539 - }, - { - "epoch": 1.9300911854103342, - "grad_norm": 1.389098882675171, - "learning_rate": 1.548615117430286e-06, - "loss": 0.4298851788043976, - "mean_token_accuracy": 0.871698260307312, - "num_tokens": 22651875.0, - "step": 2540 - }, - { - "epoch": 1.9308510638297873, - "grad_norm": 1.5333977937698364, - "learning_rate": 1.5466786413836077e-06, - "loss": 0.45540744066238403, - "mean_token_accuracy": 0.8409075736999512, - "num_tokens": 22662903.0, - "step": 2541 - }, - { - "epoch": 1.93161094224924, - "grad_norm": 1.7833251953125, - "learning_rate": 1.5447428344145565e-06, - "loss": 0.333247572183609, - "mean_token_accuracy": 0.8796100616455078, - "num_tokens": 22671125.0, - "step": 2542 - }, - { - "epoch": 1.9323708206686931, - "grad_norm": 1.5165303945541382, - "learning_rate": 1.5428076978817564e-06, - "loss": 0.3085063099861145, - "mean_token_accuracy": 0.888705849647522, - "num_tokens": 22681482.0, - "step": 2543 - }, - { - "epoch": 1.9331306990881458, - "grad_norm": 2.3556196689605713, - "learning_rate": 1.5408732331433596e-06, - "loss": 0.44008776545524597, - "mean_token_accuracy": 0.8578170537948608, - "num_tokens": 22686952.0, - "step": 2544 - }, - { - "epoch": 1.9338905775075987, - "grad_norm": 2.9572882652282715, - "learning_rate": 1.538939441557048e-06, - "loss": 0.3779261112213135, - "mean_token_accuracy": 0.8657241463661194, - "num_tokens": 22691211.0, - "step": 2545 - }, - { - "epoch": 1.9346504559270516, - "grad_norm": 2.373473644256592, - "learning_rate": 1.5370063244800326e-06, - "loss": 0.4113072454929352, - "mean_token_accuracy": 0.872116208076477, - "num_tokens": 22697442.0, - "step": 2546 - }, - { - "epoch": 1.9354103343465046, - "grad_norm": 2.270207643508911, - "learning_rate": 1.5350738832690479e-06, - "loss": 0.4021070897579193, - "mean_token_accuracy": 0.8750372529029846, - "num_tokens": 22703693.0, - "step": 2547 - }, - { - "epoch": 1.9361702127659575, - "grad_norm": 2.429445266723633, - "learning_rate": 1.5331421192803565e-06, - "loss": 0.40210235118865967, - "mean_token_accuracy": 0.8593704104423523, - "num_tokens": 22709285.0, - "step": 2548 - }, - { - "epoch": 1.9369300911854104, - "grad_norm": 1.4576458930969238, - "learning_rate": 1.5312110338697427e-06, - "loss": 0.44822201132774353, - "mean_token_accuracy": 0.8737322688102722, - "num_tokens": 22723743.0, - "step": 2549 - }, - { - "epoch": 1.9376899696048633, - "grad_norm": 2.1008098125457764, - "learning_rate": 1.5292806283925192e-06, - "loss": 0.3514235019683838, - "mean_token_accuracy": 0.8689005374908447, - "num_tokens": 22730135.0, - "step": 2550 - }, - { - "epoch": 1.938449848024316, - "grad_norm": 1.9786806106567383, - "learning_rate": 1.5273509042035172e-06, - "loss": 0.4483771324157715, - "mean_token_accuracy": 0.8353633880615234, - "num_tokens": 22738717.0, - "step": 2551 - }, - { - "epoch": 1.939209726443769, - "grad_norm": 1.0649693012237549, - "learning_rate": 1.5254218626570927e-06, - "loss": 0.30712205171585083, - "mean_token_accuracy": 0.8802675008773804, - "num_tokens": 22757346.0, - "step": 2552 - }, - { - "epoch": 1.9399696048632218, - "grad_norm": 3.0401108264923096, - "learning_rate": 1.5234935051071193e-06, - "loss": 0.5213959217071533, - "mean_token_accuracy": 0.8249514102935791, - "num_tokens": 22762169.0, - "step": 2553 - }, - { - "epoch": 1.940729483282675, - "grad_norm": 2.892486572265625, - "learning_rate": 1.521565832906994e-06, - "loss": 0.5694394111633301, - "mean_token_accuracy": 0.8139263391494751, - "num_tokens": 22767824.0, - "step": 2554 - }, - { - "epoch": 1.9414893617021276, - "grad_norm": 1.6187207698822021, - "learning_rate": 1.519638847409632e-06, - "loss": 0.46748271584510803, - "mean_token_accuracy": 0.8541051149368286, - "num_tokens": 22778195.0, - "step": 2555 - }, - { - "epoch": 1.9422492401215805, - "grad_norm": 1.3857731819152832, - "learning_rate": 1.5177125499674639e-06, - "loss": 0.35661786794662476, - "mean_token_accuracy": 0.8711516857147217, - "num_tokens": 22792353.0, - "step": 2556 - }, - { - "epoch": 1.9430091185410334, - "grad_norm": 1.108441710472107, - "learning_rate": 1.515786941932441e-06, - "loss": 0.3537200391292572, - "mean_token_accuracy": 0.8739079833030701, - "num_tokens": 22813185.0, - "step": 2557 - }, - { - "epoch": 1.9437689969604863, - "grad_norm": 2.0528404712677, - "learning_rate": 1.5138620246560295e-06, - "loss": 0.4161028265953064, - "mean_token_accuracy": 0.8385938405990601, - "num_tokens": 22821227.0, - "step": 2558 - }, - { - "epoch": 1.9445288753799392, - "grad_norm": 1.5123628377914429, - "learning_rate": 1.5119377994892095e-06, - "loss": 0.4420986473560333, - "mean_token_accuracy": 0.8664361834526062, - "num_tokens": 22835064.0, - "step": 2559 - }, - { - "epoch": 1.9452887537993921, - "grad_norm": 2.5354838371276855, - "learning_rate": 1.5100142677824752e-06, - "loss": 0.3837323784828186, - "mean_token_accuracy": 0.8607655763626099, - "num_tokens": 22840455.0, - "step": 2560 - }, - { - "epoch": 1.946048632218845, - "grad_norm": 1.1354057788848877, - "learning_rate": 1.5080914308858375e-06, - "loss": 0.39776813983917236, - "mean_token_accuracy": 0.8586497902870178, - "num_tokens": 22858828.0, - "step": 2561 - }, - { - "epoch": 1.9468085106382977, - "grad_norm": 1.576740026473999, - "learning_rate": 1.5061692901488161e-06, - "loss": 0.3167848289012909, - "mean_token_accuracy": 0.8876185417175293, - "num_tokens": 22868674.0, - "step": 2562 - }, - { - "epoch": 1.9475683890577509, - "grad_norm": 1.4835401773452759, - "learning_rate": 1.5042478469204437e-06, - "loss": 0.44950318336486816, - "mean_token_accuracy": 0.8526639342308044, - "num_tokens": 22883019.0, - "step": 2563 - }, - { - "epoch": 1.9483282674772036, - "grad_norm": 1.617073655128479, - "learning_rate": 1.502327102549262e-06, - "loss": 0.45711010694503784, - "mean_token_accuracy": 0.834361732006073, - "num_tokens": 22896834.0, - "step": 2564 - }, - { - "epoch": 1.9490881458966567, - "grad_norm": 1.3348414897918701, - "learning_rate": 1.5004070583833252e-06, - "loss": 0.3691314458847046, - "mean_token_accuracy": 0.8779371380805969, - "num_tokens": 22912350.0, - "step": 2565 - }, - { - "epoch": 1.9498480243161094, - "grad_norm": 1.711234450340271, - "learning_rate": 1.4984877157701932e-06, - "loss": 0.38726937770843506, - "mean_token_accuracy": 0.8704015016555786, - "num_tokens": 22922575.0, - "step": 2566 - }, - { - "epoch": 1.9506079027355623, - "grad_norm": 2.4587950706481934, - "learning_rate": 1.4965690760569346e-06, - "loss": 0.4455464482307434, - "mean_token_accuracy": 0.8481032252311707, - "num_tokens": 22928717.0, - "step": 2567 - }, - { - "epoch": 1.9513677811550152, - "grad_norm": 2.4189560413360596, - "learning_rate": 1.4946511405901237e-06, - "loss": 0.4120418429374695, - "mean_token_accuracy": 0.8519487380981445, - "num_tokens": 22934977.0, - "step": 2568 - }, - { - "epoch": 1.952127659574468, - "grad_norm": 1.2503050565719604, - "learning_rate": 1.4927339107158437e-06, - "loss": 0.4434332251548767, - "mean_token_accuracy": 0.8448144793510437, - "num_tokens": 22950061.0, - "step": 2569 - }, - { - "epoch": 1.952887537993921, - "grad_norm": 1.788493275642395, - "learning_rate": 1.4908173877796784e-06, - "loss": 0.49203023314476013, - "mean_token_accuracy": 0.8601495623588562, - "num_tokens": 22961838.0, - "step": 2570 - }, - { - "epoch": 1.9536474164133737, - "grad_norm": 1.4260050058364868, - "learning_rate": 1.4889015731267186e-06, - "loss": 0.3286570906639099, - "mean_token_accuracy": 0.882429838180542, - "num_tokens": 22973192.0, - "step": 2571 - }, - { - "epoch": 1.9544072948328268, - "grad_norm": 1.6754822731018066, - "learning_rate": 1.486986468101555e-06, - "loss": 0.34655290842056274, - "mean_token_accuracy": 0.8807861804962158, - "num_tokens": 22983661.0, - "step": 2572 - }, - { - "epoch": 1.9551671732522795, - "grad_norm": 1.9064570665359497, - "learning_rate": 1.4850720740482842e-06, - "loss": 0.34020254015922546, - "mean_token_accuracy": 0.86677086353302, - "num_tokens": 22991231.0, - "step": 2573 - }, - { - "epoch": 1.9559270516717326, - "grad_norm": 1.977444052696228, - "learning_rate": 1.4831583923105e-06, - "loss": 0.21505260467529297, - "mean_token_accuracy": 0.921241819858551, - "num_tokens": 22996828.0, - "step": 2574 - }, - { - "epoch": 1.9566869300911853, - "grad_norm": 1.1019235849380493, - "learning_rate": 1.481245424231298e-06, - "loss": 0.3804295063018799, - "mean_token_accuracy": 0.8582668900489807, - "num_tokens": 23016018.0, - "step": 2575 - }, - { - "epoch": 1.9574468085106385, - "grad_norm": 1.7943179607391357, - "learning_rate": 1.4793331711532743e-06, - "loss": 0.38565245270729065, - "mean_token_accuracy": 0.8599048256874084, - "num_tokens": 23024461.0, - "step": 2576 - }, - { - "epoch": 1.9582066869300911, - "grad_norm": 2.273824453353882, - "learning_rate": 1.4774216344185204e-06, - "loss": 0.46297723054885864, - "mean_token_accuracy": 0.8294345140457153, - "num_tokens": 23031687.0, - "step": 2577 - }, - { - "epoch": 1.958966565349544, - "grad_norm": 2.308509111404419, - "learning_rate": 1.4755108153686275e-06, - "loss": 0.4366525411605835, - "mean_token_accuracy": 0.8515903949737549, - "num_tokens": 23037072.0, - "step": 2578 - }, - { - "epoch": 1.959726443768997, - "grad_norm": 2.069028377532959, - "learning_rate": 1.4736007153446803e-06, - "loss": 0.33900877833366394, - "mean_token_accuracy": 0.8937177658081055, - "num_tokens": 23043207.0, - "step": 2579 - }, - { - "epoch": 1.9604863221884499, - "grad_norm": 2.905163288116455, - "learning_rate": 1.4716913356872614e-06, - "loss": 0.3708382844924927, - "mean_token_accuracy": 0.8936747312545776, - "num_tokens": 23047020.0, - "step": 2580 - }, - { - "epoch": 1.9612462006079028, - "grad_norm": 2.4153175354003906, - "learning_rate": 1.4697826777364478e-06, - "loss": 0.473562091588974, - "mean_token_accuracy": 0.8350275158882141, - "num_tokens": 23053282.0, - "step": 2581 - }, - { - "epoch": 1.9620060790273555, - "grad_norm": 2.21589994430542, - "learning_rate": 1.467874742831808e-06, - "loss": 0.3812660276889801, - "mean_token_accuracy": 0.8623865842819214, - "num_tokens": 23059399.0, - "step": 2582 - }, - { - "epoch": 1.9627659574468086, - "grad_norm": 1.0847623348236084, - "learning_rate": 1.4659675323124037e-06, - "loss": 0.3846944570541382, - "mean_token_accuracy": 0.8633466958999634, - "num_tokens": 23081005.0, - "step": 2583 - }, - { - "epoch": 1.9635258358662613, - "grad_norm": 1.8754645586013794, - "learning_rate": 1.46406104751679e-06, - "loss": 0.3460300862789154, - "mean_token_accuracy": 0.8757443428039551, - "num_tokens": 23088710.0, - "step": 2584 - }, - { - "epoch": 1.9642857142857144, - "grad_norm": 2.13075852394104, - "learning_rate": 1.462155289783011e-06, - "loss": 0.3060935139656067, - "mean_token_accuracy": 0.9070644378662109, - "num_tokens": 23094862.0, - "step": 2585 - }, - { - "epoch": 1.965045592705167, - "grad_norm": 2.9674458503723145, - "learning_rate": 1.4602502604486e-06, - "loss": 0.4464406371116638, - "mean_token_accuracy": 0.8497441411018372, - "num_tokens": 23099821.0, - "step": 2586 - }, - { - "epoch": 1.96580547112462, - "grad_norm": 1.9171007871627808, - "learning_rate": 1.45834596085058e-06, - "loss": 0.3905114531517029, - "mean_token_accuracy": 0.8564352989196777, - "num_tokens": 23107804.0, - "step": 2587 - }, - { - "epoch": 1.966565349544073, - "grad_norm": 2.0817408561706543, - "learning_rate": 1.456442392325463e-06, - "loss": 0.3903818130493164, - "mean_token_accuracy": 0.8671162128448486, - "num_tokens": 23115224.0, - "step": 2588 - }, - { - "epoch": 1.9673252279635258, - "grad_norm": 2.6379549503326416, - "learning_rate": 1.4545395562092467e-06, - "loss": 0.22965987026691437, - "mean_token_accuracy": 0.9160916805267334, - "num_tokens": 23119184.0, - "step": 2589 - }, - { - "epoch": 1.9680851063829787, - "grad_norm": 2.525221824645996, - "learning_rate": 1.4526374538374133e-06, - "loss": 0.4132574498653412, - "mean_token_accuracy": 0.8486990332603455, - "num_tokens": 23124679.0, - "step": 2590 - }, - { - "epoch": 1.9688449848024316, - "grad_norm": 2.0362391471862793, - "learning_rate": 1.4507360865449318e-06, - "loss": 0.29624345898628235, - "mean_token_accuracy": 0.888127863407135, - "num_tokens": 23130756.0, - "step": 2591 - }, - { - "epoch": 1.9696048632218845, - "grad_norm": 1.5150481462478638, - "learning_rate": 1.4488354556662553e-06, - "loss": 0.3852264881134033, - "mean_token_accuracy": 0.8532775640487671, - "num_tokens": 23141597.0, - "step": 2592 - }, - { - "epoch": 1.9703647416413372, - "grad_norm": 1.5255193710327148, - "learning_rate": 1.4469355625353199e-06, - "loss": 0.37015780806541443, - "mean_token_accuracy": 0.8669752478599548, - "num_tokens": 23152487.0, - "step": 2593 - }, - { - "epoch": 1.9711246200607904, - "grad_norm": 1.1780041456222534, - "learning_rate": 1.4450364084855433e-06, - "loss": 0.34421291947364807, - "mean_token_accuracy": 0.8593694567680359, - "num_tokens": 23168769.0, - "step": 2594 - }, - { - "epoch": 1.971884498480243, - "grad_norm": 2.4549946784973145, - "learning_rate": 1.4431379948498254e-06, - "loss": 0.4000544548034668, - "mean_token_accuracy": 0.8551953434944153, - "num_tokens": 23175428.0, - "step": 2595 - }, - { - "epoch": 1.9726443768996962, - "grad_norm": 2.374192476272583, - "learning_rate": 1.4412403229605453e-06, - "loss": 0.31329840421676636, - "mean_token_accuracy": 0.8917277455329895, - "num_tokens": 23180678.0, - "step": 2596 - }, - { - "epoch": 1.9734042553191489, - "grad_norm": 1.268515706062317, - "learning_rate": 1.4393433941495638e-06, - "loss": 0.34808623790740967, - "mean_token_accuracy": 0.8726245164871216, - "num_tokens": 23194733.0, - "step": 2597 - }, - { - "epoch": 1.9741641337386018, - "grad_norm": 2.0898988246917725, - "learning_rate": 1.4374472097482156e-06, - "loss": 0.45849233865737915, - "mean_token_accuracy": 0.8414266109466553, - "num_tokens": 23202211.0, - "step": 2598 - }, - { - "epoch": 1.9749240121580547, - "grad_norm": 2.1497802734375, - "learning_rate": 1.4355517710873184e-06, - "loss": 0.4304521977901459, - "mean_token_accuracy": 0.8502874374389648, - "num_tokens": 23209623.0, - "step": 2599 - }, - { - "epoch": 1.9756838905775076, - "grad_norm": 1.821786880493164, - "learning_rate": 1.4336570794971643e-06, - "loss": 0.3910462558269501, - "mean_token_accuracy": 0.8962477445602417, - "num_tokens": 23218904.0, - "step": 2600 - }, - { - "epoch": 1.9764437689969605, - "grad_norm": 2.2523093223571777, - "learning_rate": 1.4317631363075186e-06, - "loss": 0.3456020951271057, - "mean_token_accuracy": 0.8703117370605469, - "num_tokens": 23225602.0, - "step": 2601 - }, - { - "epoch": 1.9772036474164134, - "grad_norm": 1.6920030117034912, - "learning_rate": 1.4298699428476236e-06, - "loss": 0.4629668593406677, - "mean_token_accuracy": 0.841956615447998, - "num_tokens": 23236812.0, - "step": 2602 - }, - { - "epoch": 1.9779635258358663, - "grad_norm": 1.8796344995498657, - "learning_rate": 1.427977500446199e-06, - "loss": 0.3302173316478729, - "mean_token_accuracy": 0.8769404888153076, - "num_tokens": 23245851.0, - "step": 2603 - }, - { - "epoch": 1.978723404255319, - "grad_norm": 2.4003775119781494, - "learning_rate": 1.4260858104314299e-06, - "loss": 0.48402607440948486, - "mean_token_accuracy": 0.8477497100830078, - "num_tokens": 23252429.0, - "step": 2604 - }, - { - "epoch": 1.9794832826747721, - "grad_norm": 3.576800584793091, - "learning_rate": 1.4241948741309783e-06, - "loss": 0.2943669259548187, - "mean_token_accuracy": 0.8933546543121338, - "num_tokens": 23255431.0, - "step": 2605 - }, - { - "epoch": 1.9802431610942248, - "grad_norm": 2.7589938640594482, - "learning_rate": 1.4223046928719764e-06, - "loss": 0.5138746500015259, - "mean_token_accuracy": 0.817468523979187, - "num_tokens": 23261351.0, - "step": 2606 - }, - { - "epoch": 1.981003039513678, - "grad_norm": 1.6950130462646484, - "learning_rate": 1.420415267981026e-06, - "loss": 0.2744991183280945, - "mean_token_accuracy": 0.9005721211433411, - "num_tokens": 23269482.0, - "step": 2607 - }, - { - "epoch": 1.9817629179331306, - "grad_norm": 1.5962934494018555, - "learning_rate": 1.418526600784198e-06, - "loss": 0.4629114270210266, - "mean_token_accuracy": 0.8337699174880981, - "num_tokens": 23279796.0, - "step": 2608 - }, - { - "epoch": 1.9825227963525835, - "grad_norm": 1.4962197542190552, - "learning_rate": 1.4166386926070322e-06, - "loss": 0.4217689633369446, - "mean_token_accuracy": 0.8445580005645752, - "num_tokens": 23293050.0, - "step": 2609 - }, - { - "epoch": 1.9832826747720365, - "grad_norm": 1.4243721961975098, - "learning_rate": 1.414751544774535e-06, - "loss": 0.4888152480125427, - "mean_token_accuracy": 0.8298524022102356, - "num_tokens": 23308501.0, - "step": 2610 - }, - { - "epoch": 1.9840425531914894, - "grad_norm": 1.5776121616363525, - "learning_rate": 1.412865158611179e-06, - "loss": 0.3156965970993042, - "mean_token_accuracy": 0.8773540258407593, - "num_tokens": 23317401.0, - "step": 2611 - }, - { - "epoch": 1.9848024316109423, - "grad_norm": 1.4690552949905396, - "learning_rate": 1.4109795354409045e-06, - "loss": 0.35854774713516235, - "mean_token_accuracy": 0.869156002998352, - "num_tokens": 23328891.0, - "step": 2612 - }, - { - "epoch": 1.9855623100303952, - "grad_norm": 1.5036180019378662, - "learning_rate": 1.4090946765871105e-06, - "loss": 0.3579009771347046, - "mean_token_accuracy": 0.8698509931564331, - "num_tokens": 23340473.0, - "step": 2613 - }, - { - "epoch": 1.986322188449848, - "grad_norm": 2.0811538696289062, - "learning_rate": 1.4072105833726685e-06, - "loss": 0.2905905246734619, - "mean_token_accuracy": 0.9131759405136108, - "num_tokens": 23346480.0, - "step": 2614 - }, - { - "epoch": 1.9870820668693008, - "grad_norm": 1.2866275310516357, - "learning_rate": 1.4053272571199037e-06, - "loss": 0.4091147184371948, - "mean_token_accuracy": 0.8537255525588989, - "num_tokens": 23361957.0, - "step": 2615 - }, - { - "epoch": 1.987841945288754, - "grad_norm": 1.439497470855713, - "learning_rate": 1.4034446991506084e-06, - "loss": 0.4888972342014313, - "mean_token_accuracy": 0.8451695442199707, - "num_tokens": 23374936.0, - "step": 2616 - }, - { - "epoch": 1.9886018237082066, - "grad_norm": 1.758204698562622, - "learning_rate": 1.401562910786034e-06, - "loss": 0.4976118803024292, - "mean_token_accuracy": 0.8346713781356812, - "num_tokens": 23386102.0, - "step": 2617 - }, - { - "epoch": 1.9893617021276597, - "grad_norm": 1.436486840248108, - "learning_rate": 1.3996818933468926e-06, - "loss": 0.42407113313674927, - "mean_token_accuracy": 0.8529444932937622, - "num_tokens": 23398645.0, - "step": 2618 - }, - { - "epoch": 1.9901215805471124, - "grad_norm": 2.1466588973999023, - "learning_rate": 1.397801648153354e-06, - "loss": 0.45519331097602844, - "mean_token_accuracy": 0.8460411429405212, - "num_tokens": 23406162.0, - "step": 2619 - }, - { - "epoch": 1.9908814589665653, - "grad_norm": 2.0492005348205566, - "learning_rate": 1.395922176525047e-06, - "loss": 0.31093084812164307, - "mean_token_accuracy": 0.8927264213562012, - "num_tokens": 23412051.0, - "step": 2620 - }, - { - "epoch": 1.9916413373860182, - "grad_norm": 2.2639048099517822, - "learning_rate": 1.3940434797810567e-06, - "loss": 0.3804079592227936, - "mean_token_accuracy": 0.8720212578773499, - "num_tokens": 23418252.0, - "step": 2621 - }, - { - "epoch": 1.9924012158054711, - "grad_norm": 1.9541687965393066, - "learning_rate": 1.3921655592399256e-06, - "loss": 0.38776344060897827, - "mean_token_accuracy": 0.858753502368927, - "num_tokens": 23425901.0, - "step": 2622 - }, - { - "epoch": 1.993161094224924, - "grad_norm": 1.5119032859802246, - "learning_rate": 1.3902884162196509e-06, - "loss": 0.39581215381622314, - "mean_token_accuracy": 0.8539663553237915, - "num_tokens": 23439390.0, - "step": 2623 - }, - { - "epoch": 1.993920972644377, - "grad_norm": 2.1608591079711914, - "learning_rate": 1.388412052037682e-06, - "loss": 0.41801220178604126, - "mean_token_accuracy": 0.8703387975692749, - "num_tokens": 23445725.0, - "step": 2624 - }, - { - "epoch": 1.9946808510638299, - "grad_norm": 2.463165521621704, - "learning_rate": 1.3865364680109239e-06, - "loss": 0.3252835273742676, - "mean_token_accuracy": 0.9031686186790466, - "num_tokens": 23451122.0, - "step": 2625 - }, - { - "epoch": 1.9954407294832825, - "grad_norm": 1.1901201009750366, - "learning_rate": 1.384661665455736e-06, - "loss": 0.3358447253704071, - "mean_token_accuracy": 0.8767676949501038, - "num_tokens": 23467381.0, - "step": 2626 - }, - { - "epoch": 1.9962006079027357, - "grad_norm": 1.3035757541656494, - "learning_rate": 1.3827876456879247e-06, - "loss": 0.3736562430858612, - "mean_token_accuracy": 0.849855899810791, - "num_tokens": 23482192.0, - "step": 2627 - }, - { - "epoch": 1.9969604863221884, - "grad_norm": 1.8807034492492676, - "learning_rate": 1.3809144100227483e-06, - "loss": 0.45943766832351685, - "mean_token_accuracy": 0.8456380367279053, - "num_tokens": 23495167.0, - "step": 2628 - }, - { - "epoch": 1.9977203647416415, - "grad_norm": 2.3645784854888916, - "learning_rate": 1.3790419597749198e-06, - "loss": 0.4271511435508728, - "mean_token_accuracy": 0.846099853515625, - "num_tokens": 23500790.0, - "step": 2629 - }, - { - "epoch": 1.9984802431610942, - "grad_norm": 1.8451792001724243, - "learning_rate": 1.3771702962585928e-06, - "loss": 0.38092344999313354, - "mean_token_accuracy": 0.8641276359558105, - "num_tokens": 23508845.0, - "step": 2630 - }, - { - "epoch": 1.999240121580547, - "grad_norm": 1.1115045547485352, - "learning_rate": 1.3752994207873743e-06, - "loss": 0.35954269766807556, - "mean_token_accuracy": 0.8642125129699707, - "num_tokens": 23527929.0, - "step": 2631 - }, - { - "epoch": 2.0, - "grad_norm": 1.406253457069397, - "learning_rate": 1.373429334674317e-06, - "loss": 0.33467042446136475, - "mean_token_accuracy": 0.8713197708129883, - "num_tokens": 23539356.0, - "step": 2632 - }, - { - "epoch": 2.0007598784194527, - "grad_norm": 2.8150978088378906, - "learning_rate": 1.3715600392319186e-06, - "loss": 0.22929656505584717, - "mean_token_accuracy": 0.9197485446929932, - "num_tokens": 23543746.0, - "step": 2633 - }, - { - "epoch": 2.001519756838906, - "grad_norm": 2.6291964054107666, - "learning_rate": 1.369691535772123e-06, - "loss": 0.290000855922699, - "mean_token_accuracy": 0.8979663848876953, - "num_tokens": 23548633.0, - "step": 2634 - }, - { - "epoch": 2.0022796352583585, - "grad_norm": 1.724357008934021, - "learning_rate": 1.3678238256063193e-06, - "loss": 0.3717018663883209, - "mean_token_accuracy": 0.8743406534194946, - "num_tokens": 23557187.0, - "step": 2635 - }, - { - "epoch": 2.0030395136778116, - "grad_norm": 2.3801965713500977, - "learning_rate": 1.3659569100453346e-06, - "loss": 0.3452329635620117, - "mean_token_accuracy": 0.8799462914466858, - "num_tokens": 23563321.0, - "step": 2636 - }, - { - "epoch": 2.0037993920972643, - "grad_norm": 1.8925955295562744, - "learning_rate": 1.3640907903994455e-06, - "loss": 0.32880955934524536, - "mean_token_accuracy": 0.888347864151001, - "num_tokens": 23570571.0, - "step": 2637 - }, - { - "epoch": 2.0045592705167175, - "grad_norm": 1.0761849880218506, - "learning_rate": 1.3622254679783665e-06, - "loss": 0.395224004983902, - "mean_token_accuracy": 0.8637001514434814, - "num_tokens": 23589504.0, - "step": 2638 - }, - { - "epoch": 2.00531914893617, - "grad_norm": 2.1172127723693848, - "learning_rate": 1.3603609440912508e-06, - "loss": 0.32195356488227844, - "mean_token_accuracy": 0.8984324932098389, - "num_tokens": 23595586.0, - "step": 2639 - }, - { - "epoch": 2.0060790273556233, - "grad_norm": 2.127723217010498, - "learning_rate": 1.3584972200466936e-06, - "loss": 0.4710606634616852, - "mean_token_accuracy": 0.8563182950019836, - "num_tokens": 23602747.0, - "step": 2640 - }, - { - "epoch": 2.006838905775076, - "grad_norm": 1.9752192497253418, - "learning_rate": 1.356634297152729e-06, - "loss": 0.24204617738723755, - "mean_token_accuracy": 0.9082983136177063, - "num_tokens": 23609005.0, - "step": 2641 - }, - { - "epoch": 2.007598784194529, - "grad_norm": 2.5435397624969482, - "learning_rate": 1.3547721767168273e-06, - "loss": 0.16702288389205933, - "mean_token_accuracy": 0.9353867769241333, - "num_tokens": 23612852.0, - "step": 2642 - }, - { - "epoch": 2.0083586626139818, - "grad_norm": 1.8113304376602173, - "learning_rate": 1.3529108600458967e-06, - "loss": 0.4245433509349823, - "mean_token_accuracy": 0.8446527719497681, - "num_tokens": 23621462.0, - "step": 2643 - }, - { - "epoch": 2.0091185410334345, - "grad_norm": 1.0438088178634644, - "learning_rate": 1.3510503484462807e-06, - "loss": 0.3710743188858032, - "mean_token_accuracy": 0.8731123208999634, - "num_tokens": 23642029.0, - "step": 2644 - }, - { - "epoch": 2.0098784194528876, - "grad_norm": 1.9650516510009766, - "learning_rate": 1.349190643223758e-06, - "loss": 0.32384324073791504, - "mean_token_accuracy": 0.8859044313430786, - "num_tokens": 23648970.0, - "step": 2645 - }, - { - "epoch": 2.0106382978723403, - "grad_norm": 1.4213180541992188, - "learning_rate": 1.347331745683542e-06, - "loss": 0.42391857504844666, - "mean_token_accuracy": 0.8568997383117676, - "num_tokens": 23663012.0, - "step": 2646 - }, - { - "epoch": 2.0113981762917934, - "grad_norm": 1.852386236190796, - "learning_rate": 1.3454736571302761e-06, - "loss": 0.37283188104629517, - "mean_token_accuracy": 0.9096506834030151, - "num_tokens": 23671632.0, - "step": 2647 - }, - { - "epoch": 2.012158054711246, - "grad_norm": 1.8350872993469238, - "learning_rate": 1.3436163788680411e-06, - "loss": 0.21148793399333954, - "mean_token_accuracy": 0.9306647181510925, - "num_tokens": 23678554.0, - "step": 2648 - }, - { - "epoch": 2.012917933130699, - "grad_norm": 1.8285188674926758, - "learning_rate": 1.3417599122003464e-06, - "loss": 0.2638583183288574, - "mean_token_accuracy": 0.904695987701416, - "num_tokens": 23686905.0, - "step": 2649 - }, - { - "epoch": 2.013677811550152, - "grad_norm": 1.1955424547195435, - "learning_rate": 1.3399042584301298e-06, - "loss": 0.30598434805870056, - "mean_token_accuracy": 0.8953701257705688, - "num_tokens": 23702734.0, - "step": 2650 - }, - { - "epoch": 2.014437689969605, - "grad_norm": 1.5378512144088745, - "learning_rate": 1.3380494188597603e-06, - "loss": 0.33754611015319824, - "mean_token_accuracy": 0.9063926935195923, - "num_tokens": 23715891.0, - "step": 2651 - }, - { - "epoch": 2.0151975683890577, - "grad_norm": 1.6957111358642578, - "learning_rate": 1.3361953947910394e-06, - "loss": 0.26302939653396606, - "mean_token_accuracy": 0.90192711353302, - "num_tokens": 23724034.0, - "step": 2652 - }, - { - "epoch": 2.015957446808511, - "grad_norm": 1.1756837368011475, - "learning_rate": 1.334342187525189e-06, - "loss": 0.3312695622444153, - "mean_token_accuracy": 0.870500385761261, - "num_tokens": 23741241.0, - "step": 2653 - }, - { - "epoch": 2.0167173252279635, - "grad_norm": 1.027145266532898, - "learning_rate": 1.3324897983628621e-06, - "loss": 0.2534530758857727, - "mean_token_accuracy": 0.894199550151825, - "num_tokens": 23758399.0, - "step": 2654 - }, - { - "epoch": 2.0174772036474162, - "grad_norm": 2.2585113048553467, - "learning_rate": 1.330638228604137e-06, - "loss": 0.4558389186859131, - "mean_token_accuracy": 0.8372241258621216, - "num_tokens": 23766871.0, - "step": 2655 - }, - { - "epoch": 2.0182370820668694, - "grad_norm": 1.886893630027771, - "learning_rate": 1.3287874795485168e-06, - "loss": 0.29894912242889404, - "mean_token_accuracy": 0.9086098670959473, - "num_tokens": 23774935.0, - "step": 2656 - }, - { - "epoch": 2.018996960486322, - "grad_norm": 2.082537889480591, - "learning_rate": 1.3269375524949286e-06, - "loss": 0.39323803782463074, - "mean_token_accuracy": 0.8598287105560303, - "num_tokens": 23781303.0, - "step": 2657 - }, - { - "epoch": 2.019756838905775, - "grad_norm": 1.7059803009033203, - "learning_rate": 1.3250884487417227e-06, - "loss": 0.17909850180149078, - "mean_token_accuracy": 0.9276094436645508, - "num_tokens": 23789148.0, - "step": 2658 - }, - { - "epoch": 2.020516717325228, - "grad_norm": 2.150275945663452, - "learning_rate": 1.3232401695866686e-06, - "loss": 0.3707781434059143, - "mean_token_accuracy": 0.8587700128555298, - "num_tokens": 23795484.0, - "step": 2659 - }, - { - "epoch": 2.021276595744681, - "grad_norm": 2.0554518699645996, - "learning_rate": 1.321392716326963e-06, - "loss": 0.33217954635620117, - "mean_token_accuracy": 0.874828577041626, - "num_tokens": 23802968.0, - "step": 2660 - }, - { - "epoch": 2.0220364741641337, - "grad_norm": 2.4556071758270264, - "learning_rate": 1.3195460902592193e-06, - "loss": 0.2790899872779846, - "mean_token_accuracy": 0.9071618914604187, - "num_tokens": 23807788.0, - "step": 2661 - }, - { - "epoch": 2.022796352583587, - "grad_norm": 1.7501509189605713, - "learning_rate": 1.3177002926794685e-06, - "loss": 0.3080750107765198, - "mean_token_accuracy": 0.8942672610282898, - "num_tokens": 23816023.0, - "step": 2662 - }, - { - "epoch": 2.0235562310030395, - "grad_norm": 1.3934804201126099, - "learning_rate": 1.3158553248831658e-06, - "loss": 0.286912202835083, - "mean_token_accuracy": 0.9284837245941162, - "num_tokens": 23827186.0, - "step": 2663 - }, - { - "epoch": 2.024316109422492, - "grad_norm": 1.2530465126037598, - "learning_rate": 1.3140111881651773e-06, - "loss": 0.2630627155303955, - "mean_token_accuracy": 0.9029854536056519, - "num_tokens": 23841399.0, - "step": 2664 - }, - { - "epoch": 2.0250759878419453, - "grad_norm": 1.3417384624481201, - "learning_rate": 1.312167883819791e-06, - "loss": 0.37794870138168335, - "mean_token_accuracy": 0.8722256422042847, - "num_tokens": 23856061.0, - "step": 2665 - }, - { - "epoch": 2.025835866261398, - "grad_norm": 2.234257698059082, - "learning_rate": 1.3103254131407082e-06, - "loss": 0.2739933133125305, - "mean_token_accuracy": 0.9055665135383606, - "num_tokens": 23861865.0, - "step": 2666 - }, - { - "epoch": 2.026595744680851, - "grad_norm": 1.4187006950378418, - "learning_rate": 1.308483777421046e-06, - "loss": 0.24370817840099335, - "mean_token_accuracy": 0.9145886301994324, - "num_tokens": 23873632.0, - "step": 2667 - }, - { - "epoch": 2.027355623100304, - "grad_norm": 2.3645882606506348, - "learning_rate": 1.3066429779533352e-06, - "loss": 0.23659822344779968, - "mean_token_accuracy": 0.9209753274917603, - "num_tokens": 23878866.0, - "step": 2668 - }, - { - "epoch": 2.028115501519757, - "grad_norm": 1.4782226085662842, - "learning_rate": 1.3048030160295196e-06, - "loss": 0.3353138267993927, - "mean_token_accuracy": 0.8747807741165161, - "num_tokens": 23891089.0, - "step": 2669 - }, - { - "epoch": 2.0288753799392096, - "grad_norm": 2.051754951477051, - "learning_rate": 1.3029638929409555e-06, - "loss": 0.2905973196029663, - "mean_token_accuracy": 0.887441873550415, - "num_tokens": 23897653.0, - "step": 2670 - }, - { - "epoch": 2.0296352583586628, - "grad_norm": 1.322279453277588, - "learning_rate": 1.3011256099784103e-06, - "loss": 0.3938416540622711, - "mean_token_accuracy": 0.8911079168319702, - "num_tokens": 23912525.0, - "step": 2671 - }, - { - "epoch": 2.0303951367781155, - "grad_norm": 1.87980318069458, - "learning_rate": 1.2992881684320627e-06, - "loss": 0.16637520492076874, - "mean_token_accuracy": 0.9472321271896362, - "num_tokens": 23918752.0, - "step": 2672 - }, - { - "epoch": 2.0311550151975686, - "grad_norm": 2.0867233276367188, - "learning_rate": 1.297451569591498e-06, - "loss": 0.37282776832580566, - "mean_token_accuracy": 0.8688399195671082, - "num_tokens": 23925918.0, - "step": 2673 - }, - { - "epoch": 2.0319148936170213, - "grad_norm": 1.129468560218811, - "learning_rate": 1.2956158147457116e-06, - "loss": 0.33072173595428467, - "mean_token_accuracy": 0.8788217306137085, - "num_tokens": 23944702.0, - "step": 2674 - }, - { - "epoch": 2.032674772036474, - "grad_norm": 3.6016290187835693, - "learning_rate": 1.2937809051831102e-06, - "loss": 0.28343498706817627, - "mean_token_accuracy": 0.911794900894165, - "num_tokens": 23948417.0, - "step": 2675 - }, - { - "epoch": 2.033434650455927, - "grad_norm": 1.4904811382293701, - "learning_rate": 1.2919468421915008e-06, - "loss": 0.4072638750076294, - "mean_token_accuracy": 0.8615934252738953, - "num_tokens": 23963654.0, - "step": 2676 - }, - { - "epoch": 2.0341945288753798, - "grad_norm": 2.90740704536438, - "learning_rate": 1.2901136270580994e-06, - "loss": 0.3685106635093689, - "mean_token_accuracy": 0.8923419713973999, - "num_tokens": 23968608.0, - "step": 2677 - }, - { - "epoch": 2.034954407294833, - "grad_norm": 1.8772104978561401, - "learning_rate": 1.2882812610695305e-06, - "loss": 0.2947828471660614, - "mean_token_accuracy": 0.9065762758255005, - "num_tokens": 23978298.0, - "step": 2678 - }, - { - "epoch": 2.0357142857142856, - "grad_norm": 1.2135536670684814, - "learning_rate": 1.2864497455118152e-06, - "loss": 0.36015012860298157, - "mean_token_accuracy": 0.8481813073158264, - "num_tokens": 23995784.0, - "step": 2679 - }, - { - "epoch": 2.0364741641337387, - "grad_norm": 1.941889762878418, - "learning_rate": 1.2846190816703836e-06, - "loss": 0.3004198670387268, - "mean_token_accuracy": 0.8843618631362915, - "num_tokens": 24002651.0, - "step": 2680 - }, - { - "epoch": 2.0372340425531914, - "grad_norm": 1.8905075788497925, - "learning_rate": 1.2827892708300648e-06, - "loss": 0.26640570163726807, - "mean_token_accuracy": 0.9079146385192871, - "num_tokens": 24010400.0, - "step": 2681 - }, - { - "epoch": 2.0379939209726445, - "grad_norm": 1.2975934743881226, - "learning_rate": 1.280960314275092e-06, - "loss": 0.19093887507915497, - "mean_token_accuracy": 0.9277223348617554, - "num_tokens": 24021528.0, - "step": 2682 - }, - { - "epoch": 2.038753799392097, - "grad_norm": 1.6483098268508911, - "learning_rate": 1.279132213289096e-06, - "loss": 0.29260069131851196, - "mean_token_accuracy": 0.892486572265625, - "num_tokens": 24030470.0, - "step": 2683 - }, - { - "epoch": 2.0395136778115504, - "grad_norm": 1.6875916719436646, - "learning_rate": 1.2773049691551103e-06, - "loss": 0.3784627914428711, - "mean_token_accuracy": 0.8682783842086792, - "num_tokens": 24041608.0, - "step": 2684 - }, - { - "epoch": 2.040273556231003, - "grad_norm": 2.1055848598480225, - "learning_rate": 1.2754785831555617e-06, - "loss": 0.14676237106323242, - "mean_token_accuracy": 0.9532995223999023, - "num_tokens": 24046687.0, - "step": 2685 - }, - { - "epoch": 2.0410334346504557, - "grad_norm": 1.3862961530685425, - "learning_rate": 1.273653056572282e-06, - "loss": 0.34408485889434814, - "mean_token_accuracy": 0.8748919367790222, - "num_tokens": 24059147.0, - "step": 2686 - }, - { - "epoch": 2.041793313069909, - "grad_norm": 2.936876058578491, - "learning_rate": 1.2718283906864939e-06, - "loss": 0.2471027672290802, - "mean_token_accuracy": 0.9177526235580444, - "num_tokens": 24062963.0, - "step": 2687 - }, - { - "epoch": 2.0425531914893615, - "grad_norm": 1.3992520570755005, - "learning_rate": 1.2700045867788184e-06, - "loss": 0.421109139919281, - "mean_token_accuracy": 0.8664785623550415, - "num_tokens": 24077912.0, - "step": 2688 - }, - { - "epoch": 2.0433130699088147, - "grad_norm": 3.0531985759735107, - "learning_rate": 1.2681816461292715e-06, - "loss": 0.292591392993927, - "mean_token_accuracy": 0.8992351293563843, - "num_tokens": 24082058.0, - "step": 2689 - }, - { - "epoch": 2.0440729483282674, - "grad_norm": 1.4562251567840576, - "learning_rate": 1.2663595700172631e-06, - "loss": 0.39367130398750305, - "mean_token_accuracy": 0.8894597887992859, - "num_tokens": 24093954.0, - "step": 2690 - }, - { - "epoch": 2.0448328267477205, - "grad_norm": 1.9354028701782227, - "learning_rate": 1.2645383597215965e-06, - "loss": 0.28203579783439636, - "mean_token_accuracy": 0.9011955261230469, - "num_tokens": 24100590.0, - "step": 2691 - }, - { - "epoch": 2.045592705167173, - "grad_norm": 1.5010690689086914, - "learning_rate": 1.2627180165204671e-06, - "loss": 0.3463609516620636, - "mean_token_accuracy": 0.8978298306465149, - "num_tokens": 24111104.0, - "step": 2692 - }, - { - "epoch": 2.0463525835866263, - "grad_norm": 2.585813045501709, - "learning_rate": 1.2608985416914616e-06, - "loss": 0.2142711877822876, - "mean_token_accuracy": 0.9260460138320923, - "num_tokens": 24115301.0, - "step": 2693 - }, - { - "epoch": 2.047112462006079, - "grad_norm": 2.317268133163452, - "learning_rate": 1.259079936511558e-06, - "loss": 0.14454546570777893, - "mean_token_accuracy": 0.9498077034950256, - "num_tokens": 24120295.0, - "step": 2694 - }, - { - "epoch": 2.047872340425532, - "grad_norm": 1.966550350189209, - "learning_rate": 1.257262202257124e-06, - "loss": 0.20745311677455902, - "mean_token_accuracy": 0.9157166481018066, - "num_tokens": 24127158.0, - "step": 2695 - }, - { - "epoch": 2.048632218844985, - "grad_norm": 1.6521401405334473, - "learning_rate": 1.2554453402039124e-06, - "loss": 0.2547406256198883, - "mean_token_accuracy": 0.9356101751327515, - "num_tokens": 24135620.0, - "step": 2696 - }, - { - "epoch": 2.0493920972644375, - "grad_norm": 2.341756582260132, - "learning_rate": 1.2536293516270704e-06, - "loss": 0.35540008544921875, - "mean_token_accuracy": 0.874363899230957, - "num_tokens": 24141766.0, - "step": 2697 - }, - { - "epoch": 2.0501519756838906, - "grad_norm": 1.7938716411590576, - "learning_rate": 1.251814237801128e-06, - "loss": 0.37250861525535583, - "mean_token_accuracy": 0.8644422292709351, - "num_tokens": 24149997.0, - "step": 2698 - }, - { - "epoch": 2.0509118541033433, - "grad_norm": 2.0868122577667236, - "learning_rate": 1.2500000000000007e-06, - "loss": 0.44527092576026917, - "mean_token_accuracy": 0.8510264158248901, - "num_tokens": 24158208.0, - "step": 2699 - }, - { - "epoch": 2.0516717325227964, - "grad_norm": 2.412604808807373, - "learning_rate": 1.24818663949699e-06, - "loss": 0.19276219606399536, - "mean_token_accuracy": 0.9317681789398193, - "num_tokens": 24162905.0, - "step": 2700 - }, - { - "epoch": 2.052431610942249, - "grad_norm": 1.4488455057144165, - "learning_rate": 1.246374157564785e-06, - "loss": 0.3493705093860626, - "mean_token_accuracy": 0.9016396999359131, - "num_tokens": 24175852.0, - "step": 2701 - }, - { - "epoch": 2.0531914893617023, - "grad_norm": 2.1629185676574707, - "learning_rate": 1.2445625554754526e-06, - "loss": 0.30588388442993164, - "mean_token_accuracy": 0.8871392011642456, - "num_tokens": 24181507.0, - "step": 2702 - }, - { - "epoch": 2.053951367781155, - "grad_norm": 2.0489449501037598, - "learning_rate": 1.2427518345004459e-06, - "loss": 0.4578161835670471, - "mean_token_accuracy": 0.8498104214668274, - "num_tokens": 24191918.0, - "step": 2703 - }, - { - "epoch": 2.054711246200608, - "grad_norm": 2.063019037246704, - "learning_rate": 1.2409419959105981e-06, - "loss": 0.31680572032928467, - "mean_token_accuracy": 0.8809083700180054, - "num_tokens": 24199336.0, - "step": 2704 - }, - { - "epoch": 2.0554711246200608, - "grad_norm": 2.4594223499298096, - "learning_rate": 1.239133040976124e-06, - "loss": 0.3048282265663147, - "mean_token_accuracy": 0.8897095322608948, - "num_tokens": 24205118.0, - "step": 2705 - }, - { - "epoch": 2.056231003039514, - "grad_norm": 1.6359999179840088, - "learning_rate": 1.237324970966618e-06, - "loss": 0.4312370717525482, - "mean_token_accuracy": 0.8526142835617065, - "num_tokens": 24215792.0, - "step": 2706 - }, - { - "epoch": 2.0569908814589666, - "grad_norm": 1.5534536838531494, - "learning_rate": 1.2355177871510538e-06, - "loss": 0.3647908568382263, - "mean_token_accuracy": 0.8680631518363953, - "num_tokens": 24235325.0, - "step": 2707 - }, - { - "epoch": 2.0577507598784193, - "grad_norm": 2.4902515411376953, - "learning_rate": 1.2337114907977798e-06, - "loss": 0.3605276942253113, - "mean_token_accuracy": 0.8776376843452454, - "num_tokens": 24241502.0, - "step": 2708 - }, - { - "epoch": 2.0585106382978724, - "grad_norm": 1.7282993793487549, - "learning_rate": 1.2319060831745273e-06, - "loss": 0.38326722383499146, - "mean_token_accuracy": 0.8531644344329834, - "num_tokens": 24252665.0, - "step": 2709 - }, - { - "epoch": 2.059270516717325, - "grad_norm": 1.4213361740112305, - "learning_rate": 1.2301015655484006e-06, - "loss": 0.32221150398254395, - "mean_token_accuracy": 0.8890664577484131, - "num_tokens": 24266409.0, - "step": 2710 - }, - { - "epoch": 2.060030395136778, - "grad_norm": 2.6412453651428223, - "learning_rate": 1.2282979391858767e-06, - "loss": 0.20225220918655396, - "mean_token_accuracy": 0.9287782311439514, - "num_tokens": 24271069.0, - "step": 2711 - }, - { - "epoch": 2.060790273556231, - "grad_norm": 3.2601654529571533, - "learning_rate": 1.2264952053528145e-06, - "loss": 0.23259003460407257, - "mean_token_accuracy": 0.9290606379508972, - "num_tokens": 24274992.0, - "step": 2712 - }, - { - "epoch": 2.061550151975684, - "grad_norm": 1.6633410453796387, - "learning_rate": 1.2246933653144386e-06, - "loss": 0.355314165353775, - "mean_token_accuracy": 0.870380163192749, - "num_tokens": 24284917.0, - "step": 2713 - }, - { - "epoch": 2.0623100303951367, - "grad_norm": 2.9081318378448486, - "learning_rate": 1.2228924203353507e-06, - "loss": 0.38050833344459534, - "mean_token_accuracy": 0.8879997730255127, - "num_tokens": 24289694.0, - "step": 2714 - }, - { - "epoch": 2.06306990881459, - "grad_norm": 3.2404227256774902, - "learning_rate": 1.2210923716795233e-06, - "loss": 0.2502570152282715, - "mean_token_accuracy": 0.9150978922843933, - "num_tokens": 24293254.0, - "step": 2715 - }, - { - "epoch": 2.0638297872340425, - "grad_norm": 1.9262174367904663, - "learning_rate": 1.2192932206103e-06, - "loss": 0.26763200759887695, - "mean_token_accuracy": 0.9203122854232788, - "num_tokens": 24300881.0, - "step": 2716 - }, - { - "epoch": 2.0645896656534957, - "grad_norm": 1.6790109872817993, - "learning_rate": 1.2174949683903943e-06, - "loss": 0.22275440394878387, - "mean_token_accuracy": 0.9212621450424194, - "num_tokens": 24309288.0, - "step": 2717 - }, - { - "epoch": 2.0653495440729484, - "grad_norm": 1.8272414207458496, - "learning_rate": 1.2156976162818895e-06, - "loss": 0.3183424472808838, - "mean_token_accuracy": 0.8813169002532959, - "num_tokens": 24316980.0, - "step": 2718 - }, - { - "epoch": 2.066109422492401, - "grad_norm": 2.7388651371002197, - "learning_rate": 1.2139011655462338e-06, - "loss": 0.24794816970825195, - "mean_token_accuracy": 0.9109550714492798, - "num_tokens": 24321867.0, - "step": 2719 - }, - { - "epoch": 2.066869300911854, - "grad_norm": 1.4866925477981567, - "learning_rate": 1.2121056174442484e-06, - "loss": 0.24177205562591553, - "mean_token_accuracy": 0.9102780818939209, - "num_tokens": 24332874.0, - "step": 2720 - }, - { - "epoch": 2.067629179331307, - "grad_norm": 1.6006059646606445, - "learning_rate": 1.2103109732361178e-06, - "loss": 0.29220807552337646, - "mean_token_accuracy": 0.8947570323944092, - "num_tokens": 24342790.0, - "step": 2721 - }, - { - "epoch": 2.06838905775076, - "grad_norm": 2.2688677310943604, - "learning_rate": 1.208517234181391e-06, - "loss": 0.39247143268585205, - "mean_token_accuracy": 0.8514304161071777, - "num_tokens": 24349329.0, - "step": 2722 - }, - { - "epoch": 2.0691489361702127, - "grad_norm": 2.404534339904785, - "learning_rate": 1.2067244015389829e-06, - "loss": 0.4461793303489685, - "mean_token_accuracy": 0.8531662821769714, - "num_tokens": 24356287.0, - "step": 2723 - }, - { - "epoch": 2.069908814589666, - "grad_norm": 1.813341498374939, - "learning_rate": 1.204932476567175e-06, - "loss": 0.38300177454948425, - "mean_token_accuracy": 0.8597674369812012, - "num_tokens": 24366181.0, - "step": 2724 - }, - { - "epoch": 2.0706686930091185, - "grad_norm": 3.49125337600708, - "learning_rate": 1.2031414605236066e-06, - "loss": 0.33281540870666504, - "mean_token_accuracy": 0.8774969577789307, - "num_tokens": 24370362.0, - "step": 2725 - }, - { - "epoch": 2.0714285714285716, - "grad_norm": 1.7682114839553833, - "learning_rate": 1.2013513546652827e-06, - "loss": 0.3001813590526581, - "mean_token_accuracy": 0.8840254545211792, - "num_tokens": 24380469.0, - "step": 2726 - }, - { - "epoch": 2.0721884498480243, - "grad_norm": 2.3688952922821045, - "learning_rate": 1.1995621602485685e-06, - "loss": 0.20055249333381653, - "mean_token_accuracy": 0.9246129989624023, - "num_tokens": 24385474.0, - "step": 2727 - }, - { - "epoch": 2.072948328267477, - "grad_norm": 2.3368382453918457, - "learning_rate": 1.1977738785291894e-06, - "loss": 0.18379954993724823, - "mean_token_accuracy": 0.9385529160499573, - "num_tokens": 24390002.0, - "step": 2728 - }, - { - "epoch": 2.07370820668693, - "grad_norm": 1.857473373413086, - "learning_rate": 1.1959865107622306e-06, - "loss": 0.4606894552707672, - "mean_token_accuracy": 0.8437427282333374, - "num_tokens": 24400880.0, - "step": 2729 - }, - { - "epoch": 2.074468085106383, - "grad_norm": 1.2714136838912964, - "learning_rate": 1.1942000582021355e-06, - "loss": 0.21171459555625916, - "mean_token_accuracy": 0.9216019511222839, - "num_tokens": 24413113.0, - "step": 2730 - }, - { - "epoch": 2.075227963525836, - "grad_norm": 2.2025210857391357, - "learning_rate": 1.1924145221027048e-06, - "loss": 0.44211941957473755, - "mean_token_accuracy": 0.8538386821746826, - "num_tokens": 24420504.0, - "step": 2731 - }, - { - "epoch": 2.0759878419452886, - "grad_norm": 1.6706589460372925, - "learning_rate": 1.190629903717097e-06, - "loss": 0.35163265466690063, - "mean_token_accuracy": 0.8716240525245667, - "num_tokens": 24430203.0, - "step": 2732 - }, - { - "epoch": 2.0767477203647418, - "grad_norm": 2.299182176589966, - "learning_rate": 1.1888462042978268e-06, - "loss": 0.30983975529670715, - "mean_token_accuracy": 0.8859797716140747, - "num_tokens": 24437387.0, - "step": 2733 - }, - { - "epoch": 2.0775075987841944, - "grad_norm": 2.975123167037964, - "learning_rate": 1.1870634250967606e-06, - "loss": 0.23585952818393707, - "mean_token_accuracy": 0.9167368412017822, - "num_tokens": 24441176.0, - "step": 2734 - }, - { - "epoch": 2.0782674772036476, - "grad_norm": 1.1052464246749878, - "learning_rate": 1.1852815673651246e-06, - "loss": 0.24136316776275635, - "mean_token_accuracy": 0.8897353410720825, - "num_tokens": 24457092.0, - "step": 2735 - }, - { - "epoch": 2.0790273556231003, - "grad_norm": 1.5531870126724243, - "learning_rate": 1.1835006323534926e-06, - "loss": 0.302223265171051, - "mean_token_accuracy": 0.8940514326095581, - "num_tokens": 24467643.0, - "step": 2736 - }, - { - "epoch": 2.0797872340425534, - "grad_norm": 1.706140398979187, - "learning_rate": 1.1817206213117943e-06, - "loss": 0.39235255122184753, - "mean_token_accuracy": 0.8615218997001648, - "num_tokens": 24477715.0, - "step": 2737 - }, - { - "epoch": 2.080547112462006, - "grad_norm": 2.1109750270843506, - "learning_rate": 1.1799415354893103e-06, - "loss": 0.2526751756668091, - "mean_token_accuracy": 0.9108465909957886, - "num_tokens": 24484248.0, - "step": 2738 - }, - { - "epoch": 2.0813069908814588, - "grad_norm": 1.9943277835845947, - "learning_rate": 1.178163376134671e-06, - "loss": 0.3540172874927521, - "mean_token_accuracy": 0.9131139516830444, - "num_tokens": 24492207.0, - "step": 2739 - }, - { - "epoch": 2.082066869300912, - "grad_norm": 1.9536099433898926, - "learning_rate": 1.1763861444958573e-06, - "loss": 0.3902950584888458, - "mean_token_accuracy": 0.8611530065536499, - "num_tokens": 24501567.0, - "step": 2740 - }, - { - "epoch": 2.0828267477203646, - "grad_norm": 3.146925926208496, - "learning_rate": 1.1746098418201987e-06, - "loss": 0.43440669775009155, - "mean_token_accuracy": 0.8709320425987244, - "num_tokens": 24506684.0, - "step": 2741 - }, - { - "epoch": 2.0835866261398177, - "grad_norm": 2.763427495956421, - "learning_rate": 1.172834469354373e-06, - "loss": 0.3513452410697937, - "mean_token_accuracy": 0.8774256110191345, - "num_tokens": 24511509.0, - "step": 2742 - }, - { - "epoch": 2.0843465045592704, - "grad_norm": 2.773829221725464, - "learning_rate": 1.1710600283444048e-06, - "loss": 0.24668049812316895, - "mean_token_accuracy": 0.9146889448165894, - "num_tokens": 24516030.0, - "step": 2743 - }, - { - "epoch": 2.0851063829787235, - "grad_norm": 1.666471242904663, - "learning_rate": 1.169286520035666e-06, - "loss": 0.36206915974617004, - "mean_token_accuracy": 0.8711973428726196, - "num_tokens": 24526656.0, - "step": 2744 - }, - { - "epoch": 2.085866261398176, - "grad_norm": 2.818890333175659, - "learning_rate": 1.1675139456728702e-06, - "loss": 0.32967281341552734, - "mean_token_accuracy": 0.880983829498291, - "num_tokens": 24531625.0, - "step": 2745 - }, - { - "epoch": 2.0866261398176293, - "grad_norm": 1.09058678150177, - "learning_rate": 1.1657423065000811e-06, - "loss": 0.36224377155303955, - "mean_token_accuracy": 0.8708326816558838, - "num_tokens": 24557123.0, - "step": 2746 - }, - { - "epoch": 2.087386018237082, - "grad_norm": 1.1434987783432007, - "learning_rate": 1.1639716037607036e-06, - "loss": 0.26490458846092224, - "mean_token_accuracy": 0.9131897687911987, - "num_tokens": 24573223.0, - "step": 2747 - }, - { - "epoch": 2.088145896656535, - "grad_norm": 2.437505006790161, - "learning_rate": 1.1622018386974829e-06, - "loss": 0.18964408338069916, - "mean_token_accuracy": 0.9271818399429321, - "num_tokens": 24578306.0, - "step": 2748 - }, - { - "epoch": 2.088905775075988, - "grad_norm": 1.797308325767517, - "learning_rate": 1.160433012552508e-06, - "loss": 0.3090781569480896, - "mean_token_accuracy": 0.8960750102996826, - "num_tokens": 24587562.0, - "step": 2749 - }, - { - "epoch": 2.0896656534954405, - "grad_norm": 2.4050841331481934, - "learning_rate": 1.1586651265672122e-06, - "loss": 0.4001041054725647, - "mean_token_accuracy": 0.8588370084762573, - "num_tokens": 24594223.0, - "step": 2750 - }, - { - "epoch": 2.0904255319148937, - "grad_norm": 1.8757156133651733, - "learning_rate": 1.1568981819823636e-06, - "loss": 0.37845075130462646, - "mean_token_accuracy": 0.866146445274353, - "num_tokens": 24602556.0, - "step": 2751 - }, - { - "epoch": 2.0911854103343464, - "grad_norm": 1.8205114603042603, - "learning_rate": 1.1551321800380722e-06, - "loss": 0.24738016724586487, - "mean_token_accuracy": 0.923284113407135, - "num_tokens": 24611627.0, - "step": 2752 - }, - { - "epoch": 2.0919452887537995, - "grad_norm": 2.107512950897217, - "learning_rate": 1.153367121973786e-06, - "loss": 0.3062688410282135, - "mean_token_accuracy": 0.8909003734588623, - "num_tokens": 24619569.0, - "step": 2753 - }, - { - "epoch": 2.092705167173252, - "grad_norm": 1.93110191822052, - "learning_rate": 1.1516030090282915e-06, - "loss": 0.38658422231674194, - "mean_token_accuracy": 0.869437038898468, - "num_tokens": 24628869.0, - "step": 2754 - }, - { - "epoch": 2.0934650455927053, - "grad_norm": 2.3618004322052, - "learning_rate": 1.1498398424397106e-06, - "loss": 0.19193072617053986, - "mean_token_accuracy": 0.9329519271850586, - "num_tokens": 24633724.0, - "step": 2755 - }, - { - "epoch": 2.094224924012158, - "grad_norm": 2.274510622024536, - "learning_rate": 1.1480776234455024e-06, - "loss": 0.24939998984336853, - "mean_token_accuracy": 0.9104958772659302, - "num_tokens": 24642762.0, - "step": 2756 - }, - { - "epoch": 2.094984802431611, - "grad_norm": 1.7468934059143066, - "learning_rate": 1.1463163532824572e-06, - "loss": 0.3876607418060303, - "mean_token_accuracy": 0.8540539145469666, - "num_tokens": 24652138.0, - "step": 2757 - }, - { - "epoch": 2.095744680851064, - "grad_norm": 2.905381441116333, - "learning_rate": 1.1445560331867054e-06, - "loss": 0.33666878938674927, - "mean_token_accuracy": 0.8805598616600037, - "num_tokens": 24656612.0, - "step": 2758 - }, - { - "epoch": 2.096504559270517, - "grad_norm": 1.5513007640838623, - "learning_rate": 1.142796664393707e-06, - "loss": 0.25168463587760925, - "mean_token_accuracy": 0.925534725189209, - "num_tokens": 24667132.0, - "step": 2759 - }, - { - "epoch": 2.0972644376899696, - "grad_norm": 1.6804249286651611, - "learning_rate": 1.141038248138253e-06, - "loss": 0.3862859010696411, - "mean_token_accuracy": 0.8686253428459167, - "num_tokens": 24679274.0, - "step": 2760 - }, - { - "epoch": 2.0980243161094223, - "grad_norm": 1.7432880401611328, - "learning_rate": 1.1392807856544682e-06, - "loss": 0.3200700879096985, - "mean_token_accuracy": 0.9188123941421509, - "num_tokens": 24688628.0, - "step": 2761 - }, - { - "epoch": 2.0987841945288754, - "grad_norm": 1.8734468221664429, - "learning_rate": 1.1375242781758077e-06, - "loss": 0.34758424758911133, - "mean_token_accuracy": 0.8724187016487122, - "num_tokens": 24698159.0, - "step": 2762 - }, - { - "epoch": 2.099544072948328, - "grad_norm": 3.7156829833984375, - "learning_rate": 1.1357687269350564e-06, - "loss": 0.30014732480049133, - "mean_token_accuracy": 0.9021577835083008, - "num_tokens": 24701797.0, - "step": 2763 - }, - { - "epoch": 2.1003039513677813, - "grad_norm": 1.5196985006332397, - "learning_rate": 1.1340141331643276e-06, - "loss": 0.45747464895248413, - "mean_token_accuracy": 0.839891791343689, - "num_tokens": 24716468.0, - "step": 2764 - }, - { - "epoch": 2.101063829787234, - "grad_norm": 1.978009581565857, - "learning_rate": 1.132260498095062e-06, - "loss": 0.3130183815956116, - "mean_token_accuracy": 0.90610271692276, - "num_tokens": 24723211.0, - "step": 2765 - }, - { - "epoch": 2.101823708206687, - "grad_norm": 1.5883251428604126, - "learning_rate": 1.1305078229580294e-06, - "loss": 0.30493029952049255, - "mean_token_accuracy": 0.8889745473861694, - "num_tokens": 24733839.0, - "step": 2766 - }, - { - "epoch": 2.1025835866261398, - "grad_norm": 1.2397783994674683, - "learning_rate": 1.128756108983325e-06, - "loss": 0.2606407105922699, - "mean_token_accuracy": 0.9061247110366821, - "num_tokens": 24747488.0, - "step": 2767 - }, - { - "epoch": 2.103343465045593, - "grad_norm": 1.3046784400939941, - "learning_rate": 1.1270053574003658e-06, - "loss": 0.38750404119491577, - "mean_token_accuracy": 0.8777017593383789, - "num_tokens": 24763893.0, - "step": 2768 - }, - { - "epoch": 2.1041033434650456, - "grad_norm": 1.499266266822815, - "learning_rate": 1.1252555694379005e-06, - "loss": 0.4804937243461609, - "mean_token_accuracy": 0.8344086408615112, - "num_tokens": 24779323.0, - "step": 2769 - }, - { - "epoch": 2.1048632218844983, - "grad_norm": 1.211094856262207, - "learning_rate": 1.123506746323997e-06, - "loss": 0.3579246997833252, - "mean_token_accuracy": 0.8705919981002808, - "num_tokens": 24794965.0, - "step": 2770 - }, - { - "epoch": 2.1056231003039514, - "grad_norm": 2.490551471710205, - "learning_rate": 1.1217588892860446e-06, - "loss": 0.4084790349006653, - "mean_token_accuracy": 0.8553222417831421, - "num_tokens": 24800614.0, - "step": 2771 - }, - { - "epoch": 2.106382978723404, - "grad_norm": 1.5249632596969604, - "learning_rate": 1.1200119995507572e-06, - "loss": 0.36853182315826416, - "mean_token_accuracy": 0.8847414255142212, - "num_tokens": 24812886.0, - "step": 2772 - }, - { - "epoch": 2.107142857142857, - "grad_norm": 1.8510968685150146, - "learning_rate": 1.1182660783441719e-06, - "loss": 0.2918103337287903, - "mean_token_accuracy": 0.8898224830627441, - "num_tokens": 24821545.0, - "step": 2773 - }, - { - "epoch": 2.10790273556231, - "grad_norm": 1.7721803188323975, - "learning_rate": 1.11652112689164e-06, - "loss": 0.2920452654361725, - "mean_token_accuracy": 0.8879085779190063, - "num_tokens": 24831526.0, - "step": 2774 - }, - { - "epoch": 2.108662613981763, - "grad_norm": 1.3987336158752441, - "learning_rate": 1.1147771464178378e-06, - "loss": 0.4407062828540802, - "mean_token_accuracy": 0.8472493886947632, - "num_tokens": 24845847.0, - "step": 2775 - }, - { - "epoch": 2.1094224924012157, - "grad_norm": 1.8927375078201294, - "learning_rate": 1.1130341381467569e-06, - "loss": 0.36293038725852966, - "mean_token_accuracy": 0.8881135582923889, - "num_tokens": 24854760.0, - "step": 2776 - }, - { - "epoch": 2.110182370820669, - "grad_norm": 3.0480666160583496, - "learning_rate": 1.111292103301708e-06, - "loss": 0.30395108461380005, - "mean_token_accuracy": 0.9036306142807007, - "num_tokens": 24859051.0, - "step": 2777 - }, - { - "epoch": 2.1109422492401215, - "grad_norm": 1.5833618640899658, - "learning_rate": 1.1095510431053176e-06, - "loss": 0.26424330472946167, - "mean_token_accuracy": 0.9020674824714661, - "num_tokens": 24869853.0, - "step": 2778 - }, - { - "epoch": 2.1117021276595747, - "grad_norm": 1.645459532737732, - "learning_rate": 1.1078109587795311e-06, - "loss": 0.3563994765281677, - "mean_token_accuracy": 0.8732106685638428, - "num_tokens": 24880184.0, - "step": 2779 - }, - { - "epoch": 2.1124620060790273, - "grad_norm": 2.2964093685150146, - "learning_rate": 1.1060718515456022e-06, - "loss": 0.19739922881126404, - "mean_token_accuracy": 0.9273765087127686, - "num_tokens": 24885398.0, - "step": 2780 - }, - { - "epoch": 2.11322188449848, - "grad_norm": 2.094024181365967, - "learning_rate": 1.1043337226241075e-06, - "loss": 0.3321923315525055, - "mean_token_accuracy": 0.8865819573402405, - "num_tokens": 24893908.0, - "step": 2781 - }, - { - "epoch": 2.113981762917933, - "grad_norm": 1.9787025451660156, - "learning_rate": 1.1025965732349318e-06, - "loss": 0.37631168961524963, - "mean_token_accuracy": 0.8808693885803223, - "num_tokens": 24901270.0, - "step": 2782 - }, - { - "epoch": 2.114741641337386, - "grad_norm": 2.376060724258423, - "learning_rate": 1.100860404597271e-06, - "loss": 0.2591894268989563, - "mean_token_accuracy": 0.9174780249595642, - "num_tokens": 24906578.0, - "step": 2783 - }, - { - "epoch": 2.115501519756839, - "grad_norm": 1.0967903137207031, - "learning_rate": 1.0991252179296389e-06, - "loss": 0.26626938581466675, - "mean_token_accuracy": 0.9305505752563477, - "num_tokens": 24922329.0, - "step": 2784 - }, - { - "epoch": 2.1162613981762917, - "grad_norm": 3.3701183795928955, - "learning_rate": 1.0973910144498534e-06, - "loss": 0.2710079848766327, - "mean_token_accuracy": 0.9095271825790405, - "num_tokens": 24925777.0, - "step": 2785 - }, - { - "epoch": 2.117021276595745, - "grad_norm": 1.636264681816101, - "learning_rate": 1.0956577953750461e-06, - "loss": 0.2995981276035309, - "mean_token_accuracy": 0.8988568782806396, - "num_tokens": 24934230.0, - "step": 2786 - }, - { - "epoch": 2.1177811550151975, - "grad_norm": 2.3107731342315674, - "learning_rate": 1.093925561921657e-06, - "loss": 0.3424459397792816, - "mean_token_accuracy": 0.9100210070610046, - "num_tokens": 24939830.0, - "step": 2787 - }, - { - "epoch": 2.1185410334346506, - "grad_norm": 1.814764380455017, - "learning_rate": 1.0921943153054343e-06, - "loss": 0.3182154893875122, - "mean_token_accuracy": 0.883027195930481, - "num_tokens": 24947764.0, - "step": 2788 - }, - { - "epoch": 2.1193009118541033, - "grad_norm": 1.693555235862732, - "learning_rate": 1.0904640567414332e-06, - "loss": 0.3685447573661804, - "mean_token_accuracy": 0.8900846242904663, - "num_tokens": 24957680.0, - "step": 2789 - }, - { - "epoch": 2.1200607902735564, - "grad_norm": 1.0726022720336914, - "learning_rate": 1.088734787444017e-06, - "loss": 0.28461548686027527, - "mean_token_accuracy": 0.9026681184768677, - "num_tokens": 24975181.0, - "step": 2790 - }, - { - "epoch": 2.120820668693009, - "grad_norm": 1.3013874292373657, - "learning_rate": 1.0870065086268506e-06, - "loss": 0.28222548961639404, - "mean_token_accuracy": 0.9041857719421387, - "num_tokens": 24993211.0, - "step": 2791 - }, - { - "epoch": 2.121580547112462, - "grad_norm": 2.592106580734253, - "learning_rate": 1.085279221502909e-06, - "loss": 0.31733593344688416, - "mean_token_accuracy": 0.90151047706604, - "num_tokens": 24998151.0, - "step": 2792 - }, - { - "epoch": 2.122340425531915, - "grad_norm": 2.649210214614868, - "learning_rate": 1.0835529272844694e-06, - "loss": 0.341595321893692, - "mean_token_accuracy": 0.8989696502685547, - "num_tokens": 25003399.0, - "step": 2793 - }, - { - "epoch": 2.1231003039513676, - "grad_norm": 2.376619577407837, - "learning_rate": 1.0818276271831094e-06, - "loss": 0.2770065665245056, - "mean_token_accuracy": 0.8967875242233276, - "num_tokens": 25009686.0, - "step": 2794 - }, - { - "epoch": 2.1238601823708207, - "grad_norm": 2.1539604663848877, - "learning_rate": 1.080103322409711e-06, - "loss": 0.37501147389411926, - "mean_token_accuracy": 0.8768513202667236, - "num_tokens": 25016339.0, - "step": 2795 - }, - { - "epoch": 2.1246200607902734, - "grad_norm": 2.5727670192718506, - "learning_rate": 1.0783800141744607e-06, - "loss": 0.31852903962135315, - "mean_token_accuracy": 0.8897477388381958, - "num_tokens": 25021410.0, - "step": 2796 - }, - { - "epoch": 2.1253799392097266, - "grad_norm": 2.1428916454315186, - "learning_rate": 1.0766577036868395e-06, - "loss": 0.2348000407218933, - "mean_token_accuracy": 0.9012142419815063, - "num_tokens": 25027375.0, - "step": 2797 - }, - { - "epoch": 2.1261398176291793, - "grad_norm": 2.4231064319610596, - "learning_rate": 1.074936392155631e-06, - "loss": 0.30580806732177734, - "mean_token_accuracy": 0.8963108658790588, - "num_tokens": 25033211.0, - "step": 2798 - }, - { - "epoch": 2.1268996960486324, - "grad_norm": 2.1027259826660156, - "learning_rate": 1.073216080788921e-06, - "loss": 0.2508814334869385, - "mean_token_accuracy": 0.9095165729522705, - "num_tokens": 25040316.0, - "step": 2799 - }, - { - "epoch": 2.127659574468085, - "grad_norm": 1.6513079404830933, - "learning_rate": 1.0714967707940876e-06, - "loss": 0.40694183111190796, - "mean_token_accuracy": 0.8895826935768127, - "num_tokens": 25054978.0, - "step": 2800 - }, - { - "epoch": 2.128419452887538, - "grad_norm": 2.0551133155822754, - "learning_rate": 1.0697784633778093e-06, - "loss": 0.3452662229537964, - "mean_token_accuracy": 0.8710684776306152, - "num_tokens": 25062755.0, - "step": 2801 - }, - { - "epoch": 2.129179331306991, - "grad_norm": 2.1780688762664795, - "learning_rate": 1.0680611597460607e-06, - "loss": 0.2918209135532379, - "mean_token_accuracy": 0.8689337968826294, - "num_tokens": 25069453.0, - "step": 2802 - }, - { - "epoch": 2.1299392097264436, - "grad_norm": 1.7905635833740234, - "learning_rate": 1.0663448611041114e-06, - "loss": 0.3535313308238983, - "mean_token_accuracy": 0.8762770295143127, - "num_tokens": 25080004.0, - "step": 2803 - }, - { - "epoch": 2.1306990881458967, - "grad_norm": 1.6187241077423096, - "learning_rate": 1.0646295686565258e-06, - "loss": 0.3042716681957245, - "mean_token_accuracy": 0.884156346321106, - "num_tokens": 25089652.0, - "step": 2804 - }, - { - "epoch": 2.1314589665653494, - "grad_norm": 2.667459011077881, - "learning_rate": 1.0629152836071633e-06, - "loss": 0.3904019892215729, - "mean_token_accuracy": 0.8603606224060059, - "num_tokens": 25095556.0, - "step": 2805 - }, - { - "epoch": 2.1322188449848025, - "grad_norm": 1.4227970838546753, - "learning_rate": 1.0612020071591722e-06, - "loss": 0.3765299320220947, - "mean_token_accuracy": 0.8655093908309937, - "num_tokens": 25108963.0, - "step": 2806 - }, - { - "epoch": 2.132978723404255, - "grad_norm": 2.262726068496704, - "learning_rate": 1.0594897405149994e-06, - "loss": 0.2727298140525818, - "mean_token_accuracy": 0.9005513191223145, - "num_tokens": 25115135.0, - "step": 2807 - }, - { - "epoch": 2.1337386018237083, - "grad_norm": 2.0810186862945557, - "learning_rate": 1.0577784848763773e-06, - "loss": 0.4001343250274658, - "mean_token_accuracy": 0.8537896871566772, - "num_tokens": 25123079.0, - "step": 2808 - }, - { - "epoch": 2.134498480243161, - "grad_norm": 1.6573376655578613, - "learning_rate": 1.0560682414443315e-06, - "loss": 0.4197486340999603, - "mean_token_accuracy": 0.8549862504005432, - "num_tokens": 25135398.0, - "step": 2809 - }, - { - "epoch": 2.135258358662614, - "grad_norm": 2.200150489807129, - "learning_rate": 1.0543590114191768e-06, - "loss": 0.32026296854019165, - "mean_token_accuracy": 0.8797904253005981, - "num_tokens": 25141382.0, - "step": 2810 - }, - { - "epoch": 2.136018237082067, - "grad_norm": 2.678558111190796, - "learning_rate": 1.0526507960005164e-06, - "loss": 0.30048054456710815, - "mean_token_accuracy": 0.8849201202392578, - "num_tokens": 25146235.0, - "step": 2811 - }, - { - "epoch": 2.13677811550152, - "grad_norm": 1.5207500457763672, - "learning_rate": 1.0509435963872422e-06, - "loss": 0.3706427216529846, - "mean_token_accuracy": 0.8740214109420776, - "num_tokens": 25157108.0, - "step": 2812 - }, - { - "epoch": 2.1375379939209727, - "grad_norm": 1.4632720947265625, - "learning_rate": 1.049237413777532e-06, - "loss": 0.27156776189804077, - "mean_token_accuracy": 0.8950715661048889, - "num_tokens": 25167937.0, - "step": 2813 - }, - { - "epoch": 2.1382978723404253, - "grad_norm": 2.101048469543457, - "learning_rate": 1.0475322493688506e-06, - "loss": 0.366736501455307, - "mean_token_accuracy": 0.8700850009918213, - "num_tokens": 25177043.0, - "step": 2814 - }, - { - "epoch": 2.1390577507598785, - "grad_norm": 2.54221248626709, - "learning_rate": 1.0458281043579482e-06, - "loss": 0.20383943617343903, - "mean_token_accuracy": 0.9226665496826172, - "num_tokens": 25182105.0, - "step": 2815 - }, - { - "epoch": 2.139817629179331, - "grad_norm": 1.7742674350738525, - "learning_rate": 1.04412497994086e-06, - "loss": 0.26852455735206604, - "mean_token_accuracy": 0.8987031579017639, - "num_tokens": 25190178.0, - "step": 2816 - }, - { - "epoch": 2.1405775075987843, - "grad_norm": 3.2856075763702393, - "learning_rate": 1.0424228773129019e-06, - "loss": 0.24643859267234802, - "mean_token_accuracy": 0.9189155101776123, - "num_tokens": 25194105.0, - "step": 2817 - }, - { - "epoch": 2.141337386018237, - "grad_norm": 3.374311923980713, - "learning_rate": 1.0407217976686777e-06, - "loss": 0.2575511336326599, - "mean_token_accuracy": 0.9143530130386353, - "num_tokens": 25197787.0, - "step": 2818 - }, - { - "epoch": 2.14209726443769, - "grad_norm": 1.4967217445373535, - "learning_rate": 1.03902174220207e-06, - "loss": 0.3054750859737396, - "mean_token_accuracy": 0.8989205360412598, - "num_tokens": 25209150.0, - "step": 2819 - }, - { - "epoch": 2.142857142857143, - "grad_norm": 2.654459238052368, - "learning_rate": 1.0373227121062423e-06, - "loss": 0.27398061752319336, - "mean_token_accuracy": 0.9181102514266968, - "num_tokens": 25214015.0, - "step": 2820 - }, - { - "epoch": 2.143617021276596, - "grad_norm": 1.3205828666687012, - "learning_rate": 1.0356247085736388e-06, - "loss": 0.4085468053817749, - "mean_token_accuracy": 0.8745299577713013, - "num_tokens": 25230588.0, - "step": 2821 - }, - { - "epoch": 2.1443768996960486, - "grad_norm": 1.6965736150741577, - "learning_rate": 1.0339277327959863e-06, - "loss": 0.27269643545150757, - "mean_token_accuracy": 0.9001271724700928, - "num_tokens": 25239298.0, - "step": 2822 - }, - { - "epoch": 2.1451367781155017, - "grad_norm": 2.789114236831665, - "learning_rate": 1.0322317859642852e-06, - "loss": 0.2319176197052002, - "mean_token_accuracy": 0.9237110614776611, - "num_tokens": 25243286.0, - "step": 2823 - }, - { - "epoch": 2.1458966565349544, - "grad_norm": 1.8817718029022217, - "learning_rate": 1.0305368692688175e-06, - "loss": 0.2917990982532501, - "mean_token_accuracy": 0.9211062788963318, - "num_tokens": 25250575.0, - "step": 2824 - }, - { - "epoch": 2.146656534954407, - "grad_norm": 2.1824984550476074, - "learning_rate": 1.0288429838991405e-06, - "loss": 0.39010798931121826, - "mean_token_accuracy": 0.8887852430343628, - "num_tokens": 25257947.0, - "step": 2825 - }, - { - "epoch": 2.1474164133738602, - "grad_norm": 1.302579641342163, - "learning_rate": 1.0271501310440882e-06, - "loss": 0.3511282503604889, - "mean_token_accuracy": 0.8728797435760498, - "num_tokens": 25272846.0, - "step": 2826 - }, - { - "epoch": 2.148176291793313, - "grad_norm": 1.691807746887207, - "learning_rate": 1.0254583118917699e-06, - "loss": 0.34246695041656494, - "mean_token_accuracy": 0.8743435144424438, - "num_tokens": 25283004.0, - "step": 2827 - }, - { - "epoch": 2.148936170212766, - "grad_norm": 1.2483569383621216, - "learning_rate": 1.0237675276295709e-06, - "loss": 0.3346659243106842, - "mean_token_accuracy": 0.8823951482772827, - "num_tokens": 25297786.0, - "step": 2828 - }, - { - "epoch": 2.1496960486322187, - "grad_norm": 3.7242841720581055, - "learning_rate": 1.022077779444145e-06, - "loss": 0.25516486167907715, - "mean_token_accuracy": 0.9189130663871765, - "num_tokens": 25301524.0, - "step": 2829 - }, - { - "epoch": 2.150455927051672, - "grad_norm": 2.5851144790649414, - "learning_rate": 1.020389068521426e-06, - "loss": 0.3543069362640381, - "mean_token_accuracy": 0.8942399621009827, - "num_tokens": 25307277.0, - "step": 2830 - }, - { - "epoch": 2.1512158054711246, - "grad_norm": 1.3453631401062012, - "learning_rate": 1.018701396046616e-06, - "loss": 0.2900702953338623, - "mean_token_accuracy": 0.8847548365592957, - "num_tokens": 25321366.0, - "step": 2831 - }, - { - "epoch": 2.1519756838905777, - "grad_norm": 1.6905686855316162, - "learning_rate": 1.0170147632041858e-06, - "loss": 0.24844832718372345, - "mean_token_accuracy": 0.9167388677597046, - "num_tokens": 25328916.0, - "step": 2832 - }, - { - "epoch": 2.1527355623100304, - "grad_norm": 2.6469411849975586, - "learning_rate": 1.0153291711778825e-06, - "loss": 0.18566903471946716, - "mean_token_accuracy": 0.9346771836280823, - "num_tokens": 25332871.0, - "step": 2833 - }, - { - "epoch": 2.1534954407294835, - "grad_norm": 1.3880906105041504, - "learning_rate": 1.0136446211507175e-06, - "loss": 0.37413570284843445, - "mean_token_accuracy": 0.8685535788536072, - "num_tokens": 25347447.0, - "step": 2834 - }, - { - "epoch": 2.154255319148936, - "grad_norm": 1.1376656293869019, - "learning_rate": 1.0119611143049731e-06, - "loss": 0.2844143509864807, - "mean_token_accuracy": 0.8910006284713745, - "num_tokens": 25365930.0, - "step": 2835 - }, - { - "epoch": 2.155015197568389, - "grad_norm": 2.259666919708252, - "learning_rate": 1.0102786518221997e-06, - "loss": 0.3148176074028015, - "mean_token_accuracy": 0.8851165175437927, - "num_tokens": 25373047.0, - "step": 2836 - }, - { - "epoch": 2.155775075987842, - "grad_norm": 3.304095506668091, - "learning_rate": 1.0085972348832138e-06, - "loss": 0.2042517364025116, - "mean_token_accuracy": 0.9247308969497681, - "num_tokens": 25376348.0, - "step": 2837 - }, - { - "epoch": 2.1565349544072947, - "grad_norm": 1.9856120347976685, - "learning_rate": 1.0069168646680985e-06, - "loss": 0.3547414541244507, - "mean_token_accuracy": 0.8941285610198975, - "num_tokens": 25384675.0, - "step": 2838 - }, - { - "epoch": 2.157294832826748, - "grad_norm": 2.8482213020324707, - "learning_rate": 1.0052375423562038e-06, - "loss": 0.3530133366584778, - "mean_token_accuracy": 0.8789700269699097, - "num_tokens": 25389631.0, - "step": 2839 - }, - { - "epoch": 2.1580547112462005, - "grad_norm": 1.4270408153533936, - "learning_rate": 1.0035592691261395e-06, - "loss": 0.34078776836395264, - "mean_token_accuracy": 0.8648165464401245, - "num_tokens": 25403746.0, - "step": 2840 - }, - { - "epoch": 2.1588145896656536, - "grad_norm": 0.9342723488807678, - "learning_rate": 1.0018820461557852e-06, - "loss": 0.2615935504436493, - "mean_token_accuracy": 0.9082236289978027, - "num_tokens": 25424695.0, - "step": 2841 - }, - { - "epoch": 2.1595744680851063, - "grad_norm": 2.695632219314575, - "learning_rate": 1.0002058746222807e-06, - "loss": 0.2202145904302597, - "mean_token_accuracy": 0.9221563339233398, - "num_tokens": 25428783.0, - "step": 2842 - }, - { - "epoch": 2.1603343465045595, - "grad_norm": 1.5679794549942017, - "learning_rate": 9.985307557020257e-07, - "loss": 0.24275024235248566, - "mean_token_accuracy": 0.9363338351249695, - "num_tokens": 25439104.0, - "step": 2843 - }, - { - "epoch": 2.161094224924012, - "grad_norm": 1.5985528230667114, - "learning_rate": 9.968566905706833e-07, - "loss": 0.2541901171207428, - "mean_token_accuracy": 0.9040743112564087, - "num_tokens": 25448829.0, - "step": 2844 - }, - { - "epoch": 2.161854103343465, - "grad_norm": 2.6022164821624756, - "learning_rate": 9.951836804031795e-07, - "loss": 0.24492180347442627, - "mean_token_accuracy": 0.9109418392181396, - "num_tokens": 25453902.0, - "step": 2845 - }, - { - "epoch": 2.162613981762918, - "grad_norm": 1.6719969511032104, - "learning_rate": 9.935117263736943e-07, - "loss": 0.43255117535591125, - "mean_token_accuracy": 0.868374228477478, - "num_tokens": 25465538.0, - "step": 2846 - }, - { - "epoch": 2.1633738601823707, - "grad_norm": 1.8284894227981567, - "learning_rate": 9.918408296556706e-07, - "loss": 0.32285982370376587, - "mean_token_accuracy": 0.9016412496566772, - "num_tokens": 25473721.0, - "step": 2847 - }, - { - "epoch": 2.164133738601824, - "grad_norm": 1.4488024711608887, - "learning_rate": 9.90170991421808e-07, - "loss": 0.35639309883117676, - "mean_token_accuracy": 0.8861881494522095, - "num_tokens": 25487535.0, - "step": 2848 - }, - { - "epoch": 2.1648936170212765, - "grad_norm": 2.089930534362793, - "learning_rate": 9.88502212844063e-07, - "loss": 0.2588546574115753, - "mean_token_accuracy": 0.9029642939567566, - "num_tokens": 25494567.0, - "step": 2849 - }, - { - "epoch": 2.1656534954407296, - "grad_norm": 1.1274315118789673, - "learning_rate": 9.86834495093649e-07, - "loss": 0.37268880009651184, - "mean_token_accuracy": 0.859347939491272, - "num_tokens": 25518278.0, - "step": 2850 - }, - { - "epoch": 2.1664133738601823, - "grad_norm": 2.3886640071868896, - "learning_rate": 9.851678393410343e-07, - "loss": 0.34938913583755493, - "mean_token_accuracy": 0.8724287748336792, - "num_tokens": 25524001.0, - "step": 2851 - }, - { - "epoch": 2.1671732522796354, - "grad_norm": 2.521230459213257, - "learning_rate": 9.83502246755942e-07, - "loss": 0.34781408309936523, - "mean_token_accuracy": 0.8970093131065369, - "num_tokens": 25529982.0, - "step": 2852 - }, - { - "epoch": 2.167933130699088, - "grad_norm": 2.467618942260742, - "learning_rate": 9.818377185073493e-07, - "loss": 0.29725387692451477, - "mean_token_accuracy": 0.8991899490356445, - "num_tokens": 25535356.0, - "step": 2853 - }, - { - "epoch": 2.1686930091185412, - "grad_norm": 2.335873603820801, - "learning_rate": 9.801742557634872e-07, - "loss": 0.39603036642074585, - "mean_token_accuracy": 0.8755916357040405, - "num_tokens": 25542526.0, - "step": 2854 - }, - { - "epoch": 2.169452887537994, - "grad_norm": 1.8388596773147583, - "learning_rate": 9.78511859691835e-07, - "loss": 0.3414672017097473, - "mean_token_accuracy": 0.8951467275619507, - "num_tokens": 25551904.0, - "step": 2855 - }, - { - "epoch": 2.1702127659574466, - "grad_norm": 1.86272394657135, - "learning_rate": 9.768505314591295e-07, - "loss": 0.45748448371887207, - "mean_token_accuracy": 0.8614354133605957, - "num_tokens": 25562197.0, - "step": 2856 - }, - { - "epoch": 2.1709726443768997, - "grad_norm": 1.9142264127731323, - "learning_rate": 9.751902722313527e-07, - "loss": 0.20877259969711304, - "mean_token_accuracy": 0.9316688179969788, - "num_tokens": 25569403.0, - "step": 2857 - }, - { - "epoch": 2.1717325227963524, - "grad_norm": 2.1138272285461426, - "learning_rate": 9.73531083173739e-07, - "loss": 0.37058722972869873, - "mean_token_accuracy": 0.8654135465621948, - "num_tokens": 25577200.0, - "step": 2858 - }, - { - "epoch": 2.1724924012158056, - "grad_norm": 1.973467469215393, - "learning_rate": 9.718729654507713e-07, - "loss": 0.4106993079185486, - "mean_token_accuracy": 0.8958662152290344, - "num_tokens": 25585694.0, - "step": 2859 - }, - { - "epoch": 2.1732522796352582, - "grad_norm": 1.957513451576233, - "learning_rate": 9.702159202261802e-07, - "loss": 0.2067333608865738, - "mean_token_accuracy": 0.9413473606109619, - "num_tokens": 25591604.0, - "step": 2860 - }, - { - "epoch": 2.1740121580547114, - "grad_norm": 2.7639806270599365, - "learning_rate": 9.685599486629444e-07, - "loss": 0.3446827232837677, - "mean_token_accuracy": 0.8837845325469971, - "num_tokens": 25596528.0, - "step": 2861 - }, - { - "epoch": 2.174772036474164, - "grad_norm": 2.483734607696533, - "learning_rate": 9.669050519232875e-07, - "loss": 0.21230249106884003, - "mean_token_accuracy": 0.9334918856620789, - "num_tokens": 25601182.0, - "step": 2862 - }, - { - "epoch": 2.175531914893617, - "grad_norm": 1.7194870710372925, - "learning_rate": 9.65251231168681e-07, - "loss": 0.2657586932182312, - "mean_token_accuracy": 0.9035707712173462, - "num_tokens": 25610561.0, - "step": 2863 - }, - { - "epoch": 2.17629179331307, - "grad_norm": 2.6709611415863037, - "learning_rate": 9.63598487559839e-07, - "loss": 0.3673030138015747, - "mean_token_accuracy": 0.8976202011108398, - "num_tokens": 25615822.0, - "step": 2864 - }, - { - "epoch": 2.1770516717325226, - "grad_norm": 1.6646889448165894, - "learning_rate": 9.619468222567216e-07, - "loss": 0.2796666622161865, - "mean_token_accuracy": 0.8698215484619141, - "num_tokens": 25626148.0, - "step": 2865 - }, - { - "epoch": 2.1778115501519757, - "grad_norm": 1.8341799974441528, - "learning_rate": 9.602962364185286e-07, - "loss": 0.44835132360458374, - "mean_token_accuracy": 0.84391850233078, - "num_tokens": 25636305.0, - "step": 2866 - }, - { - "epoch": 2.1785714285714284, - "grad_norm": 2.3579823970794678, - "learning_rate": 9.586467312037076e-07, - "loss": 0.2875673472881317, - "mean_token_accuracy": 0.889403223991394, - "num_tokens": 25642593.0, - "step": 2867 - }, - { - "epoch": 2.1793313069908815, - "grad_norm": 1.1284339427947998, - "learning_rate": 9.569983077699447e-07, - "loss": 0.3402171730995178, - "mean_token_accuracy": 0.8795222043991089, - "num_tokens": 25663734.0, - "step": 2868 - }, - { - "epoch": 2.180091185410334, - "grad_norm": 1.4705578088760376, - "learning_rate": 9.553509672741646e-07, - "loss": 0.4216107726097107, - "mean_token_accuracy": 0.845354437828064, - "num_tokens": 25678197.0, - "step": 2869 - }, - { - "epoch": 2.1808510638297873, - "grad_norm": 2.6181085109710693, - "learning_rate": 9.53704710872535e-07, - "loss": 0.2777765393257141, - "mean_token_accuracy": 0.8884872198104858, - "num_tokens": 25683808.0, - "step": 2870 - }, - { - "epoch": 2.18161094224924, - "grad_norm": 2.7285003662109375, - "learning_rate": 9.520595397204643e-07, - "loss": 0.33339786529541016, - "mean_token_accuracy": 0.8892828226089478, - "num_tokens": 25690125.0, - "step": 2871 - }, - { - "epoch": 2.182370820668693, - "grad_norm": 2.200571298599243, - "learning_rate": 9.504154549725944e-07, - "loss": 0.46546393632888794, - "mean_token_accuracy": 0.8389996290206909, - "num_tokens": 25697279.0, - "step": 2872 - }, - { - "epoch": 2.183130699088146, - "grad_norm": 3.491392135620117, - "learning_rate": 9.487724577828081e-07, - "loss": 0.17026299238204956, - "mean_token_accuracy": 0.9410334825515747, - "num_tokens": 25700263.0, - "step": 2873 - }, - { - "epoch": 2.183890577507599, - "grad_norm": 2.7800233364105225, - "learning_rate": 9.471305493042243e-07, - "loss": 0.2309894859790802, - "mean_token_accuracy": 0.9233936071395874, - "num_tokens": 25704486.0, - "step": 2874 - }, - { - "epoch": 2.1846504559270516, - "grad_norm": 2.6505582332611084, - "learning_rate": 9.454897306891972e-07, - "loss": 0.4378674328327179, - "mean_token_accuracy": 0.8846660852432251, - "num_tokens": 25710115.0, - "step": 2875 - }, - { - "epoch": 2.1854103343465043, - "grad_norm": 1.5393849611282349, - "learning_rate": 9.438500030893166e-07, - "loss": 0.42081019282341003, - "mean_token_accuracy": 0.8672939538955688, - "num_tokens": 25724598.0, - "step": 2876 - }, - { - "epoch": 2.1861702127659575, - "grad_norm": 1.911198377609253, - "learning_rate": 9.422113676554073e-07, - "loss": 0.19115394353866577, - "mean_token_accuracy": 0.9201297163963318, - "num_tokens": 25731040.0, - "step": 2877 - }, - { - "epoch": 2.18693009118541, - "grad_norm": 1.371443748474121, - "learning_rate": 9.405738255375243e-07, - "loss": 0.3639947772026062, - "mean_token_accuracy": 0.8653393983840942, - "num_tokens": 25745335.0, - "step": 2878 - }, - { - "epoch": 2.1876899696048633, - "grad_norm": 3.216238498687744, - "learning_rate": 9.389373778849612e-07, - "loss": 0.2623414397239685, - "mean_token_accuracy": 0.9046015739440918, - "num_tokens": 25749223.0, - "step": 2879 - }, - { - "epoch": 2.188449848024316, - "grad_norm": 2.7558846473693848, - "learning_rate": 9.37302025846237e-07, - "loss": 0.31921297311782837, - "mean_token_accuracy": 0.8903186321258545, - "num_tokens": 25754341.0, - "step": 2880 - }, - { - "epoch": 2.189209726443769, - "grad_norm": 2.06365704536438, - "learning_rate": 9.356677705691058e-07, - "loss": 0.357482373714447, - "mean_token_accuracy": 0.8661626577377319, - "num_tokens": 25761199.0, - "step": 2881 - }, - { - "epoch": 2.189969604863222, - "grad_norm": 3.240328550338745, - "learning_rate": 9.340346132005507e-07, - "loss": 0.3157888650894165, - "mean_token_accuracy": 0.8948285579681396, - "num_tokens": 25765099.0, - "step": 2882 - }, - { - "epoch": 2.190729483282675, - "grad_norm": 1.4671967029571533, - "learning_rate": 9.324025548867849e-07, - "loss": 0.32077109813690186, - "mean_token_accuracy": 0.8813248872756958, - "num_tokens": 25777636.0, - "step": 2883 - }, - { - "epoch": 2.1914893617021276, - "grad_norm": 2.6475353240966797, - "learning_rate": 9.307715967732492e-07, - "loss": 0.35567623376846313, - "mean_token_accuracy": 0.8738130331039429, - "num_tokens": 25783737.0, - "step": 2884 - }, - { - "epoch": 2.1922492401215807, - "grad_norm": 1.791491150856018, - "learning_rate": 9.29141740004613e-07, - "loss": 0.2556282877922058, - "mean_token_accuracy": 0.9223519563674927, - "num_tokens": 25792069.0, - "step": 2885 - }, - { - "epoch": 2.1930091185410334, - "grad_norm": 2.3944389820098877, - "learning_rate": 9.275129857247722e-07, - "loss": 0.3145869970321655, - "mean_token_accuracy": 0.8938079476356506, - "num_tokens": 25798400.0, - "step": 2886 - }, - { - "epoch": 2.193768996960486, - "grad_norm": 2.0802059173583984, - "learning_rate": 9.258853350768499e-07, - "loss": 0.37343069911003113, - "mean_token_accuracy": 0.8705670833587646, - "num_tokens": 25806567.0, - "step": 2887 - }, - { - "epoch": 2.1945288753799392, - "grad_norm": 2.10831880569458, - "learning_rate": 9.242587892031945e-07, - "loss": 0.1989251971244812, - "mean_token_accuracy": 0.931064248085022, - "num_tokens": 25812715.0, - "step": 2888 - }, - { - "epoch": 2.195288753799392, - "grad_norm": 2.1305530071258545, - "learning_rate": 9.226333492453759e-07, - "loss": 0.29377204179763794, - "mean_token_accuracy": 0.8942701816558838, - "num_tokens": 25819988.0, - "step": 2889 - }, - { - "epoch": 2.196048632218845, - "grad_norm": 2.179025411605835, - "learning_rate": 9.210090163441928e-07, - "loss": 0.37565115094184875, - "mean_token_accuracy": 0.8700202703475952, - "num_tokens": 25827777.0, - "step": 2890 - }, - { - "epoch": 2.1968085106382977, - "grad_norm": 3.177180290222168, - "learning_rate": 9.19385791639665e-07, - "loss": 0.16646479070186615, - "mean_token_accuracy": 0.9426749348640442, - "num_tokens": 25831724.0, - "step": 2891 - }, - { - "epoch": 2.197568389057751, - "grad_norm": 1.103196620941162, - "learning_rate": 9.177636762710321e-07, - "loss": 0.29140013456344604, - "mean_token_accuracy": 0.8789779543876648, - "num_tokens": 25854707.0, - "step": 2892 - }, - { - "epoch": 2.1983282674772036, - "grad_norm": 1.597692847251892, - "learning_rate": 9.161426713767574e-07, - "loss": 0.37799614667892456, - "mean_token_accuracy": 0.8623079061508179, - "num_tokens": 25868429.0, - "step": 2893 - }, - { - "epoch": 2.1990881458966567, - "grad_norm": 2.227132558822632, - "learning_rate": 9.145227780945265e-07, - "loss": 0.2683261036872864, - "mean_token_accuracy": 0.9092563390731812, - "num_tokens": 25875367.0, - "step": 2894 - }, - { - "epoch": 2.1998480243161094, - "grad_norm": 3.1229634284973145, - "learning_rate": 9.129039975612408e-07, - "loss": 0.21859994530677795, - "mean_token_accuracy": 0.9187530875205994, - "num_tokens": 25879456.0, - "step": 2895 - }, - { - "epoch": 2.2006079027355625, - "grad_norm": 2.3224828243255615, - "learning_rate": 9.112863309130235e-07, - "loss": 0.3557605743408203, - "mean_token_accuracy": 0.8735873103141785, - "num_tokens": 25886477.0, - "step": 2896 - }, - { - "epoch": 2.201367781155015, - "grad_norm": 1.7784863710403442, - "learning_rate": 9.096697792852155e-07, - "loss": 0.334577351808548, - "mean_token_accuracy": 0.8948780298233032, - "num_tokens": 25894977.0, - "step": 2897 - }, - { - "epoch": 2.202127659574468, - "grad_norm": 2.34066104888916, - "learning_rate": 9.080543438123746e-07, - "loss": 0.16479721665382385, - "mean_token_accuracy": 0.9405456781387329, - "num_tokens": 25900015.0, - "step": 2898 - }, - { - "epoch": 2.202887537993921, - "grad_norm": 1.944082498550415, - "learning_rate": 9.064400256282757e-07, - "loss": 0.40259572863578796, - "mean_token_accuracy": 0.8632713556289673, - "num_tokens": 25908749.0, - "step": 2899 - }, - { - "epoch": 2.2036474164133737, - "grad_norm": 1.2758828401565552, - "learning_rate": 9.048268258659098e-07, - "loss": 0.3939874470233917, - "mean_token_accuracy": 0.8652969598770142, - "num_tokens": 25924972.0, - "step": 2900 - }, - { - "epoch": 2.204407294832827, - "grad_norm": 1.4483891725540161, - "learning_rate": 9.032147456574822e-07, - "loss": 0.4132935404777527, - "mean_token_accuracy": 0.868486762046814, - "num_tokens": 25939785.0, - "step": 2901 - }, - { - "epoch": 2.2051671732522795, - "grad_norm": 1.4866713285446167, - "learning_rate": 9.01603786134413e-07, - "loss": 0.3644951581954956, - "mean_token_accuracy": 0.8750203847885132, - "num_tokens": 25952648.0, - "step": 2902 - }, - { - "epoch": 2.2059270516717326, - "grad_norm": 1.6555454730987549, - "learning_rate": 8.999939484273362e-07, - "loss": 0.48656779527664185, - "mean_token_accuracy": 0.8372372984886169, - "num_tokens": 25965062.0, - "step": 2903 - }, - { - "epoch": 2.2066869300911853, - "grad_norm": 2.3154168128967285, - "learning_rate": 8.983852336660959e-07, - "loss": 0.3768891990184784, - "mean_token_accuracy": 0.8614999055862427, - "num_tokens": 25972152.0, - "step": 2904 - }, - { - "epoch": 2.2074468085106385, - "grad_norm": 2.3618056774139404, - "learning_rate": 8.967776429797529e-07, - "loss": 0.24905793368816376, - "mean_token_accuracy": 0.9170958995819092, - "num_tokens": 25977808.0, - "step": 2905 - }, - { - "epoch": 2.208206686930091, - "grad_norm": 1.929051399230957, - "learning_rate": 8.951711774965741e-07, - "loss": 0.38099539279937744, - "mean_token_accuracy": 0.8812143802642822, - "num_tokens": 25987871.0, - "step": 2906 - }, - { - "epoch": 2.2089665653495443, - "grad_norm": 1.6529620885849, - "learning_rate": 8.93565838344039e-07, - "loss": 0.31784749031066895, - "mean_token_accuracy": 0.8929437398910522, - "num_tokens": 25997777.0, - "step": 2907 - }, - { - "epoch": 2.209726443768997, - "grad_norm": 2.1413469314575195, - "learning_rate": 8.919616266488373e-07, - "loss": 0.4043882191181183, - "mean_token_accuracy": 0.8937146663665771, - "num_tokens": 26005213.0, - "step": 2908 - }, - { - "epoch": 2.2104863221884496, - "grad_norm": 1.3838988542556763, - "learning_rate": 8.903585435368658e-07, - "loss": 0.2858969569206238, - "mean_token_accuracy": 0.9084860682487488, - "num_tokens": 26018371.0, - "step": 2909 - }, - { - "epoch": 2.211246200607903, - "grad_norm": 1.2853319644927979, - "learning_rate": 8.887565901332304e-07, - "loss": 0.3178713619709015, - "mean_token_accuracy": 0.872230589389801, - "num_tokens": 26034136.0, - "step": 2910 - }, - { - "epoch": 2.2120060790273555, - "grad_norm": 2.9032399654388428, - "learning_rate": 8.871557675622442e-07, - "loss": 0.20348960161209106, - "mean_token_accuracy": 0.9275314807891846, - "num_tokens": 26038299.0, - "step": 2911 - }, - { - "epoch": 2.2127659574468086, - "grad_norm": 2.4349892139434814, - "learning_rate": 8.855560769474237e-07, - "loss": 0.24282032251358032, - "mean_token_accuracy": 0.9103988409042358, - "num_tokens": 26043427.0, - "step": 2912 - }, - { - "epoch": 2.2135258358662613, - "grad_norm": 2.324664831161499, - "learning_rate": 8.839575194114958e-07, - "loss": 0.3808317184448242, - "mean_token_accuracy": 0.8598989844322205, - "num_tokens": 26049667.0, - "step": 2913 - }, - { - "epoch": 2.2142857142857144, - "grad_norm": 2.594947576522827, - "learning_rate": 8.823600960763901e-07, - "loss": 0.39623332023620605, - "mean_token_accuracy": 0.8738477230072021, - "num_tokens": 26055428.0, - "step": 2914 - }, - { - "epoch": 2.215045592705167, - "grad_norm": 1.674308180809021, - "learning_rate": 8.807638080632375e-07, - "loss": 0.2641369104385376, - "mean_token_accuracy": 0.9119734764099121, - "num_tokens": 26064355.0, - "step": 2915 - }, - { - "epoch": 2.2158054711246202, - "grad_norm": 2.9884912967681885, - "learning_rate": 8.791686564923746e-07, - "loss": 0.19229236245155334, - "mean_token_accuracy": 0.9388723969459534, - "num_tokens": 26067563.0, - "step": 2916 - }, - { - "epoch": 2.216565349544073, - "grad_norm": 1.8513846397399902, - "learning_rate": 8.775746424833428e-07, - "loss": 0.3076218366622925, - "mean_token_accuracy": 0.9165210723876953, - "num_tokens": 26075609.0, - "step": 2917 - }, - { - "epoch": 2.217325227963526, - "grad_norm": 1.229604721069336, - "learning_rate": 8.759817671548801e-07, - "loss": 0.2727023959159851, - "mean_token_accuracy": 0.8931418061256409, - "num_tokens": 26091183.0, - "step": 2918 - }, - { - "epoch": 2.2180851063829787, - "grad_norm": 2.384413957595825, - "learning_rate": 8.743900316249273e-07, - "loss": 0.27312609553337097, - "mean_token_accuracy": 0.8972288370132446, - "num_tokens": 26096677.0, - "step": 2919 - }, - { - "epoch": 2.2188449848024314, - "grad_norm": 2.186370611190796, - "learning_rate": 8.727994370106288e-07, - "loss": 0.36045557260513306, - "mean_token_accuracy": 0.8788503408432007, - "num_tokens": 26104464.0, - "step": 2920 - }, - { - "epoch": 2.2196048632218845, - "grad_norm": 2.769796848297119, - "learning_rate": 8.71209984428322e-07, - "loss": 0.3427591919898987, - "mean_token_accuracy": 0.892108678817749, - "num_tokens": 26109571.0, - "step": 2921 - }, - { - "epoch": 2.2203647416413372, - "grad_norm": 2.9888014793395996, - "learning_rate": 8.696216749935471e-07, - "loss": 0.20137615501880646, - "mean_token_accuracy": 0.9366025924682617, - "num_tokens": 26113165.0, - "step": 2922 - }, - { - "epoch": 2.2211246200607904, - "grad_norm": 1.484858751296997, - "learning_rate": 8.680345098210408e-07, - "loss": 0.2884698510169983, - "mean_token_accuracy": 0.8992507457733154, - "num_tokens": 26124385.0, - "step": 2923 - }, - { - "epoch": 2.221884498480243, - "grad_norm": 1.690119981765747, - "learning_rate": 8.664484900247363e-07, - "loss": 0.34275567531585693, - "mean_token_accuracy": 0.8682634234428406, - "num_tokens": 26134944.0, - "step": 2924 - }, - { - "epoch": 2.222644376899696, - "grad_norm": 1.6171982288360596, - "learning_rate": 8.64863616717764e-07, - "loss": 0.256338506937027, - "mean_token_accuracy": 0.9281957745552063, - "num_tokens": 26143586.0, - "step": 2925 - }, - { - "epoch": 2.223404255319149, - "grad_norm": 2.4853835105895996, - "learning_rate": 8.632798910124493e-07, - "loss": 0.26290056109428406, - "mean_token_accuracy": 0.9119559526443481, - "num_tokens": 26148931.0, - "step": 2926 - }, - { - "epoch": 2.224164133738602, - "grad_norm": 2.0014333724975586, - "learning_rate": 8.616973140203097e-07, - "loss": 0.33400261402130127, - "mean_token_accuracy": 0.8796782493591309, - "num_tokens": 26156246.0, - "step": 2927 - }, - { - "epoch": 2.2249240121580547, - "grad_norm": 1.4637027978897095, - "learning_rate": 8.601158868520617e-07, - "loss": 0.24374958872795105, - "mean_token_accuracy": 0.9116952419281006, - "num_tokens": 26166431.0, - "step": 2928 - }, - { - "epoch": 2.225683890577508, - "grad_norm": 2.2056987285614014, - "learning_rate": 8.585356106176093e-07, - "loss": 0.3419337570667267, - "mean_token_accuracy": 0.8703858852386475, - "num_tokens": 26173974.0, - "step": 2929 - }, - { - "epoch": 2.2264437689969605, - "grad_norm": 1.3687927722930908, - "learning_rate": 8.569564864260524e-07, - "loss": 0.43176111578941345, - "mean_token_accuracy": 0.8616900444030762, - "num_tokens": 26191632.0, - "step": 2930 - }, - { - "epoch": 2.227203647416413, - "grad_norm": 1.4975634813308716, - "learning_rate": 8.553785153856809e-07, - "loss": 0.38525745272636414, - "mean_token_accuracy": 0.8611687421798706, - "num_tokens": 26203300.0, - "step": 2931 - }, - { - "epoch": 2.2279635258358663, - "grad_norm": 1.970109462738037, - "learning_rate": 8.538016986039751e-07, - "loss": 0.31731468439102173, - "mean_token_accuracy": 0.884365975856781, - "num_tokens": 26210037.0, - "step": 2932 - }, - { - "epoch": 2.228723404255319, - "grad_norm": 2.681717872619629, - "learning_rate": 8.522260371876068e-07, - "loss": 0.2770140767097473, - "mean_token_accuracy": 0.9020107984542847, - "num_tokens": 26215460.0, - "step": 2933 - }, - { - "epoch": 2.229483282674772, - "grad_norm": 2.2324795722961426, - "learning_rate": 8.506515322424349e-07, - "loss": 0.30599141120910645, - "mean_token_accuracy": 0.8939633965492249, - "num_tokens": 26221260.0, - "step": 2934 - }, - { - "epoch": 2.230243161094225, - "grad_norm": 2.08915376663208, - "learning_rate": 8.49078184873508e-07, - "loss": 0.3609209954738617, - "mean_token_accuracy": 0.8776482343673706, - "num_tokens": 26228397.0, - "step": 2935 - }, - { - "epoch": 2.231003039513678, - "grad_norm": 1.641366958618164, - "learning_rate": 8.475059961850617e-07, - "loss": 0.2969125509262085, - "mean_token_accuracy": 0.8949217796325684, - "num_tokens": 26238533.0, - "step": 2936 - }, - { - "epoch": 2.2317629179331306, - "grad_norm": 1.082148551940918, - "learning_rate": 8.459349672805198e-07, - "loss": 0.23957109451293945, - "mean_token_accuracy": 0.9255712032318115, - "num_tokens": 26254154.0, - "step": 2937 - }, - { - "epoch": 2.2325227963525838, - "grad_norm": 2.495208740234375, - "learning_rate": 8.443650992624877e-07, - "loss": 0.2879767417907715, - "mean_token_accuracy": 0.8911515474319458, - "num_tokens": 26260812.0, - "step": 2938 - }, - { - "epoch": 2.2332826747720365, - "grad_norm": 3.566549062728882, - "learning_rate": 8.427963932327621e-07, - "loss": 0.31420570611953735, - "mean_token_accuracy": 0.8888009190559387, - "num_tokens": 26264592.0, - "step": 2939 - }, - { - "epoch": 2.2340425531914896, - "grad_norm": 2.217177391052246, - "learning_rate": 8.412288502923211e-07, - "loss": 0.30547618865966797, - "mean_token_accuracy": 0.9065294861793518, - "num_tokens": 26270729.0, - "step": 2940 - }, - { - "epoch": 2.2348024316109423, - "grad_norm": 1.404260277748108, - "learning_rate": 8.396624715413251e-07, - "loss": 0.32485032081604004, - "mean_token_accuracy": 0.8799532651901245, - "num_tokens": 26284280.0, - "step": 2941 - }, - { - "epoch": 2.235562310030395, - "grad_norm": 1.5519827604293823, - "learning_rate": 8.380972580791191e-07, - "loss": 0.3330575227737427, - "mean_token_accuracy": 0.8865892887115479, - "num_tokens": 26293635.0, - "step": 2942 - }, - { - "epoch": 2.236322188449848, - "grad_norm": 2.604766845703125, - "learning_rate": 8.365332110042323e-07, - "loss": 0.18986842036247253, - "mean_token_accuracy": 0.9276989102363586, - "num_tokens": 26298553.0, - "step": 2943 - }, - { - "epoch": 2.237082066869301, - "grad_norm": 2.1750004291534424, - "learning_rate": 8.349703314143712e-07, - "loss": 0.3661153018474579, - "mean_token_accuracy": 0.8879489302635193, - "num_tokens": 26305697.0, - "step": 2944 - }, - { - "epoch": 2.237841945288754, - "grad_norm": 2.247069835662842, - "learning_rate": 8.334086204064254e-07, - "loss": 0.3127560615539551, - "mean_token_accuracy": 0.8846344351768494, - "num_tokens": 26312347.0, - "step": 2945 - }, - { - "epoch": 2.2386018237082066, - "grad_norm": 1.905275821685791, - "learning_rate": 8.318480790764638e-07, - "loss": 0.44245776534080505, - "mean_token_accuracy": 0.87440425157547, - "num_tokens": 26322787.0, - "step": 2946 - }, - { - "epoch": 2.2393617021276597, - "grad_norm": 1.8596254587173462, - "learning_rate": 8.302887085197342e-07, - "loss": 0.30068373680114746, - "mean_token_accuracy": 0.8847110271453857, - "num_tokens": 26330437.0, - "step": 2947 - }, - { - "epoch": 2.2401215805471124, - "grad_norm": 2.0028860569000244, - "learning_rate": 8.28730509830663e-07, - "loss": 0.4276006817817688, - "mean_token_accuracy": 0.8406014442443848, - "num_tokens": 26340100.0, - "step": 2948 - }, - { - "epoch": 2.2408814589665655, - "grad_norm": 2.494434356689453, - "learning_rate": 8.271734841028553e-07, - "loss": 0.3874223232269287, - "mean_token_accuracy": 0.8782174587249756, - "num_tokens": 26345750.0, - "step": 2949 - }, - { - "epoch": 2.2416413373860182, - "grad_norm": 1.955613613128662, - "learning_rate": 8.256176324290885e-07, - "loss": 0.28770074248313904, - "mean_token_accuracy": 0.9004360437393188, - "num_tokens": 26353342.0, - "step": 2950 - }, - { - "epoch": 2.2424012158054714, - "grad_norm": 1.7579785585403442, - "learning_rate": 8.240629559013222e-07, - "loss": 0.2277943640947342, - "mean_token_accuracy": 0.9145861864089966, - "num_tokens": 26361348.0, - "step": 2951 - }, - { - "epoch": 2.243161094224924, - "grad_norm": 1.5848479270935059, - "learning_rate": 8.22509455610688e-07, - "loss": 0.32944542169570923, - "mean_token_accuracy": 0.8662827014923096, - "num_tokens": 26372006.0, - "step": 2952 - }, - { - "epoch": 2.2439209726443767, - "grad_norm": 2.6263222694396973, - "learning_rate": 8.209571326474897e-07, - "loss": 0.34646326303482056, - "mean_token_accuracy": 0.8817736506462097, - "num_tokens": 26377664.0, - "step": 2953 - }, - { - "epoch": 2.24468085106383, - "grad_norm": 2.407590627670288, - "learning_rate": 8.194059881012107e-07, - "loss": 0.41302192211151123, - "mean_token_accuracy": 0.8898757696151733, - "num_tokens": 26384225.0, - "step": 2954 - }, - { - "epoch": 2.2454407294832825, - "grad_norm": 2.5156402587890625, - "learning_rate": 8.178560230605012e-07, - "loss": 0.3468608558177948, - "mean_token_accuracy": 0.8879599571228027, - "num_tokens": 26389374.0, - "step": 2955 - }, - { - "epoch": 2.2462006079027357, - "grad_norm": 1.5076090097427368, - "learning_rate": 8.163072386131876e-07, - "loss": 0.3750625550746918, - "mean_token_accuracy": 0.8712738752365112, - "num_tokens": 26402674.0, - "step": 2956 - }, - { - "epoch": 2.2469604863221884, - "grad_norm": 1.5181068181991577, - "learning_rate": 8.147596358462662e-07, - "loss": 0.19113478064537048, - "mean_token_accuracy": 0.9323463439941406, - "num_tokens": 26411626.0, - "step": 2957 - }, - { - "epoch": 2.2477203647416415, - "grad_norm": 1.0806915760040283, - "learning_rate": 8.132132158459044e-07, - "loss": 0.3411233425140381, - "mean_token_accuracy": 0.8736830949783325, - "num_tokens": 26435891.0, - "step": 2958 - }, - { - "epoch": 2.248480243161094, - "grad_norm": 1.5527247190475464, - "learning_rate": 8.116679796974389e-07, - "loss": 0.425741970539093, - "mean_token_accuracy": 0.8448845148086548, - "num_tokens": 26448134.0, - "step": 2959 - }, - { - "epoch": 2.2492401215805473, - "grad_norm": 1.2390631437301636, - "learning_rate": 8.10123928485377e-07, - "loss": 0.38084933161735535, - "mean_token_accuracy": 0.8656617999076843, - "num_tokens": 26467213.0, - "step": 2960 - }, - { - "epoch": 2.25, - "grad_norm": 3.0672852993011475, - "learning_rate": 8.08581063293391e-07, - "loss": 0.29300111532211304, - "mean_token_accuracy": 0.8933638334274292, - "num_tokens": 26471599.0, - "step": 2961 - }, - { - "epoch": 2.250759878419453, - "grad_norm": 1.2359145879745483, - "learning_rate": 8.070393852043251e-07, - "loss": 0.41337621212005615, - "mean_token_accuracy": 0.854198694229126, - "num_tokens": 26488461.0, - "step": 2962 - }, - { - "epoch": 2.251519756838906, - "grad_norm": 1.8551225662231445, - "learning_rate": 8.054988953001889e-07, - "loss": 0.3036419153213501, - "mean_token_accuracy": 0.8883144855499268, - "num_tokens": 26496398.0, - "step": 2963 - }, - { - "epoch": 2.2522796352583585, - "grad_norm": 1.3691812753677368, - "learning_rate": 8.039595946621551e-07, - "loss": 0.3286219835281372, - "mean_token_accuracy": 0.892130434513092, - "num_tokens": 26510493.0, - "step": 2964 - }, - { - "epoch": 2.2530395136778116, - "grad_norm": 1.7371556758880615, - "learning_rate": 8.024214843705647e-07, - "loss": 0.4105026125907898, - "mean_token_accuracy": 0.8889180421829224, - "num_tokens": 26519148.0, - "step": 2965 - }, - { - "epoch": 2.2537993920972643, - "grad_norm": 2.211665630340576, - "learning_rate": 8.00884565504925e-07, - "loss": 0.3912196159362793, - "mean_token_accuracy": 0.8632891774177551, - "num_tokens": 26526314.0, - "step": 2966 - }, - { - "epoch": 2.2545592705167175, - "grad_norm": 2.476206064224243, - "learning_rate": 7.993488391439025e-07, - "loss": 0.20462508499622345, - "mean_token_accuracy": 0.9276266098022461, - "num_tokens": 26531781.0, - "step": 2967 - }, - { - "epoch": 2.25531914893617, - "grad_norm": 1.4944102764129639, - "learning_rate": 7.978143063653296e-07, - "loss": 0.2694895267486572, - "mean_token_accuracy": 0.9033881425857544, - "num_tokens": 26543780.0, - "step": 2968 - }, - { - "epoch": 2.2560790273556233, - "grad_norm": 1.7570104598999023, - "learning_rate": 7.962809682462008e-07, - "loss": 0.3060353100299835, - "mean_token_accuracy": 0.8908290863037109, - "num_tokens": 26551978.0, - "step": 2969 - }, - { - "epoch": 2.256838905775076, - "grad_norm": 2.215514898300171, - "learning_rate": 7.947488258626718e-07, - "loss": 0.2930528521537781, - "mean_token_accuracy": 0.8989757299423218, - "num_tokens": 26558267.0, - "step": 2970 - }, - { - "epoch": 2.2575987841945286, - "grad_norm": 2.3069000244140625, - "learning_rate": 7.93217880290059e-07, - "loss": 0.18501774966716766, - "mean_token_accuracy": 0.931271493434906, - "num_tokens": 26563286.0, - "step": 2971 - }, - { - "epoch": 2.2583586626139818, - "grad_norm": 1.6555116176605225, - "learning_rate": 7.916881326028387e-07, - "loss": 0.3178265392780304, - "mean_token_accuracy": 0.9016884565353394, - "num_tokens": 26572087.0, - "step": 2972 - }, - { - "epoch": 2.2591185410334345, - "grad_norm": 2.222161054611206, - "learning_rate": 7.901595838746471e-07, - "loss": 0.3013504445552826, - "mean_token_accuracy": 0.8942798376083374, - "num_tokens": 26578159.0, - "step": 2973 - }, - { - "epoch": 2.2598784194528876, - "grad_norm": 1.979411005973816, - "learning_rate": 7.886322351782782e-07, - "loss": 0.42746615409851074, - "mean_token_accuracy": 0.85303795337677, - "num_tokens": 26586252.0, - "step": 2974 - }, - { - "epoch": 2.2606382978723403, - "grad_norm": 1.4925786256790161, - "learning_rate": 7.871060875856854e-07, - "loss": 0.33495625853538513, - "mean_token_accuracy": 0.8911026120185852, - "num_tokens": 26599921.0, - "step": 2975 - }, - { - "epoch": 2.2613981762917934, - "grad_norm": 1.9037046432495117, - "learning_rate": 7.855811421679746e-07, - "loss": 0.31471866369247437, - "mean_token_accuracy": 0.9007552862167358, - "num_tokens": 26607954.0, - "step": 2976 - }, - { - "epoch": 2.262158054711246, - "grad_norm": 2.2751407623291016, - "learning_rate": 7.840573999954154e-07, - "loss": 0.26972368359565735, - "mean_token_accuracy": 0.8992317914962769, - "num_tokens": 26614036.0, - "step": 2977 - }, - { - "epoch": 2.262917933130699, - "grad_norm": 2.680572271347046, - "learning_rate": 7.825348621374257e-07, - "loss": 0.4264066219329834, - "mean_token_accuracy": 0.8547691106796265, - "num_tokens": 26619545.0, - "step": 2978 - }, - { - "epoch": 2.263677811550152, - "grad_norm": 2.3535876274108887, - "learning_rate": 7.810135296625817e-07, - "loss": 0.37871062755584717, - "mean_token_accuracy": 0.8621708750724792, - "num_tokens": 26626248.0, - "step": 2979 - }, - { - "epoch": 2.264437689969605, - "grad_norm": 1.2249537706375122, - "learning_rate": 7.794934036386139e-07, - "loss": 0.3877285122871399, - "mean_token_accuracy": 0.8593572378158569, - "num_tokens": 26648023.0, - "step": 2980 - }, - { - "epoch": 2.2651975683890577, - "grad_norm": 2.43371844291687, - "learning_rate": 7.779744851324048e-07, - "loss": 0.37463510036468506, - "mean_token_accuracy": 0.8646193742752075, - "num_tokens": 26654016.0, - "step": 2981 - }, - { - "epoch": 2.2659574468085104, - "grad_norm": 1.7429327964782715, - "learning_rate": 7.7645677520999e-07, - "loss": 0.4033060669898987, - "mean_token_accuracy": 0.8644014596939087, - "num_tokens": 26664447.0, - "step": 2982 - }, - { - "epoch": 2.2667173252279635, - "grad_norm": 2.4090006351470947, - "learning_rate": 7.749402749365573e-07, - "loss": 0.2981206774711609, - "mean_token_accuracy": 0.8886175751686096, - "num_tokens": 26670355.0, - "step": 2983 - }, - { - "epoch": 2.2674772036474162, - "grad_norm": 1.3855396509170532, - "learning_rate": 7.734249853764428e-07, - "loss": 0.35967472195625305, - "mean_token_accuracy": 0.8652631044387817, - "num_tokens": 26685385.0, - "step": 2984 - }, - { - "epoch": 2.2682370820668694, - "grad_norm": 1.328214168548584, - "learning_rate": 7.719109075931375e-07, - "loss": 0.3571951389312744, - "mean_token_accuracy": 0.8894522190093994, - "num_tokens": 26703265.0, - "step": 2985 - }, - { - "epoch": 2.268996960486322, - "grad_norm": 2.5001046657562256, - "learning_rate": 7.703980426492791e-07, - "loss": 0.3512844741344452, - "mean_token_accuracy": 0.887405514717102, - "num_tokens": 26709095.0, - "step": 2986 - }, - { - "epoch": 2.269756838905775, - "grad_norm": 1.8704569339752197, - "learning_rate": 7.688863916066524e-07, - "loss": 0.2746743857860565, - "mean_token_accuracy": 0.903412401676178, - "num_tokens": 26716815.0, - "step": 2987 - }, - { - "epoch": 2.270516717325228, - "grad_norm": 2.1134285926818848, - "learning_rate": 7.673759555261947e-07, - "loss": 0.38385504484176636, - "mean_token_accuracy": 0.8759124279022217, - "num_tokens": 26724046.0, - "step": 2988 - }, - { - "epoch": 2.271276595744681, - "grad_norm": 1.2651840448379517, - "learning_rate": 7.65866735467988e-07, - "loss": 0.3499506413936615, - "mean_token_accuracy": 0.8704953193664551, - "num_tokens": 26743024.0, - "step": 2989 - }, - { - "epoch": 2.2720364741641337, - "grad_norm": 1.7289817333221436, - "learning_rate": 7.643587324912597e-07, - "loss": 0.3768725097179413, - "mean_token_accuracy": 0.8623670339584351, - "num_tokens": 26754336.0, - "step": 2990 - }, - { - "epoch": 2.272796352583587, - "grad_norm": 1.6121667623519897, - "learning_rate": 7.628519476543839e-07, - "loss": 0.42746737599372864, - "mean_token_accuracy": 0.8425478935241699, - "num_tokens": 26766813.0, - "step": 2991 - }, - { - "epoch": 2.2735562310030395, - "grad_norm": 2.705442428588867, - "learning_rate": 7.613463820148831e-07, - "loss": 0.27137982845306396, - "mean_token_accuracy": 0.9014253616333008, - "num_tokens": 26772565.0, - "step": 2992 - }, - { - "epoch": 2.274316109422492, - "grad_norm": 1.3811960220336914, - "learning_rate": 7.598420366294185e-07, - "loss": 0.2957465350627899, - "mean_token_accuracy": 0.8935354351997375, - "num_tokens": 26787325.0, - "step": 2993 - }, - { - "epoch": 2.2750759878419453, - "grad_norm": 2.469336986541748, - "learning_rate": 7.583389125537982e-07, - "loss": 0.2811780273914337, - "mean_token_accuracy": 0.8956634998321533, - "num_tokens": 26793457.0, - "step": 2994 - }, - { - "epoch": 2.275835866261398, - "grad_norm": 2.945681571960449, - "learning_rate": 7.568370108429732e-07, - "loss": 0.3186708092689514, - "mean_token_accuracy": 0.8817545175552368, - "num_tokens": 26797867.0, - "step": 2995 - }, - { - "epoch": 2.276595744680851, - "grad_norm": 1.7748228311538696, - "learning_rate": 7.553363325510355e-07, - "loss": 0.3279818892478943, - "mean_token_accuracy": 0.884396493434906, - "num_tokens": 26806656.0, - "step": 2996 - }, - { - "epoch": 2.277355623100304, - "grad_norm": 1.312500238418579, - "learning_rate": 7.538368787312186e-07, - "loss": 0.3754822611808777, - "mean_token_accuracy": 0.8653179407119751, - "num_tokens": 26823126.0, - "step": 2997 - }, - { - "epoch": 2.278115501519757, - "grad_norm": 3.1305344104766846, - "learning_rate": 7.523386504358984e-07, - "loss": 0.3293214440345764, - "mean_token_accuracy": 0.8908799886703491, - "num_tokens": 26828250.0, - "step": 2998 - }, - { - "epoch": 2.2788753799392096, - "grad_norm": 2.6449344158172607, - "learning_rate": 7.508416487165862e-07, - "loss": 0.23732036352157593, - "mean_token_accuracy": 0.9029837846755981, - "num_tokens": 26833123.0, - "step": 2999 - }, - { - "epoch": 2.2796352583586628, - "grad_norm": 2.04388427734375, - "learning_rate": 7.49345874623939e-07, - "loss": 0.31240373849868774, - "mean_token_accuracy": 0.8860392570495605, - "num_tokens": 26840878.0, - "step": 3000 - }, - { - "epoch": 2.2803951367781155, - "grad_norm": 1.1828604936599731, - "learning_rate": 7.478513292077463e-07, - "loss": 0.32127636671066284, - "mean_token_accuracy": 0.8938446044921875, - "num_tokens": 26858916.0, - "step": 3001 - }, - { - "epoch": 2.2811550151975686, - "grad_norm": 2.5061612129211426, - "learning_rate": 7.46358013516938e-07, - "loss": 0.30558091402053833, - "mean_token_accuracy": 0.8819161653518677, - "num_tokens": 26864218.0, - "step": 3002 - }, - { - "epoch": 2.2819148936170213, - "grad_norm": 2.424044609069824, - "learning_rate": 7.448659285995808e-07, - "loss": 0.3008216917514801, - "mean_token_accuracy": 0.8751994371414185, - "num_tokens": 26869646.0, - "step": 3003 - }, - { - "epoch": 2.282674772036474, - "grad_norm": 1.3576173782348633, - "learning_rate": 7.433750755028774e-07, - "loss": 0.3001647889614105, - "mean_token_accuracy": 0.8996933698654175, - "num_tokens": 26884385.0, - "step": 3004 - }, - { - "epoch": 2.283434650455927, - "grad_norm": 2.237589120864868, - "learning_rate": 7.418854552731655e-07, - "loss": 0.3126741051673889, - "mean_token_accuracy": 0.8910979628562927, - "num_tokens": 26891109.0, - "step": 3005 - }, - { - "epoch": 2.2841945288753798, - "grad_norm": 2.1947414875030518, - "learning_rate": 7.403970689559184e-07, - "loss": 0.29793858528137207, - "mean_token_accuracy": 0.9057353734970093, - "num_tokens": 26897905.0, - "step": 3006 - }, - { - "epoch": 2.284954407294833, - "grad_norm": 1.4252705574035645, - "learning_rate": 7.389099175957426e-07, - "loss": 0.2873227298259735, - "mean_token_accuracy": 0.8910978436470032, - "num_tokens": 26910322.0, - "step": 3007 - }, - { - "epoch": 2.2857142857142856, - "grad_norm": 1.2200649976730347, - "learning_rate": 7.374240022363785e-07, - "loss": 0.2782876491546631, - "mean_token_accuracy": 0.8948163390159607, - "num_tokens": 26927253.0, - "step": 3008 - }, - { - "epoch": 2.2864741641337387, - "grad_norm": 2.1249423027038574, - "learning_rate": 7.359393239206991e-07, - "loss": 0.4046584367752075, - "mean_token_accuracy": 0.8653120994567871, - "num_tokens": 26934798.0, - "step": 3009 - }, - { - "epoch": 2.2872340425531914, - "grad_norm": 1.6851856708526611, - "learning_rate": 7.344558836907067e-07, - "loss": 0.3814213275909424, - "mean_token_accuracy": 0.8618872165679932, - "num_tokens": 26944984.0, - "step": 3010 - }, - { - "epoch": 2.2879939209726445, - "grad_norm": 1.5802191495895386, - "learning_rate": 7.329736825875388e-07, - "loss": 0.28643855452537537, - "mean_token_accuracy": 0.9038295745849609, - "num_tokens": 26957832.0, - "step": 3011 - }, - { - "epoch": 2.288753799392097, - "grad_norm": 1.6257383823394775, - "learning_rate": 7.314927216514617e-07, - "loss": 0.264072448015213, - "mean_token_accuracy": 0.9089190363883972, - "num_tokens": 26967621.0, - "step": 3012 - }, - { - "epoch": 2.2895136778115504, - "grad_norm": 2.107192039489746, - "learning_rate": 7.300130019218688e-07, - "loss": 0.2772635817527771, - "mean_token_accuracy": 0.9071067571640015, - "num_tokens": 26974669.0, - "step": 3013 - }, - { - "epoch": 2.290273556231003, - "grad_norm": 1.496505618095398, - "learning_rate": 7.285345244372843e-07, - "loss": 0.2936630845069885, - "mean_token_accuracy": 0.8946818113327026, - "num_tokens": 26985942.0, - "step": 3014 - }, - { - "epoch": 2.2910334346504557, - "grad_norm": 1.6122950315475464, - "learning_rate": 7.270572902353634e-07, - "loss": 0.2819349765777588, - "mean_token_accuracy": 0.8909854888916016, - "num_tokens": 26996231.0, - "step": 3015 - }, - { - "epoch": 2.291793313069909, - "grad_norm": 1.9463475942611694, - "learning_rate": 7.255813003528834e-07, - "loss": 0.2584724426269531, - "mean_token_accuracy": 0.9069744348526001, - "num_tokens": 27003253.0, - "step": 3016 - }, - { - "epoch": 2.2925531914893615, - "grad_norm": 2.1707770824432373, - "learning_rate": 7.241065558257513e-07, - "loss": 0.17524898052215576, - "mean_token_accuracy": 0.926141083240509, - "num_tokens": 27009501.0, - "step": 3017 - }, - { - "epoch": 2.2933130699088147, - "grad_norm": 2.1424882411956787, - "learning_rate": 7.226330576889998e-07, - "loss": 0.26512211561203003, - "mean_token_accuracy": 0.9059023857116699, - "num_tokens": 27016096.0, - "step": 3018 - }, - { - "epoch": 2.2940729483282674, - "grad_norm": 3.50669264793396, - "learning_rate": 7.211608069767867e-07, - "loss": 0.24738222360610962, - "mean_token_accuracy": 0.9179760217666626, - "num_tokens": 27019810.0, - "step": 3019 - }, - { - "epoch": 2.2948328267477205, - "grad_norm": 1.5426064729690552, - "learning_rate": 7.196898047223943e-07, - "loss": 0.2762960195541382, - "mean_token_accuracy": 0.8937389850616455, - "num_tokens": 27031952.0, - "step": 3020 - }, - { - "epoch": 2.295592705167173, - "grad_norm": 2.469064712524414, - "learning_rate": 7.182200519582283e-07, - "loss": 0.2877562940120697, - "mean_token_accuracy": 0.9252556562423706, - "num_tokens": 27036673.0, - "step": 3021 - }, - { - "epoch": 2.2963525835866263, - "grad_norm": 3.289813756942749, - "learning_rate": 7.167515497158179e-07, - "loss": 0.2837294340133667, - "mean_token_accuracy": 0.9070497155189514, - "num_tokens": 27041001.0, - "step": 3022 - }, - { - "epoch": 2.297112462006079, - "grad_norm": 1.7201104164123535, - "learning_rate": 7.152842990258147e-07, - "loss": 0.44239580631256104, - "mean_token_accuracy": 0.8443326354026794, - "num_tokens": 27052265.0, - "step": 3023 - }, - { - "epoch": 2.297872340425532, - "grad_norm": 1.3710078001022339, - "learning_rate": 7.138183009179922e-07, - "loss": 0.40450236201286316, - "mean_token_accuracy": 0.87160724401474, - "num_tokens": 27068475.0, - "step": 3024 - }, - { - "epoch": 2.298632218844985, - "grad_norm": 2.1379098892211914, - "learning_rate": 7.123535564212419e-07, - "loss": 0.3432690501213074, - "mean_token_accuracy": 0.8736584186553955, - "num_tokens": 27075548.0, - "step": 3025 - }, - { - "epoch": 2.2993920972644375, - "grad_norm": 2.423079252243042, - "learning_rate": 7.108900665635815e-07, - "loss": 0.27869731187820435, - "mean_token_accuracy": 0.9046810865402222, - "num_tokens": 27081560.0, - "step": 3026 - }, - { - "epoch": 2.3001519756838906, - "grad_norm": 1.2137898206710815, - "learning_rate": 7.094278323721418e-07, - "loss": 0.41351836919784546, - "mean_token_accuracy": 0.8553295135498047, - "num_tokens": 27098346.0, - "step": 3027 - }, - { - "epoch": 2.3009118541033433, - "grad_norm": 1.371337890625, - "learning_rate": 7.079668548731757e-07, - "loss": 0.29800572991371155, - "mean_token_accuracy": 0.9219756126403809, - "num_tokens": 27111678.0, - "step": 3028 - }, - { - "epoch": 2.3016717325227964, - "grad_norm": 3.133449077606201, - "learning_rate": 7.065071350920538e-07, - "loss": 0.39177340269088745, - "mean_token_accuracy": 0.8742524981498718, - "num_tokens": 27116496.0, - "step": 3029 - }, - { - "epoch": 2.302431610942249, - "grad_norm": 1.4038591384887695, - "learning_rate": 7.050486740532633e-07, - "loss": 0.2862081825733185, - "mean_token_accuracy": 0.8894703984260559, - "num_tokens": 27130806.0, - "step": 3030 - }, - { - "epoch": 2.3031914893617023, - "grad_norm": 1.806132197380066, - "learning_rate": 7.035914727804085e-07, - "loss": 0.42546606063842773, - "mean_token_accuracy": 0.876154363155365, - "num_tokens": 27143687.0, - "step": 3031 - }, - { - "epoch": 2.303951367781155, - "grad_norm": 1.8565905094146729, - "learning_rate": 7.021355322962103e-07, - "loss": 0.304633229970932, - "mean_token_accuracy": 0.896949052810669, - "num_tokens": 27152532.0, - "step": 3032 - }, - { - "epoch": 2.304711246200608, - "grad_norm": 2.8857851028442383, - "learning_rate": 7.006808536225009e-07, - "loss": 0.3943948745727539, - "mean_token_accuracy": 0.8629783391952515, - "num_tokens": 27157824.0, - "step": 3033 - }, - { - "epoch": 2.3054711246200608, - "grad_norm": 1.7708746194839478, - "learning_rate": 6.992274377802328e-07, - "loss": 0.46951010823249817, - "mean_token_accuracy": 0.8334795236587524, - "num_tokens": 27169445.0, - "step": 3034 - }, - { - "epoch": 2.306231003039514, - "grad_norm": 2.5275487899780273, - "learning_rate": 6.977752857894684e-07, - "loss": 0.3764885365962982, - "mean_token_accuracy": 0.8665527105331421, - "num_tokens": 27176545.0, - "step": 3035 - }, - { - "epoch": 2.3069908814589666, - "grad_norm": 1.9251405000686646, - "learning_rate": 6.963243986693832e-07, - "loss": 0.44473910331726074, - "mean_token_accuracy": 0.8828103542327881, - "num_tokens": 27187808.0, - "step": 3036 - }, - { - "epoch": 2.3077507598784193, - "grad_norm": 2.1559739112854004, - "learning_rate": 6.94874777438265e-07, - "loss": 0.35055795311927795, - "mean_token_accuracy": 0.8815537691116333, - "num_tokens": 27195493.0, - "step": 3037 - }, - { - "epoch": 2.3085106382978724, - "grad_norm": 1.2242814302444458, - "learning_rate": 6.934264231135163e-07, - "loss": 0.38762199878692627, - "mean_token_accuracy": 0.8607999086380005, - "num_tokens": 27213291.0, - "step": 3038 - }, - { - "epoch": 2.309270516717325, - "grad_norm": 3.787707805633545, - "learning_rate": 6.919793367116453e-07, - "loss": 0.299210786819458, - "mean_token_accuracy": 0.8993752002716064, - "num_tokens": 27216930.0, - "step": 3039 - }, - { - "epoch": 2.310030395136778, - "grad_norm": 1.4088979959487915, - "learning_rate": 6.905335192482734e-07, - "loss": 0.337495893239975, - "mean_token_accuracy": 0.8903428912162781, - "num_tokens": 27229441.0, - "step": 3040 - }, - { - "epoch": 2.310790273556231, - "grad_norm": 2.0042521953582764, - "learning_rate": 6.890889717381333e-07, - "loss": 0.2732951045036316, - "mean_token_accuracy": 0.8986722826957703, - "num_tokens": 27237525.0, - "step": 3041 - }, - { - "epoch": 2.311550151975684, - "grad_norm": 2.4301047325134277, - "learning_rate": 6.876456951950614e-07, - "loss": 0.25528258085250854, - "mean_token_accuracy": 0.9083898663520813, - "num_tokens": 27243073.0, - "step": 3042 - }, - { - "epoch": 2.3123100303951367, - "grad_norm": 1.4725151062011719, - "learning_rate": 6.862036906320055e-07, - "loss": 0.3366362452507019, - "mean_token_accuracy": 0.8746060729026794, - "num_tokens": 27255151.0, - "step": 3043 - }, - { - "epoch": 2.31306990881459, - "grad_norm": 2.687649965286255, - "learning_rate": 6.847629590610202e-07, - "loss": 0.30955633521080017, - "mean_token_accuracy": 0.8862895369529724, - "num_tokens": 27259909.0, - "step": 3044 - }, - { - "epoch": 2.3138297872340425, - "grad_norm": 1.9105106592178345, - "learning_rate": 6.833235014932662e-07, - "loss": 0.3366878628730774, - "mean_token_accuracy": 0.8920552134513855, - "num_tokens": 27268003.0, - "step": 3045 - }, - { - "epoch": 2.3145896656534957, - "grad_norm": 2.278108596801758, - "learning_rate": 6.818853189390104e-07, - "loss": 0.41192957758903503, - "mean_token_accuracy": 0.8558850288391113, - "num_tokens": 27275447.0, - "step": 3046 - }, - { - "epoch": 2.3153495440729484, - "grad_norm": 3.114295482635498, - "learning_rate": 6.804484124076249e-07, - "loss": 0.16981825232505798, - "mean_token_accuracy": 0.9305338859558105, - "num_tokens": 27279348.0, - "step": 3047 - }, - { - "epoch": 2.316109422492401, - "grad_norm": 1.188263177871704, - "learning_rate": 6.790127829075843e-07, - "loss": 0.3003719747066498, - "mean_token_accuracy": 0.8945091366767883, - "num_tokens": 27296576.0, - "step": 3048 - }, - { - "epoch": 2.316869300911854, - "grad_norm": 1.4627037048339844, - "learning_rate": 6.775784314464717e-07, - "loss": 0.42125576734542847, - "mean_token_accuracy": 0.85997474193573, - "num_tokens": 27310603.0, - "step": 3049 - }, - { - "epoch": 2.317629179331307, - "grad_norm": 1.86640465259552, - "learning_rate": 6.761453590309675e-07, - "loss": 0.27236056327819824, - "mean_token_accuracy": 0.8952003717422485, - "num_tokens": 27320635.0, - "step": 3050 - }, - { - "epoch": 2.31838905775076, - "grad_norm": 2.3250787258148193, - "learning_rate": 6.747135666668581e-07, - "loss": 0.35650634765625, - "mean_token_accuracy": 0.8870455026626587, - "num_tokens": 27326778.0, - "step": 3051 - }, - { - "epoch": 2.3191489361702127, - "grad_norm": 1.493028163909912, - "learning_rate": 6.732830553590305e-07, - "loss": 0.3086358308792114, - "mean_token_accuracy": 0.8837405443191528, - "num_tokens": 27341792.0, - "step": 3052 - }, - { - "epoch": 2.319908814589666, - "grad_norm": 1.9723037481307983, - "learning_rate": 6.718538261114727e-07, - "loss": 0.2970390021800995, - "mean_token_accuracy": 0.8897635340690613, - "num_tokens": 27349764.0, - "step": 3053 - }, - { - "epoch": 2.3206686930091185, - "grad_norm": 2.418403387069702, - "learning_rate": 6.704258799272723e-07, - "loss": 0.31288546323776245, - "mean_token_accuracy": 0.8795867562294006, - "num_tokens": 27355223.0, - "step": 3054 - }, - { - "epoch": 2.3214285714285716, - "grad_norm": 1.866711139678955, - "learning_rate": 6.689992178086174e-07, - "loss": 0.2915012240409851, - "mean_token_accuracy": 0.8901758790016174, - "num_tokens": 27363363.0, - "step": 3055 - }, - { - "epoch": 2.3221884498480243, - "grad_norm": 2.52559494972229, - "learning_rate": 6.675738407567941e-07, - "loss": 0.28706514835357666, - "mean_token_accuracy": 0.9131950736045837, - "num_tokens": 27368937.0, - "step": 3056 - }, - { - "epoch": 2.3229483282674774, - "grad_norm": 1.5393383502960205, - "learning_rate": 6.661497497721872e-07, - "loss": 0.41627925634384155, - "mean_token_accuracy": 0.8846169114112854, - "num_tokens": 27381824.0, - "step": 3057 - }, - { - "epoch": 2.32370820668693, - "grad_norm": 1.2711350917816162, - "learning_rate": 6.647269458542793e-07, - "loss": 0.3200211524963379, - "mean_token_accuracy": 0.8812989592552185, - "num_tokens": 27399489.0, - "step": 3058 - }, - { - "epoch": 2.324468085106383, - "grad_norm": 2.4790799617767334, - "learning_rate": 6.633054300016464e-07, - "loss": 0.21309956908226013, - "mean_token_accuracy": 0.9245274066925049, - "num_tokens": 27403825.0, - "step": 3059 - }, - { - "epoch": 2.325227963525836, - "grad_norm": 1.937660813331604, - "learning_rate": 6.618852032119655e-07, - "loss": 0.18426720798015594, - "mean_token_accuracy": 0.9317672252655029, - "num_tokens": 27410934.0, - "step": 3060 - }, - { - "epoch": 2.3259878419452886, - "grad_norm": 1.4951587915420532, - "learning_rate": 6.604662664820063e-07, - "loss": 0.27759790420532227, - "mean_token_accuracy": 0.9198849201202393, - "num_tokens": 27421281.0, - "step": 3061 - }, - { - "epoch": 2.3267477203647418, - "grad_norm": 1.6459094285964966, - "learning_rate": 6.590486208076319e-07, - "loss": 0.3164416551589966, - "mean_token_accuracy": 0.8805180788040161, - "num_tokens": 27431545.0, - "step": 3062 - }, - { - "epoch": 2.3275075987841944, - "grad_norm": 1.6612298488616943, - "learning_rate": 6.576322671838003e-07, - "loss": 0.35754746198654175, - "mean_token_accuracy": 0.8680465817451477, - "num_tokens": 27441566.0, - "step": 3063 - }, - { - "epoch": 2.3282674772036476, - "grad_norm": 2.4485018253326416, - "learning_rate": 6.562172066045655e-07, - "loss": 0.2957935929298401, - "mean_token_accuracy": 0.886491596698761, - "num_tokens": 27447186.0, - "step": 3064 - }, - { - "epoch": 2.3290273556231003, - "grad_norm": 1.9771100282669067, - "learning_rate": 6.548034400630693e-07, - "loss": 0.3137952387332916, - "mean_token_accuracy": 0.8874903321266174, - "num_tokens": 27454347.0, - "step": 3065 - }, - { - "epoch": 2.329787234042553, - "grad_norm": 4.502175331115723, - "learning_rate": 6.533909685515483e-07, - "loss": 0.30587732791900635, - "mean_token_accuracy": 0.8878371715545654, - "num_tokens": 27457322.0, - "step": 3066 - }, - { - "epoch": 2.330547112462006, - "grad_norm": 1.041748285293579, - "learning_rate": 6.519797930613289e-07, - "loss": 0.2936970889568329, - "mean_token_accuracy": 0.8899037837982178, - "num_tokens": 27476750.0, - "step": 3067 - }, - { - "epoch": 2.331306990881459, - "grad_norm": 1.57416570186615, - "learning_rate": 6.505699145828287e-07, - "loss": 0.2849736511707306, - "mean_token_accuracy": 0.8906558156013489, - "num_tokens": 27489326.0, - "step": 3068 - }, - { - "epoch": 2.332066869300912, - "grad_norm": 2.879692792892456, - "learning_rate": 6.491613341055547e-07, - "loss": 0.22944235801696777, - "mean_token_accuracy": 0.9167940616607666, - "num_tokens": 27493562.0, - "step": 3069 - }, - { - "epoch": 2.3328267477203646, - "grad_norm": 2.3187942504882812, - "learning_rate": 6.477540526181036e-07, - "loss": 0.3072662949562073, - "mean_token_accuracy": 0.8936570882797241, - "num_tokens": 27499670.0, - "step": 3070 - }, - { - "epoch": 2.3335866261398177, - "grad_norm": 1.3098584413528442, - "learning_rate": 6.463480711081577e-07, - "loss": 0.4124477505683899, - "mean_token_accuracy": 0.8422118425369263, - "num_tokens": 27518197.0, - "step": 3071 - }, - { - "epoch": 2.3343465045592704, - "grad_norm": 1.874219298362732, - "learning_rate": 6.449433905624916e-07, - "loss": 0.34171411395072937, - "mean_token_accuracy": 0.8761874437332153, - "num_tokens": 27526512.0, - "step": 3072 - }, - { - "epoch": 2.3351063829787235, - "grad_norm": 3.3637123107910156, - "learning_rate": 6.435400119669618e-07, - "loss": 0.23634830117225647, - "mean_token_accuracy": 0.9309012293815613, - "num_tokens": 27529921.0, - "step": 3073 - }, - { - "epoch": 2.335866261398176, - "grad_norm": 2.025264263153076, - "learning_rate": 6.421379363065142e-07, - "loss": 0.352272629737854, - "mean_token_accuracy": 0.8678278923034668, - "num_tokens": 27537122.0, - "step": 3074 - }, - { - "epoch": 2.3366261398176293, - "grad_norm": 1.7762253284454346, - "learning_rate": 6.407371645651808e-07, - "loss": 0.3190876841545105, - "mean_token_accuracy": 0.8870849609375, - "num_tokens": 27547436.0, - "step": 3075 - }, - { - "epoch": 2.337386018237082, - "grad_norm": 1.4258071184158325, - "learning_rate": 6.393376977260754e-07, - "loss": 0.24304701387882233, - "mean_token_accuracy": 0.9347224235534668, - "num_tokens": 27559322.0, - "step": 3076 - }, - { - "epoch": 2.3381458966565347, - "grad_norm": 2.015075922012329, - "learning_rate": 6.379395367713983e-07, - "loss": 0.37574928998947144, - "mean_token_accuracy": 0.8884165287017822, - "num_tokens": 27566564.0, - "step": 3077 - }, - { - "epoch": 2.338905775075988, - "grad_norm": 2.2211477756500244, - "learning_rate": 6.365426826824328e-07, - "loss": 0.3210097551345825, - "mean_token_accuracy": 0.8879522085189819, - "num_tokens": 27573643.0, - "step": 3078 - }, - { - "epoch": 2.339665653495441, - "grad_norm": 2.102496385574341, - "learning_rate": 6.351471364395448e-07, - "loss": 0.4013458490371704, - "mean_token_accuracy": 0.887574315071106, - "num_tokens": 27580724.0, - "step": 3079 - }, - { - "epoch": 2.3404255319148937, - "grad_norm": 1.6786696910858154, - "learning_rate": 6.337528990221822e-07, - "loss": 0.3980376124382019, - "mean_token_accuracy": 0.8881500363349915, - "num_tokens": 27592147.0, - "step": 3080 - }, - { - "epoch": 2.3411854103343464, - "grad_norm": 2.541473388671875, - "learning_rate": 6.323599714088754e-07, - "loss": 0.1682094782590866, - "mean_token_accuracy": 0.9426926374435425, - "num_tokens": 27596757.0, - "step": 3081 - }, - { - "epoch": 2.3419452887537995, - "grad_norm": 2.0378596782684326, - "learning_rate": 6.309683545772327e-07, - "loss": 0.4023628234863281, - "mean_token_accuracy": 0.8561117649078369, - "num_tokens": 27604923.0, - "step": 3082 - }, - { - "epoch": 2.342705167173252, - "grad_norm": 1.7666785717010498, - "learning_rate": 6.29578049503946e-07, - "loss": 0.37102991342544556, - "mean_token_accuracy": 0.8807623386383057, - "num_tokens": 27614106.0, - "step": 3083 - }, - { - "epoch": 2.3434650455927053, - "grad_norm": 1.6605560779571533, - "learning_rate": 6.281890571647853e-07, - "loss": 0.4239729642868042, - "mean_token_accuracy": 0.8428831696510315, - "num_tokens": 27626568.0, - "step": 3084 - }, - { - "epoch": 2.344224924012158, - "grad_norm": 1.9562166929244995, - "learning_rate": 6.268013785345969e-07, - "loss": 0.16737908124923706, - "mean_token_accuracy": 0.9457347393035889, - "num_tokens": 27632789.0, - "step": 3085 - }, - { - "epoch": 2.344984802431611, - "grad_norm": 2.274827480316162, - "learning_rate": 6.254150145873081e-07, - "loss": 0.3866672217845917, - "mean_token_accuracy": 0.8498655557632446, - "num_tokens": 27639692.0, - "step": 3086 - }, - { - "epoch": 2.345744680851064, - "grad_norm": 1.9612165689468384, - "learning_rate": 6.240299662959237e-07, - "loss": 0.2607918977737427, - "mean_token_accuracy": 0.9195128679275513, - "num_tokens": 27646911.0, - "step": 3087 - }, - { - "epoch": 2.3465045592705165, - "grad_norm": 1.6821730136871338, - "learning_rate": 6.226462346325221e-07, - "loss": 0.3244997560977936, - "mean_token_accuracy": 0.8889811038970947, - "num_tokens": 27656789.0, - "step": 3088 - }, - { - "epoch": 2.3472644376899696, - "grad_norm": 1.8024263381958008, - "learning_rate": 6.2126382056826e-07, - "loss": 0.28899791836738586, - "mean_token_accuracy": 0.8931136131286621, - "num_tokens": 27666153.0, - "step": 3089 - }, - { - "epoch": 2.3480243161094223, - "grad_norm": 2.8205342292785645, - "learning_rate": 6.198827250733694e-07, - "loss": 0.32387930154800415, - "mean_token_accuracy": 0.9032641649246216, - "num_tokens": 27671042.0, - "step": 3090 - }, - { - "epoch": 2.3487841945288754, - "grad_norm": 2.8001155853271484, - "learning_rate": 6.185029491171554e-07, - "loss": 0.3122251331806183, - "mean_token_accuracy": 0.9122956395149231, - "num_tokens": 27675732.0, - "step": 3091 - }, - { - "epoch": 2.349544072948328, - "grad_norm": 2.6694142818450928, - "learning_rate": 6.171244936679985e-07, - "loss": 0.3166629374027252, - "mean_token_accuracy": 0.875450074672699, - "num_tokens": 27681448.0, - "step": 3092 - }, - { - "epoch": 2.3503039513677813, - "grad_norm": 1.515966534614563, - "learning_rate": 6.157473596933517e-07, - "loss": 0.17373405396938324, - "mean_token_accuracy": 0.933076798915863, - "num_tokens": 27690654.0, - "step": 3093 - }, - { - "epoch": 2.351063829787234, - "grad_norm": 2.4486823081970215, - "learning_rate": 6.143715481597404e-07, - "loss": 0.18732565641403198, - "mean_token_accuracy": 0.9323808550834656, - "num_tokens": 27696111.0, - "step": 3094 - }, - { - "epoch": 2.351823708206687, - "grad_norm": 2.3000645637512207, - "learning_rate": 6.129970600327623e-07, - "loss": 0.267723023891449, - "mean_token_accuracy": 0.9053730964660645, - "num_tokens": 27702103.0, - "step": 3095 - }, - { - "epoch": 2.3525835866261398, - "grad_norm": 2.533583164215088, - "learning_rate": 6.116238962770868e-07, - "loss": 0.40778815746307373, - "mean_token_accuracy": 0.8500792980194092, - "num_tokens": 27708868.0, - "step": 3096 - }, - { - "epoch": 2.353343465045593, - "grad_norm": 1.9357147216796875, - "learning_rate": 6.102520578564508e-07, - "loss": 0.2880813479423523, - "mean_token_accuracy": 0.8895434141159058, - "num_tokens": 27716730.0, - "step": 3097 - }, - { - "epoch": 2.3541033434650456, - "grad_norm": 3.1041259765625, - "learning_rate": 6.088815457336664e-07, - "loss": 0.21810382604599, - "mean_token_accuracy": 0.9217148423194885, - "num_tokens": 27720792.0, - "step": 3098 - }, - { - "epoch": 2.3548632218844983, - "grad_norm": 2.890695095062256, - "learning_rate": 6.075123608706093e-07, - "loss": 0.4002879858016968, - "mean_token_accuracy": 0.8573901653289795, - "num_tokens": 27726201.0, - "step": 3099 - }, - { - "epoch": 2.3556231003039514, - "grad_norm": 3.4247958660125732, - "learning_rate": 6.061445042282271e-07, - "loss": 0.4269426465034485, - "mean_token_accuracy": 0.848825216293335, - "num_tokens": 27730419.0, - "step": 3100 - }, - { - "epoch": 2.356382978723404, - "grad_norm": 1.8903621435165405, - "learning_rate": 6.047779767665341e-07, - "loss": 0.460983544588089, - "mean_token_accuracy": 0.8535886406898499, - "num_tokens": 27741121.0, - "step": 3101 - }, - { - "epoch": 2.357142857142857, - "grad_norm": 2.6975221633911133, - "learning_rate": 6.03412779444612e-07, - "loss": 0.34841713309288025, - "mean_token_accuracy": 0.8812501430511475, - "num_tokens": 27746537.0, - "step": 3102 - }, - { - "epoch": 2.35790273556231, - "grad_norm": 1.4414833784103394, - "learning_rate": 6.02048913220609e-07, - "loss": 0.34440115094184875, - "mean_token_accuracy": 0.8725030422210693, - "num_tokens": 27761085.0, - "step": 3103 - }, - { - "epoch": 2.358662613981763, - "grad_norm": 1.7643623352050781, - "learning_rate": 6.006863790517392e-07, - "loss": 0.31087273359298706, - "mean_token_accuracy": 0.9108829498291016, - "num_tokens": 27769320.0, - "step": 3104 - }, - { - "epoch": 2.3594224924012157, - "grad_norm": 1.365966558456421, - "learning_rate": 5.993251778942794e-07, - "loss": 0.501873254776001, - "mean_token_accuracy": 0.8246122598648071, - "num_tokens": 27791567.0, - "step": 3105 - }, - { - "epoch": 2.360182370820669, - "grad_norm": 2.5981390476226807, - "learning_rate": 5.979653107035754e-07, - "loss": 0.27364015579223633, - "mean_token_accuracy": 0.8946651816368103, - "num_tokens": 27796849.0, - "step": 3106 - }, - { - "epoch": 2.3609422492401215, - "grad_norm": 3.3564229011535645, - "learning_rate": 5.966067784340346e-07, - "loss": 0.2456880509853363, - "mean_token_accuracy": 0.9110729694366455, - "num_tokens": 27800785.0, - "step": 3107 - }, - { - "epoch": 2.3617021276595747, - "grad_norm": 1.6739033460617065, - "learning_rate": 5.952495820391244e-07, - "loss": 0.30737343430519104, - "mean_token_accuracy": 0.8898587226867676, - "num_tokens": 27811982.0, - "step": 3108 - }, - { - "epoch": 2.3624620060790273, - "grad_norm": 1.4430924654006958, - "learning_rate": 5.9389372247138e-07, - "loss": 0.46142861247062683, - "mean_token_accuracy": 0.8355259895324707, - "num_tokens": 27827765.0, - "step": 3109 - }, - { - "epoch": 2.36322188449848, - "grad_norm": 3.7220218181610107, - "learning_rate": 5.92539200682396e-07, - "loss": 0.18588921427726746, - "mean_token_accuracy": 0.9419732093811035, - "num_tokens": 27830551.0, - "step": 3110 - }, - { - "epoch": 2.363981762917933, - "grad_norm": 2.4770448207855225, - "learning_rate": 5.911860176228262e-07, - "loss": 0.3194807767868042, - "mean_token_accuracy": 0.8959789276123047, - "num_tokens": 27836529.0, - "step": 3111 - }, - { - "epoch": 2.364741641337386, - "grad_norm": 2.1989665031433105, - "learning_rate": 5.898341742423866e-07, - "loss": 0.23653598129749298, - "mean_token_accuracy": 0.9119038581848145, - "num_tokens": 27842019.0, - "step": 3112 - }, - { - "epoch": 2.365501519756839, - "grad_norm": 1.9562573432922363, - "learning_rate": 5.884836714898554e-07, - "loss": 0.320852130651474, - "mean_token_accuracy": 0.8902987837791443, - "num_tokens": 27850663.0, - "step": 3113 - }, - { - "epoch": 2.3662613981762917, - "grad_norm": 1.4759801626205444, - "learning_rate": 5.871345103130646e-07, - "loss": 0.2739158570766449, - "mean_token_accuracy": 0.9033761024475098, - "num_tokens": 27863451.0, - "step": 3114 - }, - { - "epoch": 2.367021276595745, - "grad_norm": 1.7798938751220703, - "learning_rate": 5.857866916589089e-07, - "loss": 0.35400229692459106, - "mean_token_accuracy": 0.8623180389404297, - "num_tokens": 27873669.0, - "step": 3115 - }, - { - "epoch": 2.3677811550151975, - "grad_norm": 2.269472360610962, - "learning_rate": 5.84440216473339e-07, - "loss": 0.3717876672744751, - "mean_token_accuracy": 0.8853007555007935, - "num_tokens": 27880307.0, - "step": 3116 - }, - { - "epoch": 2.3685410334346506, - "grad_norm": 1.5675846338272095, - "learning_rate": 5.830950857013629e-07, - "loss": 0.3465133011341095, - "mean_token_accuracy": 0.876459002494812, - "num_tokens": 27893889.0, - "step": 3117 - }, - { - "epoch": 2.3693009118541033, - "grad_norm": 2.782482147216797, - "learning_rate": 5.817513002870451e-07, - "loss": 0.14173674583435059, - "mean_token_accuracy": 0.9492213726043701, - "num_tokens": 27897693.0, - "step": 3118 - }, - { - "epoch": 2.3700607902735564, - "grad_norm": 1.830674171447754, - "learning_rate": 5.80408861173507e-07, - "loss": 0.2692085802555084, - "mean_token_accuracy": 0.9287421107292175, - "num_tokens": 27905261.0, - "step": 3119 - }, - { - "epoch": 2.370820668693009, - "grad_norm": 2.2477660179138184, - "learning_rate": 5.790677693029217e-07, - "loss": 0.32119685411453247, - "mean_token_accuracy": 0.8751975297927856, - "num_tokens": 27911581.0, - "step": 3120 - }, - { - "epoch": 2.371580547112462, - "grad_norm": 2.3288302421569824, - "learning_rate": 5.777280256165218e-07, - "loss": 0.34133443236351013, - "mean_token_accuracy": 0.8763091564178467, - "num_tokens": 27918603.0, - "step": 3121 - }, - { - "epoch": 2.372340425531915, - "grad_norm": 1.595375895500183, - "learning_rate": 5.763896310545893e-07, - "loss": 0.30863112211227417, - "mean_token_accuracy": 0.8858665823936462, - "num_tokens": 27929892.0, - "step": 3122 - }, - { - "epoch": 2.3731003039513676, - "grad_norm": 2.0553293228149414, - "learning_rate": 5.750525865564613e-07, - "loss": 0.28052228689193726, - "mean_token_accuracy": 0.8970555067062378, - "num_tokens": 27937532.0, - "step": 3123 - }, - { - "epoch": 2.3738601823708207, - "grad_norm": 1.4700267314910889, - "learning_rate": 5.737168930605272e-07, - "loss": 0.27994588017463684, - "mean_token_accuracy": 0.9026262760162354, - "num_tokens": 27948679.0, - "step": 3124 - }, - { - "epoch": 2.3746200607902734, - "grad_norm": 3.2083890438079834, - "learning_rate": 5.723825515042284e-07, - "loss": 0.1810106784105301, - "mean_token_accuracy": 0.9297720193862915, - "num_tokens": 27952090.0, - "step": 3125 - }, - { - "epoch": 2.3753799392097266, - "grad_norm": 1.4345086812973022, - "learning_rate": 5.710495628240567e-07, - "loss": 0.2929079830646515, - "mean_token_accuracy": 0.8950849771499634, - "num_tokens": 27964959.0, - "step": 3126 - }, - { - "epoch": 2.3761398176291793, - "grad_norm": 2.0222737789154053, - "learning_rate": 5.697179279555551e-07, - "loss": 0.41308528184890747, - "mean_token_accuracy": 0.8616737127304077, - "num_tokens": 27973803.0, - "step": 3127 - }, - { - "epoch": 2.3768996960486324, - "grad_norm": 1.2820483446121216, - "learning_rate": 5.683876478333161e-07, - "loss": 0.4069697856903076, - "mean_token_accuracy": 0.8547379970550537, - "num_tokens": 27991576.0, - "step": 3128 - }, - { - "epoch": 2.377659574468085, - "grad_norm": 2.3709049224853516, - "learning_rate": 5.670587233909819e-07, - "loss": 0.1923210471868515, - "mean_token_accuracy": 0.9360835552215576, - "num_tokens": 27997051.0, - "step": 3129 - }, - { - "epoch": 2.378419452887538, - "grad_norm": 1.874002456665039, - "learning_rate": 5.657311555612433e-07, - "loss": 0.431087851524353, - "mean_token_accuracy": 0.8736472129821777, - "num_tokens": 28004863.0, - "step": 3130 - }, - { - "epoch": 2.379179331306991, - "grad_norm": 1.0792341232299805, - "learning_rate": 5.64404945275836e-07, - "loss": 0.38039785623550415, - "mean_token_accuracy": 0.8523920178413391, - "num_tokens": 28027220.0, - "step": 3131 - }, - { - "epoch": 2.3799392097264436, - "grad_norm": 1.7947046756744385, - "learning_rate": 5.630800934655481e-07, - "loss": 0.29587826132774353, - "mean_token_accuracy": 0.8919603824615479, - "num_tokens": 28035495.0, - "step": 3132 - }, - { - "epoch": 2.3806990881458967, - "grad_norm": 3.4972469806671143, - "learning_rate": 5.617566010602113e-07, - "loss": 0.31223949790000916, - "mean_token_accuracy": 0.895270586013794, - "num_tokens": 28039135.0, - "step": 3133 - }, - { - "epoch": 2.3814589665653494, - "grad_norm": 2.331387758255005, - "learning_rate": 5.60434468988702e-07, - "loss": 0.30856233835220337, - "mean_token_accuracy": 0.8810996413230896, - "num_tokens": 28045572.0, - "step": 3134 - }, - { - "epoch": 2.3822188449848025, - "grad_norm": 1.9918609857559204, - "learning_rate": 5.591136981789439e-07, - "loss": 0.3031975328922272, - "mean_token_accuracy": 0.9028782844543457, - "num_tokens": 28051851.0, - "step": 3135 - }, - { - "epoch": 2.382978723404255, - "grad_norm": 1.6089690923690796, - "learning_rate": 5.577942895579064e-07, - "loss": 0.34390494227409363, - "mean_token_accuracy": 0.8744557499885559, - "num_tokens": 28062705.0, - "step": 3136 - }, - { - "epoch": 2.3837386018237083, - "grad_norm": 1.4829623699188232, - "learning_rate": 5.564762440515994e-07, - "loss": 0.3172723650932312, - "mean_token_accuracy": 0.9192344546318054, - "num_tokens": 28073539.0, - "step": 3137 - }, - { - "epoch": 2.384498480243161, - "grad_norm": 1.4833530187606812, - "learning_rate": 5.551595625850786e-07, - "loss": 0.3714778423309326, - "mean_token_accuracy": 0.8697570562362671, - "num_tokens": 28085949.0, - "step": 3138 - }, - { - "epoch": 2.385258358662614, - "grad_norm": 3.140885829925537, - "learning_rate": 5.538442460824417e-07, - "loss": 0.3266214430332184, - "mean_token_accuracy": 0.9124236702919006, - "num_tokens": 28090639.0, - "step": 3139 - }, - { - "epoch": 2.386018237082067, - "grad_norm": 1.731658935546875, - "learning_rate": 5.525302954668285e-07, - "loss": 0.21903038024902344, - "mean_token_accuracy": 0.9181338548660278, - "num_tokens": 28099076.0, - "step": 3140 - }, - { - "epoch": 2.38677811550152, - "grad_norm": 1.2315683364868164, - "learning_rate": 5.5121771166042e-07, - "loss": 0.25057584047317505, - "mean_token_accuracy": 0.9130429029464722, - "num_tokens": 28113532.0, - "step": 3141 - }, - { - "epoch": 2.3875379939209727, - "grad_norm": 3.888575553894043, - "learning_rate": 5.499064955844383e-07, - "loss": 0.173577219247818, - "mean_token_accuracy": 0.9388964176177979, - "num_tokens": 28116683.0, - "step": 3142 - }, - { - "epoch": 2.3882978723404253, - "grad_norm": 1.4791816473007202, - "learning_rate": 5.48596648159145e-07, - "loss": 0.38739481568336487, - "mean_token_accuracy": 0.9086727499961853, - "num_tokens": 28129363.0, - "step": 3143 - }, - { - "epoch": 2.3890577507598785, - "grad_norm": 2.1314213275909424, - "learning_rate": 5.472881703038418e-07, - "loss": 0.3724244236946106, - "mean_token_accuracy": 0.8749525547027588, - "num_tokens": 28136421.0, - "step": 3144 - }, - { - "epoch": 2.389817629179331, - "grad_norm": 2.4120898246765137, - "learning_rate": 5.459810629368692e-07, - "loss": 0.36195144057273865, - "mean_token_accuracy": 0.869860053062439, - "num_tokens": 28143903.0, - "step": 3145 - }, - { - "epoch": 2.3905775075987843, - "grad_norm": 1.7327654361724854, - "learning_rate": 5.446753269756036e-07, - "loss": 0.3846886157989502, - "mean_token_accuracy": 0.859398603439331, - "num_tokens": 28155403.0, - "step": 3146 - }, - { - "epoch": 2.391337386018237, - "grad_norm": 1.2435929775238037, - "learning_rate": 5.433709633364637e-07, - "loss": 0.36000579595565796, - "mean_token_accuracy": 0.8722110986709595, - "num_tokens": 28171739.0, - "step": 3147 - }, - { - "epoch": 2.39209726443769, - "grad_norm": 1.746272325515747, - "learning_rate": 5.420679729348993e-07, - "loss": 0.36778098344802856, - "mean_token_accuracy": 0.8639050722122192, - "num_tokens": 28182326.0, - "step": 3148 - }, - { - "epoch": 2.392857142857143, - "grad_norm": 2.0103561878204346, - "learning_rate": 5.407663566854008e-07, - "loss": 0.3921544551849365, - "mean_token_accuracy": 0.8679144382476807, - "num_tokens": 28191456.0, - "step": 3149 - }, - { - "epoch": 2.393617021276596, - "grad_norm": 1.792054533958435, - "learning_rate": 5.394661155014921e-07, - "loss": 0.4300078749656677, - "mean_token_accuracy": 0.8496290445327759, - "num_tokens": 28201943.0, - "step": 3150 - }, - { - "epoch": 2.3943768996960486, - "grad_norm": 1.1109238862991333, - "learning_rate": 5.381672502957324e-07, - "loss": 0.3262210190296173, - "mean_token_accuracy": 0.8634629845619202, - "num_tokens": 28221353.0, - "step": 3151 - }, - { - "epoch": 2.3951367781155017, - "grad_norm": 1.855241060256958, - "learning_rate": 5.368697619797159e-07, - "loss": 0.3076592981815338, - "mean_token_accuracy": 0.9093140959739685, - "num_tokens": 28229172.0, - "step": 3152 - }, - { - "epoch": 2.3958966565349544, - "grad_norm": 2.416808605194092, - "learning_rate": 5.355736514640697e-07, - "loss": 0.27811431884765625, - "mean_token_accuracy": 0.9024926424026489, - "num_tokens": 28234877.0, - "step": 3153 - }, - { - "epoch": 2.396656534954407, - "grad_norm": 1.6434770822525024, - "learning_rate": 5.342789196584527e-07, - "loss": 0.43254753947257996, - "mean_token_accuracy": 0.8404601812362671, - "num_tokens": 28245905.0, - "step": 3154 - }, - { - "epoch": 2.3974164133738602, - "grad_norm": 2.4053826332092285, - "learning_rate": 5.329855674715592e-07, - "loss": 0.3984904885292053, - "mean_token_accuracy": 0.8764510154724121, - "num_tokens": 28251558.0, - "step": 3155 - }, - { - "epoch": 2.398176291793313, - "grad_norm": 1.60322904586792, - "learning_rate": 5.316935958111139e-07, - "loss": 0.34025734663009644, - "mean_token_accuracy": 0.8753441572189331, - "num_tokens": 28261596.0, - "step": 3156 - }, - { - "epoch": 2.398936170212766, - "grad_norm": 1.5645020008087158, - "learning_rate": 5.304030055838704e-07, - "loss": 0.35805732011795044, - "mean_token_accuracy": 0.8628225922584534, - "num_tokens": 28272233.0, - "step": 3157 - }, - { - "epoch": 2.3996960486322187, - "grad_norm": 2.0708835124969482, - "learning_rate": 5.291137976956148e-07, - "loss": 0.35056009888648987, - "mean_token_accuracy": 0.8771238923072815, - "num_tokens": 28279905.0, - "step": 3158 - }, - { - "epoch": 2.400455927051672, - "grad_norm": 1.9882023334503174, - "learning_rate": 5.278259730511651e-07, - "loss": 0.30454230308532715, - "mean_token_accuracy": 0.883628249168396, - "num_tokens": 28287183.0, - "step": 3159 - }, - { - "epoch": 2.4012158054711246, - "grad_norm": 2.3435161113739014, - "learning_rate": 5.26539532554364e-07, - "loss": 0.262816458940506, - "mean_token_accuracy": 0.8924182653427124, - "num_tokens": 28293816.0, - "step": 3160 - }, - { - "epoch": 2.4019756838905777, - "grad_norm": 1.5700311660766602, - "learning_rate": 5.252544771080853e-07, - "loss": 0.43474194407463074, - "mean_token_accuracy": 0.8594561815261841, - "num_tokens": 28306346.0, - "step": 3161 - }, - { - "epoch": 2.4027355623100304, - "grad_norm": 1.8969467878341675, - "learning_rate": 5.239708076142311e-07, - "loss": 0.309972882270813, - "mean_token_accuracy": 0.8846274614334106, - "num_tokens": 28314843.0, - "step": 3162 - }, - { - "epoch": 2.4034954407294835, - "grad_norm": 2.2149617671966553, - "learning_rate": 5.226885249737292e-07, - "loss": 0.40023672580718994, - "mean_token_accuracy": 0.8641965389251709, - "num_tokens": 28322124.0, - "step": 3163 - }, - { - "epoch": 2.404255319148936, - "grad_norm": 1.3280621767044067, - "learning_rate": 5.214076300865359e-07, - "loss": 0.31123194098472595, - "mean_token_accuracy": 0.8883715271949768, - "num_tokens": 28336490.0, - "step": 3164 - }, - { - "epoch": 2.405015197568389, - "grad_norm": 1.402884602546692, - "learning_rate": 5.201281238516318e-07, - "loss": 0.2590488791465759, - "mean_token_accuracy": 0.9011414051055908, - "num_tokens": 28349094.0, - "step": 3165 - }, - { - "epoch": 2.405775075987842, - "grad_norm": 1.6564174890518188, - "learning_rate": 5.188500071670235e-07, - "loss": 0.23672837018966675, - "mean_token_accuracy": 0.9133221507072449, - "num_tokens": 28357665.0, - "step": 3166 - }, - { - "epoch": 2.4065349544072947, - "grad_norm": 1.9133414030075073, - "learning_rate": 5.175732809297435e-07, - "loss": 0.40488386154174805, - "mean_token_accuracy": 0.8534098863601685, - "num_tokens": 28366519.0, - "step": 3167 - }, - { - "epoch": 2.407294832826748, - "grad_norm": 1.447898268699646, - "learning_rate": 5.16297946035847e-07, - "loss": 0.3679184913635254, - "mean_token_accuracy": 0.8696858882904053, - "num_tokens": 28379315.0, - "step": 3168 - }, - { - "epoch": 2.4080547112462005, - "grad_norm": 3.454120397567749, - "learning_rate": 5.150240033804116e-07, - "loss": 0.23210272192955017, - "mean_token_accuracy": 0.9179670214653015, - "num_tokens": 28382844.0, - "step": 3169 - }, - { - "epoch": 2.4088145896656536, - "grad_norm": 1.7603836059570312, - "learning_rate": 5.137514538575419e-07, - "loss": 0.4491140842437744, - "mean_token_accuracy": 0.8472066521644592, - "num_tokens": 28394064.0, - "step": 3170 - }, - { - "epoch": 2.4095744680851063, - "grad_norm": 1.3338149785995483, - "learning_rate": 5.124802983603602e-07, - "loss": 0.3237353563308716, - "mean_token_accuracy": 0.8897873163223267, - "num_tokens": 28410190.0, - "step": 3171 - }, - { - "epoch": 2.410334346504559, - "grad_norm": 2.6191205978393555, - "learning_rate": 5.112105377810128e-07, - "loss": 0.3119543194770813, - "mean_token_accuracy": 0.889589250087738, - "num_tokens": 28414838.0, - "step": 3172 - }, - { - "epoch": 2.411094224924012, - "grad_norm": 2.583130121231079, - "learning_rate": 5.099421730106669e-07, - "loss": 0.2616881728172302, - "mean_token_accuracy": 0.9155426621437073, - "num_tokens": 28419792.0, - "step": 3173 - }, - { - "epoch": 2.4118541033434653, - "grad_norm": 2.875683307647705, - "learning_rate": 5.086752049395094e-07, - "loss": 0.2567689120769501, - "mean_token_accuracy": 0.9075877666473389, - "num_tokens": 28424069.0, - "step": 3174 - }, - { - "epoch": 2.412613981762918, - "grad_norm": 1.695042371749878, - "learning_rate": 5.074096344567475e-07, - "loss": 0.3164510130882263, - "mean_token_accuracy": 0.8845095634460449, - "num_tokens": 28433279.0, - "step": 3175 - }, - { - "epoch": 2.4133738601823707, - "grad_norm": 2.110863447189331, - "learning_rate": 5.061454624506074e-07, - "loss": 0.22680208086967468, - "mean_token_accuracy": 0.9221781492233276, - "num_tokens": 28439569.0, - "step": 3176 - }, - { - "epoch": 2.414133738601824, - "grad_norm": 2.030958890914917, - "learning_rate": 5.048826898083331e-07, - "loss": 0.3482169210910797, - "mean_token_accuracy": 0.8853809833526611, - "num_tokens": 28447203.0, - "step": 3177 - }, - { - "epoch": 2.4148936170212765, - "grad_norm": 1.9921014308929443, - "learning_rate": 5.036213174161877e-07, - "loss": 0.29343554377555847, - "mean_token_accuracy": 0.893486499786377, - "num_tokens": 28454923.0, - "step": 3178 - }, - { - "epoch": 2.4156534954407296, - "grad_norm": 4.079009532928467, - "learning_rate": 5.023613461594512e-07, - "loss": 0.2569321095943451, - "mean_token_accuracy": 0.9205472469329834, - "num_tokens": 28458173.0, - "step": 3179 - }, - { - "epoch": 2.4164133738601823, - "grad_norm": 3.077458143234253, - "learning_rate": 5.01102776922418e-07, - "loss": 0.3203810453414917, - "mean_token_accuracy": 0.8863208293914795, - "num_tokens": 28462449.0, - "step": 3180 - }, - { - "epoch": 2.4171732522796354, - "grad_norm": 2.4658167362213135, - "learning_rate": 4.998456105884025e-07, - "loss": 0.33045345544815063, - "mean_token_accuracy": 0.8856333494186401, - "num_tokens": 28468051.0, - "step": 3181 - }, - { - "epoch": 2.417933130699088, - "grad_norm": 2.053370714187622, - "learning_rate": 4.985898480397322e-07, - "loss": 0.2415514886379242, - "mean_token_accuracy": 0.9296282529830933, - "num_tokens": 28473839.0, - "step": 3182 - }, - { - "epoch": 2.418693009118541, - "grad_norm": 2.705026149749756, - "learning_rate": 4.973354901577487e-07, - "loss": 0.3233085870742798, - "mean_token_accuracy": 0.8820867538452148, - "num_tokens": 28479419.0, - "step": 3183 - }, - { - "epoch": 2.419452887537994, - "grad_norm": 2.1648733615875244, - "learning_rate": 4.960825378228082e-07, - "loss": 0.25225499272346497, - "mean_token_accuracy": 0.9170141220092773, - "num_tokens": 28484968.0, - "step": 3184 - }, - { - "epoch": 2.420212765957447, - "grad_norm": 1.8317075967788696, - "learning_rate": 4.948309919142832e-07, - "loss": 0.3143184781074524, - "mean_token_accuracy": 0.8824752569198608, - "num_tokens": 28492904.0, - "step": 3185 - }, - { - "epoch": 2.4209726443768997, - "grad_norm": 2.591052770614624, - "learning_rate": 4.935808533105546e-07, - "loss": 0.31191521883010864, - "mean_token_accuracy": 0.9041938185691833, - "num_tokens": 28498136.0, - "step": 3186 - }, - { - "epoch": 2.4217325227963524, - "grad_norm": 2.200559139251709, - "learning_rate": 4.923321228890184e-07, - "loss": 0.23661679029464722, - "mean_token_accuracy": 0.9179906845092773, - "num_tokens": 28504246.0, - "step": 3187 - }, - { - "epoch": 2.4224924012158056, - "grad_norm": 1.6311591863632202, - "learning_rate": 4.910848015260822e-07, - "loss": 0.35421687364578247, - "mean_token_accuracy": 0.8728591799736023, - "num_tokens": 28515481.0, - "step": 3188 - }, - { - "epoch": 2.4232522796352582, - "grad_norm": 2.1564102172851562, - "learning_rate": 4.898388900971635e-07, - "loss": 0.30809515714645386, - "mean_token_accuracy": 0.8960262537002563, - "num_tokens": 28521294.0, - "step": 3189 - }, - { - "epoch": 2.4240121580547114, - "grad_norm": 2.1413958072662354, - "learning_rate": 4.885943894766909e-07, - "loss": 0.217842698097229, - "mean_token_accuracy": 0.9408326745033264, - "num_tokens": 28527104.0, - "step": 3190 - }, - { - "epoch": 2.424772036474164, - "grad_norm": 2.373764991760254, - "learning_rate": 4.873513005381042e-07, - "loss": 0.33814239501953125, - "mean_token_accuracy": 0.9007177352905273, - "num_tokens": 28533654.0, - "step": 3191 - }, - { - "epoch": 2.425531914893617, - "grad_norm": 1.8809123039245605, - "learning_rate": 4.861096241538483e-07, - "loss": 0.4467903971672058, - "mean_token_accuracy": 0.8424190282821655, - "num_tokens": 28543667.0, - "step": 3192 - }, - { - "epoch": 2.42629179331307, - "grad_norm": 1.4945175647735596, - "learning_rate": 4.848693611953825e-07, - "loss": 0.3123834729194641, - "mean_token_accuracy": 0.9072253704071045, - "num_tokens": 28554944.0, - "step": 3193 - }, - { - "epoch": 2.4270516717325226, - "grad_norm": 1.8136200904846191, - "learning_rate": 4.836305125331695e-07, - "loss": 0.27221372723579407, - "mean_token_accuracy": 0.9039586782455444, - "num_tokens": 28563082.0, - "step": 3194 - }, - { - "epoch": 2.4278115501519757, - "grad_norm": 4.269916534423828, - "learning_rate": 4.823930790366801e-07, - "loss": 0.2660295069217682, - "mean_token_accuracy": 0.9072147607803345, - "num_tokens": 28566246.0, - "step": 3195 - }, - { - "epoch": 2.4285714285714284, - "grad_norm": 2.273453950881958, - "learning_rate": 4.811570615743952e-07, - "loss": 0.27304959297180176, - "mean_token_accuracy": 0.9012454748153687, - "num_tokens": 28572906.0, - "step": 3196 - }, - { - "epoch": 2.4293313069908815, - "grad_norm": 2.0931496620178223, - "learning_rate": 4.799224610137975e-07, - "loss": 0.2358006238937378, - "mean_token_accuracy": 0.9263845682144165, - "num_tokens": 28579148.0, - "step": 3197 - }, - { - "epoch": 2.430091185410334, - "grad_norm": 1.885201334953308, - "learning_rate": 4.786892782213781e-07, - "loss": 0.346000611782074, - "mean_token_accuracy": 0.8616824150085449, - "num_tokens": 28587361.0, - "step": 3198 - }, - { - "epoch": 2.4308510638297873, - "grad_norm": 2.229367971420288, - "learning_rate": 4.774575140626317e-07, - "loss": 0.2951638400554657, - "mean_token_accuracy": 0.8938933610916138, - "num_tokens": 28593622.0, - "step": 3199 - }, - { - "epoch": 2.43161094224924, - "grad_norm": 2.681004762649536, - "learning_rate": 4.7622716940205787e-07, - "loss": 0.2588275671005249, - "mean_token_accuracy": 0.9101524353027344, - "num_tokens": 28597890.0, - "step": 3200 - }, - { - "epoch": 2.432370820668693, - "grad_norm": 1.8040683269500732, - "learning_rate": 4.7499824510316013e-07, - "loss": 0.3194184899330139, - "mean_token_accuracy": 0.906498908996582, - "num_tokens": 28606885.0, - "step": 3201 - }, - { - "epoch": 2.433130699088146, - "grad_norm": 3.4185421466827393, - "learning_rate": 4.7377074202844514e-07, - "loss": 0.4457589387893677, - "mean_token_accuracy": 0.8387380838394165, - "num_tokens": 28611709.0, - "step": 3202 - }, - { - "epoch": 2.433890577507599, - "grad_norm": 2.6594674587249756, - "learning_rate": 4.7254466103941995e-07, - "loss": 0.3260703384876251, - "mean_token_accuracy": 0.9050778150558472, - "num_tokens": 28616931.0, - "step": 3203 - }, - { - "epoch": 2.4346504559270516, - "grad_norm": 1.999886393547058, - "learning_rate": 4.713200029965978e-07, - "loss": 0.2933492660522461, - "mean_token_accuracy": 0.900344729423523, - "num_tokens": 28624034.0, - "step": 3204 - }, - { - "epoch": 2.4354103343465043, - "grad_norm": 1.932508111000061, - "learning_rate": 4.700967687594901e-07, - "loss": 0.29114463925361633, - "mean_token_accuracy": 0.9247289896011353, - "num_tokens": 28633528.0, - "step": 3205 - }, - { - "epoch": 2.4361702127659575, - "grad_norm": 4.55303430557251, - "learning_rate": 4.68874959186609e-07, - "loss": 0.2936939597129822, - "mean_token_accuracy": 0.9006574153900146, - "num_tokens": 28636289.0, - "step": 3206 - }, - { - "epoch": 2.43693009118541, - "grad_norm": 2.02156400680542, - "learning_rate": 4.6765457513546747e-07, - "loss": 0.3098263740539551, - "mean_token_accuracy": 0.8965007066726685, - "num_tokens": 28643491.0, - "step": 3207 - }, - { - "epoch": 2.4376899696048633, - "grad_norm": 2.018125295639038, - "learning_rate": 4.664356174625795e-07, - "loss": 0.4749948978424072, - "mean_token_accuracy": 0.8366118669509888, - "num_tokens": 28654136.0, - "step": 3208 - }, - { - "epoch": 2.438449848024316, - "grad_norm": 2.0175318717956543, - "learning_rate": 4.6521808702345516e-07, - "loss": 0.31277763843536377, - "mean_token_accuracy": 0.8878506422042847, - "num_tokens": 28662363.0, - "step": 3209 - }, - { - "epoch": 2.439209726443769, - "grad_norm": 1.37982177734375, - "learning_rate": 4.640019846726043e-07, - "loss": 0.3872165083885193, - "mean_token_accuracy": 0.8586703538894653, - "num_tokens": 28681637.0, - "step": 3210 - }, - { - "epoch": 2.439969604863222, - "grad_norm": 2.265124559402466, - "learning_rate": 4.6278731126353447e-07, - "loss": 0.20262989401817322, - "mean_token_accuracy": 0.9290857315063477, - "num_tokens": 28687032.0, - "step": 3211 - }, - { - "epoch": 2.440729483282675, - "grad_norm": 1.730516791343689, - "learning_rate": 4.615740676487507e-07, - "loss": 0.21819885075092316, - "mean_token_accuracy": 0.9351010322570801, - "num_tokens": 28694692.0, - "step": 3212 - }, - { - "epoch": 2.4414893617021276, - "grad_norm": 2.297240972518921, - "learning_rate": 4.603622546797534e-07, - "loss": 0.34703850746154785, - "mean_token_accuracy": 0.8764227628707886, - "num_tokens": 28700838.0, - "step": 3213 - }, - { - "epoch": 2.4422492401215807, - "grad_norm": 1.3174461126327515, - "learning_rate": 4.591518732070402e-07, - "loss": 0.27869975566864014, - "mean_token_accuracy": 0.8975766897201538, - "num_tokens": 28715114.0, - "step": 3214 - }, - { - "epoch": 2.4430091185410334, - "grad_norm": 1.8751143217086792, - "learning_rate": 4.5794292408010285e-07, - "loss": 0.4260019361972809, - "mean_token_accuracy": 0.8564238548278809, - "num_tokens": 28724176.0, - "step": 3215 - }, - { - "epoch": 2.443768996960486, - "grad_norm": 2.095414161682129, - "learning_rate": 4.5673540814742875e-07, - "loss": 0.2791098952293396, - "mean_token_accuracy": 0.896371603012085, - "num_tokens": 28730815.0, - "step": 3216 - }, - { - "epoch": 2.4445288753799392, - "grad_norm": 1.470991611480713, - "learning_rate": 4.555293262564994e-07, - "loss": 0.3128473162651062, - "mean_token_accuracy": 0.8857797980308533, - "num_tokens": 28743271.0, - "step": 3217 - }, - { - "epoch": 2.445288753799392, - "grad_norm": 1.8783953189849854, - "learning_rate": 4.5432467925378784e-07, - "loss": 0.24838949739933014, - "mean_token_accuracy": 0.9119431972503662, - "num_tokens": 28751291.0, - "step": 3218 - }, - { - "epoch": 2.446048632218845, - "grad_norm": 2.134469747543335, - "learning_rate": 4.53121467984764e-07, - "loss": 0.390994668006897, - "mean_token_accuracy": 0.8823093175888062, - "num_tokens": 28758262.0, - "step": 3219 - }, - { - "epoch": 2.4468085106382977, - "grad_norm": 1.369758129119873, - "learning_rate": 4.5191969329388627e-07, - "loss": 0.33717092871665955, - "mean_token_accuracy": 0.8909138441085815, - "num_tokens": 28770330.0, - "step": 3220 - }, - { - "epoch": 2.447568389057751, - "grad_norm": 1.3363337516784668, - "learning_rate": 4.5071935602460704e-07, - "loss": 0.41521191596984863, - "mean_token_accuracy": 0.8482609987258911, - "num_tokens": 28788148.0, - "step": 3221 - }, - { - "epoch": 2.4483282674772036, - "grad_norm": 1.5309195518493652, - "learning_rate": 4.495204570193687e-07, - "loss": 0.23737329244613647, - "mean_token_accuracy": 0.9094061851501465, - "num_tokens": 28798150.0, - "step": 3222 - }, - { - "epoch": 2.4490881458966567, - "grad_norm": 1.5956720113754272, - "learning_rate": 4.483229971196054e-07, - "loss": 0.24943354725837708, - "mean_token_accuracy": 0.9051728248596191, - "num_tokens": 28808116.0, - "step": 3223 - }, - { - "epoch": 2.4498480243161094, - "grad_norm": 1.515918254852295, - "learning_rate": 4.4712697716573994e-07, - "loss": 0.3883020281791687, - "mean_token_accuracy": 0.8599046468734741, - "num_tokens": 28822835.0, - "step": 3224 - }, - { - "epoch": 2.4506079027355625, - "grad_norm": 1.3584989309310913, - "learning_rate": 4.4593239799718636e-07, - "loss": 0.33565959334373474, - "mean_token_accuracy": 0.8725172281265259, - "num_tokens": 28841697.0, - "step": 3225 - }, - { - "epoch": 2.451367781155015, - "grad_norm": 2.800762414932251, - "learning_rate": 4.447392604523443e-07, - "loss": 0.36243852972984314, - "mean_token_accuracy": 0.8881685733795166, - "num_tokens": 28847164.0, - "step": 3226 - }, - { - "epoch": 2.452127659574468, - "grad_norm": 1.3506053686141968, - "learning_rate": 4.43547565368605e-07, - "loss": 0.21717754006385803, - "mean_token_accuracy": 0.9296318292617798, - "num_tokens": 28858658.0, - "step": 3227 - }, - { - "epoch": 2.452887537993921, - "grad_norm": 2.094951868057251, - "learning_rate": 4.423573135823464e-07, - "loss": 0.3554617762565613, - "mean_token_accuracy": 0.8762428760528564, - "num_tokens": 28866509.0, - "step": 3228 - }, - { - "epoch": 2.4536474164133737, - "grad_norm": 1.4730854034423828, - "learning_rate": 4.411685059289314e-07, - "loss": 0.2805292010307312, - "mean_token_accuracy": 0.9004697799682617, - "num_tokens": 28878151.0, - "step": 3229 - }, - { - "epoch": 2.454407294832827, - "grad_norm": 2.1443302631378174, - "learning_rate": 4.399811432427123e-07, - "loss": 0.3829796314239502, - "mean_token_accuracy": 0.866457462310791, - "num_tokens": 28886050.0, - "step": 3230 - }, - { - "epoch": 2.4551671732522795, - "grad_norm": 3.437201738357544, - "learning_rate": 4.387952263570261e-07, - "loss": 0.18470892310142517, - "mean_token_accuracy": 0.9365379810333252, - "num_tokens": 28889484.0, - "step": 3231 - }, - { - "epoch": 2.4559270516717326, - "grad_norm": 2.6203434467315674, - "learning_rate": 4.376107561041937e-07, - "loss": 0.25328633189201355, - "mean_token_accuracy": 0.921377956867218, - "num_tokens": 28893972.0, - "step": 3232 - }, - { - "epoch": 2.4566869300911853, - "grad_norm": 2.4467883110046387, - "learning_rate": 4.3642773331552203e-07, - "loss": 0.2748469412326813, - "mean_token_accuracy": 0.9046314358711243, - "num_tokens": 28899118.0, - "step": 3233 - }, - { - "epoch": 2.4574468085106385, - "grad_norm": 1.9845495223999023, - "learning_rate": 4.352461588213036e-07, - "loss": 0.443121075630188, - "mean_token_accuracy": 0.8609750866889954, - "num_tokens": 28909005.0, - "step": 3234 - }, - { - "epoch": 2.458206686930091, - "grad_norm": 2.8748083114624023, - "learning_rate": 4.340660334508115e-07, - "loss": 0.22461901605129242, - "mean_token_accuracy": 0.916649341583252, - "num_tokens": 28913366.0, - "step": 3235 - }, - { - "epoch": 2.4589665653495443, - "grad_norm": 1.7406567335128784, - "learning_rate": 4.328873580323034e-07, - "loss": 0.4147683382034302, - "mean_token_accuracy": 0.8523626327514648, - "num_tokens": 28924695.0, - "step": 3236 - }, - { - "epoch": 2.459726443768997, - "grad_norm": 1.767052412033081, - "learning_rate": 4.3171013339301905e-07, - "loss": 0.38994747400283813, - "mean_token_accuracy": 0.863203227519989, - "num_tokens": 28935163.0, - "step": 3237 - }, - { - "epoch": 2.4604863221884496, - "grad_norm": 1.2257410287857056, - "learning_rate": 4.305343603591802e-07, - "loss": 0.23309440910816193, - "mean_token_accuracy": 0.9016385674476624, - "num_tokens": 28948374.0, - "step": 3238 - }, - { - "epoch": 2.461246200607903, - "grad_norm": 1.3017674684524536, - "learning_rate": 4.293600397559897e-07, - "loss": 0.2825638949871063, - "mean_token_accuracy": 0.8953868746757507, - "num_tokens": 28961695.0, - "step": 3239 - }, - { - "epoch": 2.4620060790273555, - "grad_norm": 1.475160837173462, - "learning_rate": 4.2818717240763115e-07, - "loss": 0.30598294734954834, - "mean_token_accuracy": 0.8774391412734985, - "num_tokens": 28976399.0, - "step": 3240 - }, - { - "epoch": 2.4627659574468086, - "grad_norm": 2.1078310012817383, - "learning_rate": 4.2701575913726644e-07, - "loss": 0.4696943759918213, - "mean_token_accuracy": 0.8566044569015503, - "num_tokens": 28985515.0, - "step": 3241 - }, - { - "epoch": 2.4635258358662613, - "grad_norm": 2.587887763977051, - "learning_rate": 4.258458007670413e-07, - "loss": 0.32537540793418884, - "mean_token_accuracy": 0.8889709711074829, - "num_tokens": 28990365.0, - "step": 3242 - }, - { - "epoch": 2.4642857142857144, - "grad_norm": 1.622995138168335, - "learning_rate": 4.2467729811807497e-07, - "loss": 0.47171884775161743, - "mean_token_accuracy": 0.8305673599243164, - "num_tokens": 29002644.0, - "step": 3243 - }, - { - "epoch": 2.465045592705167, - "grad_norm": 2.0702009201049805, - "learning_rate": 4.235102520104681e-07, - "loss": 0.45754289627075195, - "mean_token_accuracy": 0.8536194562911987, - "num_tokens": 29011325.0, - "step": 3244 - }, - { - "epoch": 2.4658054711246202, - "grad_norm": 1.4394203424453735, - "learning_rate": 4.2234466326330023e-07, - "loss": 0.36623480916023254, - "mean_token_accuracy": 0.8834698796272278, - "num_tokens": 29028440.0, - "step": 3245 - }, - { - "epoch": 2.466565349544073, - "grad_norm": 1.6777557134628296, - "learning_rate": 4.211805326946247e-07, - "loss": 0.19617480039596558, - "mean_token_accuracy": 0.920343279838562, - "num_tokens": 29035936.0, - "step": 3246 - }, - { - "epoch": 2.467325227963526, - "grad_norm": 1.7396641969680786, - "learning_rate": 4.200178611214736e-07, - "loss": 0.3978565037250519, - "mean_token_accuracy": 0.8532278537750244, - "num_tokens": 29046734.0, - "step": 3247 - }, - { - "epoch": 2.4680851063829787, - "grad_norm": 2.9263904094696045, - "learning_rate": 4.18856649359855e-07, - "loss": 0.24883142113685608, - "mean_token_accuracy": 0.9077831506729126, - "num_tokens": 29051879.0, - "step": 3248 - }, - { - "epoch": 2.4688449848024314, - "grad_norm": 2.637763500213623, - "learning_rate": 4.1769689822475147e-07, - "loss": 0.3370334506034851, - "mean_token_accuracy": 0.8828175067901611, - "num_tokens": 29057684.0, - "step": 3249 - }, - { - "epoch": 2.4696048632218845, - "grad_norm": 1.768539309501648, - "learning_rate": 4.165386085301212e-07, - "loss": 0.32484760880470276, - "mean_token_accuracy": 0.8829447031021118, - "num_tokens": 29066105.0, - "step": 3250 - }, - { - "epoch": 2.4703647416413372, - "grad_norm": 1.4335054159164429, - "learning_rate": 4.1538178108889717e-07, - "loss": 0.442533940076828, - "mean_token_accuracy": 0.846094012260437, - "num_tokens": 29082385.0, - "step": 3251 - }, - { - "epoch": 2.4711246200607904, - "grad_norm": 2.007174491882324, - "learning_rate": 4.1422641671298336e-07, - "loss": 0.2856018841266632, - "mean_token_accuracy": 0.9205893278121948, - "num_tokens": 29089022.0, - "step": 3252 - }, - { - "epoch": 2.471884498480243, - "grad_norm": 2.225895404815674, - "learning_rate": 4.1307251621326124e-07, - "loss": 0.17259414494037628, - "mean_token_accuracy": 0.9244140386581421, - "num_tokens": 29094176.0, - "step": 3253 - }, - { - "epoch": 2.472644376899696, - "grad_norm": 2.6121842861175537, - "learning_rate": 4.1192008039958236e-07, - "loss": 0.37352171540260315, - "mean_token_accuracy": 0.8913992643356323, - "num_tokens": 29099565.0, - "step": 3254 - }, - { - "epoch": 2.473404255319149, - "grad_norm": 1.5645455121994019, - "learning_rate": 4.1076911008076895e-07, - "loss": 0.37237828969955444, - "mean_token_accuracy": 0.879361629486084, - "num_tokens": 29112039.0, - "step": 3255 - }, - { - "epoch": 2.474164133738602, - "grad_norm": 3.144536018371582, - "learning_rate": 4.096196060646168e-07, - "loss": 0.2038595974445343, - "mean_token_accuracy": 0.9299201369285583, - "num_tokens": 29115720.0, - "step": 3256 - }, - { - "epoch": 2.4749240121580547, - "grad_norm": 2.679821014404297, - "learning_rate": 4.0847156915789385e-07, - "loss": 0.41715145111083984, - "mean_token_accuracy": 0.862784743309021, - "num_tokens": 29120944.0, - "step": 3257 - }, - { - "epoch": 2.475683890577508, - "grad_norm": 2.243694305419922, - "learning_rate": 4.073250001663345e-07, - "loss": 0.43414735794067383, - "mean_token_accuracy": 0.8504310250282288, - "num_tokens": 29128842.0, - "step": 3258 - }, - { - "epoch": 2.4764437689969605, - "grad_norm": 2.636111259460449, - "learning_rate": 4.061798998946459e-07, - "loss": 0.2401021122932434, - "mean_token_accuracy": 0.910351037979126, - "num_tokens": 29133769.0, - "step": 3259 - }, - { - "epoch": 2.477203647416413, - "grad_norm": 2.6116414070129395, - "learning_rate": 4.050362691465032e-07, - "loss": 0.3290833830833435, - "mean_token_accuracy": 0.8770763278007507, - "num_tokens": 29138639.0, - "step": 3260 - }, - { - "epoch": 2.4779635258358663, - "grad_norm": 2.279324531555176, - "learning_rate": 4.038941087245507e-07, - "loss": 0.38752615451812744, - "mean_token_accuracy": 0.8624980449676514, - "num_tokens": 29145966.0, - "step": 3261 - }, - { - "epoch": 2.478723404255319, - "grad_norm": 2.2280423641204834, - "learning_rate": 4.0275341943040057e-07, - "loss": 0.3724668025970459, - "mean_token_accuracy": 0.8737661838531494, - "num_tokens": 29152705.0, - "step": 3262 - }, - { - "epoch": 2.479483282674772, - "grad_norm": 2.030075788497925, - "learning_rate": 4.0161420206463243e-07, - "loss": 0.32603174448013306, - "mean_token_accuracy": 0.8819995522499084, - "num_tokens": 29159853.0, - "step": 3263 - }, - { - "epoch": 2.480243161094225, - "grad_norm": 2.048346519470215, - "learning_rate": 4.0047645742679275e-07, - "loss": 0.3046466112136841, - "mean_token_accuracy": 0.8898575305938721, - "num_tokens": 29167744.0, - "step": 3264 - }, - { - "epoch": 2.481003039513678, - "grad_norm": 2.8435800075531006, - "learning_rate": 3.9934018631539506e-07, - "loss": 0.3660475015640259, - "mean_token_accuracy": 0.8754674196243286, - "num_tokens": 29173234.0, - "step": 3265 - }, - { - "epoch": 2.4817629179331306, - "grad_norm": 1.7785491943359375, - "learning_rate": 3.982053895279173e-07, - "loss": 0.39483463764190674, - "mean_token_accuracy": 0.8613039255142212, - "num_tokens": 29182555.0, - "step": 3266 - }, - { - "epoch": 2.4825227963525838, - "grad_norm": 2.384479522705078, - "learning_rate": 3.970720678608034e-07, - "loss": 0.3536769151687622, - "mean_token_accuracy": 0.8700416088104248, - "num_tokens": 29189742.0, - "step": 3267 - }, - { - "epoch": 2.4832826747720365, - "grad_norm": 2.368417978286743, - "learning_rate": 3.9594022210946355e-07, - "loss": 0.2937469780445099, - "mean_token_accuracy": 0.8970743417739868, - "num_tokens": 29194960.0, - "step": 3268 - }, - { - "epoch": 2.4840425531914896, - "grad_norm": 2.3920481204986572, - "learning_rate": 3.948098530682695e-07, - "loss": 0.29564806818962097, - "mean_token_accuracy": 0.913650393486023, - "num_tokens": 29200704.0, - "step": 3269 - }, - { - "epoch": 2.4848024316109423, - "grad_norm": 1.1830788850784302, - "learning_rate": 3.9368096153055783e-07, - "loss": 0.39095211029052734, - "mean_token_accuracy": 0.8536444902420044, - "num_tokens": 29224862.0, - "step": 3270 - }, - { - "epoch": 2.485562310030395, - "grad_norm": 1.4365004301071167, - "learning_rate": 3.925535482886286e-07, - "loss": 0.27921199798583984, - "mean_token_accuracy": 0.8939366936683655, - "num_tokens": 29237375.0, - "step": 3271 - }, - { - "epoch": 2.486322188449848, - "grad_norm": 2.1955132484436035, - "learning_rate": 3.9142761413374336e-07, - "loss": 0.41748374700546265, - "mean_token_accuracy": 0.8621724843978882, - "num_tokens": 29244655.0, - "step": 3272 - }, - { - "epoch": 2.487082066869301, - "grad_norm": 2.4120712280273438, - "learning_rate": 3.90303159856126e-07, - "loss": 0.2881275415420532, - "mean_token_accuracy": 0.8962163329124451, - "num_tokens": 29250350.0, - "step": 3273 - }, - { - "epoch": 2.487841945288754, - "grad_norm": 1.315206527709961, - "learning_rate": 3.891801862449629e-07, - "loss": 0.33958539366722107, - "mean_token_accuracy": 0.8800086379051208, - "num_tokens": 29264563.0, - "step": 3274 - }, - { - "epoch": 2.4886018237082066, - "grad_norm": 1.9663656949996948, - "learning_rate": 3.880586940883979e-07, - "loss": 0.35844963788986206, - "mean_token_accuracy": 0.8683270215988159, - "num_tokens": 29273782.0, - "step": 3275 - }, - { - "epoch": 2.4893617021276597, - "grad_norm": 1.4438722133636475, - "learning_rate": 3.869386841735395e-07, - "loss": 0.39307960867881775, - "mean_token_accuracy": 0.8902837038040161, - "num_tokens": 29288914.0, - "step": 3276 - }, - { - "epoch": 2.4901215805471124, - "grad_norm": 2.779317617416382, - "learning_rate": 3.8582015728645366e-07, - "loss": 0.237838476896286, - "mean_token_accuracy": 0.9132705926895142, - "num_tokens": 29293218.0, - "step": 3277 - }, - { - "epoch": 2.4908814589665655, - "grad_norm": 1.6183768510818481, - "learning_rate": 3.8470311421216435e-07, - "loss": 0.24135810136795044, - "mean_token_accuracy": 0.9351533055305481, - "num_tokens": 29301928.0, - "step": 3278 - }, - { - "epoch": 2.4916413373860182, - "grad_norm": 1.6468756198883057, - "learning_rate": 3.835875557346552e-07, - "loss": 0.34042105078697205, - "mean_token_accuracy": 0.8920395374298096, - "num_tokens": 29313740.0, - "step": 3279 - }, - { - "epoch": 2.4924012158054714, - "grad_norm": 1.6257606744766235, - "learning_rate": 3.8247348263687035e-07, - "loss": 0.3479476869106293, - "mean_token_accuracy": 0.8826069831848145, - "num_tokens": 29323650.0, - "step": 3280 - }, - { - "epoch": 2.493161094224924, - "grad_norm": 1.7144103050231934, - "learning_rate": 3.81360895700707e-07, - "loss": 0.3905973434448242, - "mean_token_accuracy": 0.8974796533584595, - "num_tokens": 29333192.0, - "step": 3281 - }, - { - "epoch": 2.4939209726443767, - "grad_norm": 1.3381150960922241, - "learning_rate": 3.802497957070225e-07, - "loss": 0.31121304631233215, - "mean_token_accuracy": 0.8921661376953125, - "num_tokens": 29348219.0, - "step": 3282 - }, - { - "epoch": 2.49468085106383, - "grad_norm": 1.8036452531814575, - "learning_rate": 3.7914018343562896e-07, - "loss": 0.4264541268348694, - "mean_token_accuracy": 0.8469835519790649, - "num_tokens": 29359632.0, - "step": 3283 - }, - { - "epoch": 2.4954407294832825, - "grad_norm": 1.7335898876190186, - "learning_rate": 3.780320596652956e-07, - "loss": 0.2710324823856354, - "mean_token_accuracy": 0.9050130844116211, - "num_tokens": 29368771.0, - "step": 3284 - }, - { - "epoch": 2.4962006079027357, - "grad_norm": 1.5694719552993774, - "learning_rate": 3.7692542517374615e-07, - "loss": 0.3114343285560608, - "mean_token_accuracy": 0.8869681358337402, - "num_tokens": 29379694.0, - "step": 3285 - }, - { - "epoch": 2.4969604863221884, - "grad_norm": 2.042365074157715, - "learning_rate": 3.75820280737659e-07, - "loss": 0.23643970489501953, - "mean_token_accuracy": 0.9191685318946838, - "num_tokens": 29385914.0, - "step": 3286 - }, - { - "epoch": 2.4977203647416415, - "grad_norm": 2.2526986598968506, - "learning_rate": 3.7471662713266744e-07, - "loss": 0.3166671097278595, - "mean_token_accuracy": 0.901310384273529, - "num_tokens": 29392128.0, - "step": 3287 - }, - { - "epoch": 2.498480243161094, - "grad_norm": 1.474029541015625, - "learning_rate": 3.7361446513335816e-07, - "loss": 0.4021439552307129, - "mean_token_accuracy": 0.9001395106315613, - "num_tokens": 29404742.0, - "step": 3288 - }, - { - "epoch": 2.499240121580547, - "grad_norm": 1.3057628870010376, - "learning_rate": 3.725137955132707e-07, - "loss": 0.30949655175209045, - "mean_token_accuracy": 0.8990561962127686, - "num_tokens": 29421839.0, - "step": 3289 - }, - { - "epoch": 2.5, - "grad_norm": 1.61989164352417, - "learning_rate": 3.7141461904489665e-07, - "loss": 0.3134443163871765, - "mean_token_accuracy": 0.8906387090682983, - "num_tokens": 29432127.0, - "step": 3290 - }, - { - "epoch": 2.500759878419453, - "grad_norm": 1.5306038856506348, - "learning_rate": 3.70316936499682e-07, - "loss": 0.4017624855041504, - "mean_token_accuracy": 0.845695436000824, - "num_tokens": 29444397.0, - "step": 3291 - }, - { - "epoch": 2.501519756838906, - "grad_norm": 1.2971603870391846, - "learning_rate": 3.6922074864802095e-07, - "loss": 0.4591655135154724, - "mean_token_accuracy": 0.8666995763778687, - "num_tokens": 29461121.0, - "step": 3292 - }, - { - "epoch": 2.5022796352583585, - "grad_norm": 1.9822273254394531, - "learning_rate": 3.681260562592609e-07, - "loss": 0.3666776716709137, - "mean_token_accuracy": 0.8733338117599487, - "num_tokens": 29469211.0, - "step": 3293 - }, - { - "epoch": 2.5030395136778116, - "grad_norm": 2.331378936767578, - "learning_rate": 3.670328601016995e-07, - "loss": 0.3511161506175995, - "mean_token_accuracy": 0.8734879493713379, - "num_tokens": 29475473.0, - "step": 3294 - }, - { - "epoch": 2.5037993920972643, - "grad_norm": 1.2138792276382446, - "learning_rate": 3.659411609425834e-07, - "loss": 0.2819535732269287, - "mean_token_accuracy": 0.9210860729217529, - "num_tokens": 29492447.0, - "step": 3295 - }, - { - "epoch": 2.5045592705167175, - "grad_norm": 1.4580892324447632, - "learning_rate": 3.648509595481095e-07, - "loss": 0.37376853823661804, - "mean_token_accuracy": 0.868643045425415, - "num_tokens": 29506128.0, - "step": 3296 - }, - { - "epoch": 2.50531914893617, - "grad_norm": 2.3763513565063477, - "learning_rate": 3.6376225668342287e-07, - "loss": 0.3229329586029053, - "mean_token_accuracy": 0.8802589178085327, - "num_tokens": 29512500.0, - "step": 3297 - }, - { - "epoch": 2.5060790273556233, - "grad_norm": 1.7995069026947021, - "learning_rate": 3.626750531126169e-07, - "loss": 0.2303360551595688, - "mean_token_accuracy": 0.9212342500686646, - "num_tokens": 29518867.0, - "step": 3298 - }, - { - "epoch": 2.506838905775076, - "grad_norm": 2.4798812866210938, - "learning_rate": 3.615893495987335e-07, - "loss": 0.15825161337852478, - "mean_token_accuracy": 0.9465295076370239, - "num_tokens": 29523418.0, - "step": 3299 - }, - { - "epoch": 2.5075987841945286, - "grad_norm": 2.6747193336486816, - "learning_rate": 3.6050514690376124e-07, - "loss": 0.3672150671482086, - "mean_token_accuracy": 0.8869320154190063, - "num_tokens": 29534685.0, - "step": 3300 - }, - { - "epoch": 2.5083586626139818, - "grad_norm": 1.47441828250885, - "learning_rate": 3.594224457886336e-07, - "loss": 0.3551298975944519, - "mean_token_accuracy": 0.8751654624938965, - "num_tokens": 29546692.0, - "step": 3301 - }, - { - "epoch": 2.509118541033435, - "grad_norm": 2.2134389877319336, - "learning_rate": 3.5834124701323414e-07, - "loss": 0.39865267276763916, - "mean_token_accuracy": 0.8581235408782959, - "num_tokens": 29553889.0, - "step": 3302 - }, - { - "epoch": 2.5098784194528876, - "grad_norm": 1.9763301610946655, - "learning_rate": 3.5726155133638915e-07, - "loss": 0.29025325179100037, - "mean_token_accuracy": 0.8915338516235352, - "num_tokens": 29562429.0, - "step": 3303 - }, - { - "epoch": 2.5106382978723403, - "grad_norm": 2.347961187362671, - "learning_rate": 3.561833595158698e-07, - "loss": 0.33726242184638977, - "mean_token_accuracy": 0.8788525462150574, - "num_tokens": 29568696.0, - "step": 3304 - }, - { - "epoch": 2.5113981762917934, - "grad_norm": 1.7410497665405273, - "learning_rate": 3.5510667230839237e-07, - "loss": 0.3604505956172943, - "mean_token_accuracy": 0.8745309114456177, - "num_tokens": 29579020.0, - "step": 3305 - }, - { - "epoch": 2.512158054711246, - "grad_norm": 2.8427274227142334, - "learning_rate": 3.540314904696196e-07, - "loss": 0.16700688004493713, - "mean_token_accuracy": 0.9461087584495544, - "num_tokens": 29583216.0, - "step": 3306 - }, - { - "epoch": 2.512917933130699, - "grad_norm": 3.4459211826324463, - "learning_rate": 3.529578147541532e-07, - "loss": 0.20073774456977844, - "mean_token_accuracy": 0.9330953359603882, - "num_tokens": 29586393.0, - "step": 3307 - }, - { - "epoch": 2.513677811550152, - "grad_norm": 1.2530099153518677, - "learning_rate": 3.518856459155409e-07, - "loss": 0.3268885016441345, - "mean_token_accuracy": 0.8808276653289795, - "num_tokens": 29602387.0, - "step": 3308 - }, - { - "epoch": 2.514437689969605, - "grad_norm": 2.64876389503479, - "learning_rate": 3.508149847062725e-07, - "loss": 0.328682541847229, - "mean_token_accuracy": 0.8907853364944458, - "num_tokens": 29608298.0, - "step": 3309 - }, - { - "epoch": 2.5151975683890577, - "grad_norm": 2.3505539894104004, - "learning_rate": 3.4974583187777853e-07, - "loss": 0.3768400549888611, - "mean_token_accuracy": 0.8646256327629089, - "num_tokens": 29615035.0, - "step": 3310 - }, - { - "epoch": 2.5159574468085104, - "grad_norm": 3.298685073852539, - "learning_rate": 3.4867818818043217e-07, - "loss": 0.4103941023349762, - "mean_token_accuracy": 0.8660793304443359, - "num_tokens": 29619522.0, - "step": 3311 - }, - { - "epoch": 2.5167173252279635, - "grad_norm": 1.8788949251174927, - "learning_rate": 3.476120543635469e-07, - "loss": 0.39368999004364014, - "mean_token_accuracy": 0.861727237701416, - "num_tokens": 29628297.0, - "step": 3312 - }, - { - "epoch": 2.5174772036474167, - "grad_norm": 1.3355047702789307, - "learning_rate": 3.4654743117537525e-07, - "loss": 0.30587559938430786, - "mean_token_accuracy": 0.8944345116615295, - "num_tokens": 29643010.0, - "step": 3313 - }, - { - "epoch": 2.5182370820668694, - "grad_norm": 1.6371463537216187, - "learning_rate": 3.4548431936311275e-07, - "loss": 0.35551705956459045, - "mean_token_accuracy": 0.8975727558135986, - "num_tokens": 29654169.0, - "step": 3314 - }, - { - "epoch": 2.518996960486322, - "grad_norm": 1.8126708269119263, - "learning_rate": 3.4442271967289083e-07, - "loss": 0.40501973032951355, - "mean_token_accuracy": 0.872620701789856, - "num_tokens": 29665965.0, - "step": 3315 - }, - { - "epoch": 2.519756838905775, - "grad_norm": 2.9103341102600098, - "learning_rate": 3.433626328497805e-07, - "loss": 0.21716530621051788, - "mean_token_accuracy": 0.9180731773376465, - "num_tokens": 29670529.0, - "step": 3316 - }, - { - "epoch": 2.520516717325228, - "grad_norm": 1.3893235921859741, - "learning_rate": 3.4230405963779357e-07, - "loss": 0.2638336420059204, - "mean_token_accuracy": 0.9039981365203857, - "num_tokens": 29681585.0, - "step": 3317 - }, - { - "epoch": 2.521276595744681, - "grad_norm": 2.408050298690796, - "learning_rate": 3.412470007798757e-07, - "loss": 0.4774054288864136, - "mean_token_accuracy": 0.835527777671814, - "num_tokens": 29688642.0, - "step": 3318 - }, - { - "epoch": 2.5220364741641337, - "grad_norm": 2.923038959503174, - "learning_rate": 3.4019145701791186e-07, - "loss": 0.24404606223106384, - "mean_token_accuracy": 0.9276547431945801, - "num_tokens": 29692516.0, - "step": 3319 - }, - { - "epoch": 2.522796352583587, - "grad_norm": 3.470700740814209, - "learning_rate": 3.3913742909272353e-07, - "loss": 0.26732707023620605, - "mean_token_accuracy": 0.910873293876648, - "num_tokens": 29695779.0, - "step": 3320 - }, - { - "epoch": 2.5235562310030395, - "grad_norm": 2.2419376373291016, - "learning_rate": 3.3808491774406817e-07, - "loss": 0.16050264239311218, - "mean_token_accuracy": 0.934256911277771, - "num_tokens": 29701486.0, - "step": 3321 - }, - { - "epoch": 2.524316109422492, - "grad_norm": 2.3232672214508057, - "learning_rate": 3.370339237106385e-07, - "loss": 0.23050843179225922, - "mean_token_accuracy": 0.9202409982681274, - "num_tokens": 29706780.0, - "step": 3322 - }, - { - "epoch": 2.5250759878419453, - "grad_norm": 3.012422800064087, - "learning_rate": 3.359844477300633e-07, - "loss": 0.22087830305099487, - "mean_token_accuracy": 0.9293035268783569, - "num_tokens": 29711164.0, - "step": 3323 - }, - { - "epoch": 2.5258358662613984, - "grad_norm": 3.0274150371551514, - "learning_rate": 3.3493649053890325e-07, - "loss": 0.1908535212278366, - "mean_token_accuracy": 0.9202175140380859, - "num_tokens": 29714988.0, - "step": 3324 - }, - { - "epoch": 2.526595744680851, - "grad_norm": 1.9113285541534424, - "learning_rate": 3.3389005287265713e-07, - "loss": 0.3098488748073578, - "mean_token_accuracy": 0.8901765942573547, - "num_tokens": 29722665.0, - "step": 3325 - }, - { - "epoch": 2.527355623100304, - "grad_norm": 2.3841238021850586, - "learning_rate": 3.32845135465755e-07, - "loss": 0.25352805852890015, - "mean_token_accuracy": 0.9079523682594299, - "num_tokens": 29727646.0, - "step": 3326 - }, - { - "epoch": 2.528115501519757, - "grad_norm": 2.134140968322754, - "learning_rate": 3.3180173905155906e-07, - "loss": 0.24720364809036255, - "mean_token_accuracy": 0.9039219617843628, - "num_tokens": 29734233.0, - "step": 3327 - }, - { - "epoch": 2.5288753799392096, - "grad_norm": 1.9245797395706177, - "learning_rate": 3.3075986436236494e-07, - "loss": 0.2697824537754059, - "mean_token_accuracy": 0.9077266454696655, - "num_tokens": 29742107.0, - "step": 3328 - }, - { - "epoch": 2.5296352583586628, - "grad_norm": 2.5044164657592773, - "learning_rate": 3.297195121294022e-07, - "loss": 0.3145396411418915, - "mean_token_accuracy": 0.8834670782089233, - "num_tokens": 29747755.0, - "step": 3329 - }, - { - "epoch": 2.5303951367781155, - "grad_norm": 3.475567102432251, - "learning_rate": 3.286806830828285e-07, - "loss": 0.14926226437091827, - "mean_token_accuracy": 0.9487104415893555, - "num_tokens": 29750730.0, - "step": 3330 - }, - { - "epoch": 2.5311550151975686, - "grad_norm": 2.0287671089172363, - "learning_rate": 3.2764337795173433e-07, - "loss": 0.3795855641365051, - "mean_token_accuracy": 0.8685719966888428, - "num_tokens": 29758328.0, - "step": 3331 - }, - { - "epoch": 2.5319148936170213, - "grad_norm": 1.4884649515151978, - "learning_rate": 3.2660759746414055e-07, - "loss": 0.3048096299171448, - "mean_token_accuracy": 0.8908923268318176, - "num_tokens": 29770486.0, - "step": 3332 - }, - { - "epoch": 2.532674772036474, - "grad_norm": 2.0645828247070312, - "learning_rate": 3.255733423469978e-07, - "loss": 0.3477875590324402, - "mean_token_accuracy": 0.8803027868270874, - "num_tokens": 29778363.0, - "step": 3333 - }, - { - "epoch": 2.533434650455927, - "grad_norm": 2.032289981842041, - "learning_rate": 3.245406133261858e-07, - "loss": 0.39452236890792847, - "mean_token_accuracy": 0.8499241471290588, - "num_tokens": 29786353.0, - "step": 3334 - }, - { - "epoch": 2.53419452887538, - "grad_norm": 2.146658420562744, - "learning_rate": 3.235094111265141e-07, - "loss": 0.250872939825058, - "mean_token_accuracy": 0.9086864590644836, - "num_tokens": 29793122.0, - "step": 3335 - }, - { - "epoch": 2.534954407294833, - "grad_norm": 1.407880187034607, - "learning_rate": 3.224797364717197e-07, - "loss": 0.30364125967025757, - "mean_token_accuracy": 0.875752329826355, - "num_tokens": 29806866.0, - "step": 3336 - }, - { - "epoch": 2.5357142857142856, - "grad_norm": 2.6231658458709717, - "learning_rate": 3.214515900844681e-07, - "loss": 0.31516194343566895, - "mean_token_accuracy": 0.8799179792404175, - "num_tokens": 29813035.0, - "step": 3337 - }, - { - "epoch": 2.5364741641337387, - "grad_norm": 2.3876113891601562, - "learning_rate": 3.204249726863523e-07, - "loss": 0.3034508526325226, - "mean_token_accuracy": 0.8916938304901123, - "num_tokens": 29818810.0, - "step": 3338 - }, - { - "epoch": 2.5372340425531914, - "grad_norm": 2.16711163520813, - "learning_rate": 3.1939988499789075e-07, - "loss": 0.25329700112342834, - "mean_token_accuracy": 0.9260494112968445, - "num_tokens": 29825472.0, - "step": 3339 - }, - { - "epoch": 2.5379939209726445, - "grad_norm": 2.5136961936950684, - "learning_rate": 3.18376327738531e-07, - "loss": 0.3313722312450409, - "mean_token_accuracy": 0.8868670463562012, - "num_tokens": 29831426.0, - "step": 3340 - }, - { - "epoch": 2.538753799392097, - "grad_norm": 1.7886340618133545, - "learning_rate": 3.1735430162664366e-07, - "loss": 0.3526390492916107, - "mean_token_accuracy": 0.8689097762107849, - "num_tokens": 29840212.0, - "step": 3341 - }, - { - "epoch": 2.5395136778115504, - "grad_norm": 2.2471916675567627, - "learning_rate": 3.1633380737952663e-07, - "loss": 0.21594303846359253, - "mean_token_accuracy": 0.9280022382736206, - "num_tokens": 29845696.0, - "step": 3342 - }, - { - "epoch": 2.540273556231003, - "grad_norm": 1.1835771799087524, - "learning_rate": 3.15314845713402e-07, - "loss": 0.2646978497505188, - "mean_token_accuracy": 0.8992418050765991, - "num_tokens": 29861802.0, - "step": 3343 - }, - { - "epoch": 2.5410334346504557, - "grad_norm": 2.2009525299072266, - "learning_rate": 3.14297417343416e-07, - "loss": 0.4950712323188782, - "mean_token_accuracy": 0.8226115703582764, - "num_tokens": 29869931.0, - "step": 3344 - }, - { - "epoch": 2.541793313069909, - "grad_norm": 1.2517180442810059, - "learning_rate": 3.1328152298363943e-07, - "loss": 0.26179224252700806, - "mean_token_accuracy": 0.9045941829681396, - "num_tokens": 29883562.0, - "step": 3345 - }, - { - "epoch": 2.5425531914893615, - "grad_norm": 2.1705822944641113, - "learning_rate": 3.122671633470664e-07, - "loss": 0.38098567724227905, - "mean_token_accuracy": 0.8638834357261658, - "num_tokens": 29891094.0, - "step": 3346 - }, - { - "epoch": 2.5433130699088147, - "grad_norm": 1.5869110822677612, - "learning_rate": 3.1125433914561185e-07, - "loss": 0.36774593591690063, - "mean_token_accuracy": 0.8730655908584595, - "num_tokens": 29901795.0, - "step": 3347 - }, - { - "epoch": 2.5440729483282674, - "grad_norm": 1.267867922782898, - "learning_rate": 3.1024305109011664e-07, - "loss": 0.30716824531555176, - "mean_token_accuracy": 0.8794038891792297, - "num_tokens": 29918112.0, - "step": 3348 - }, - { - "epoch": 2.5448328267477205, - "grad_norm": 1.7851269245147705, - "learning_rate": 3.092332998903416e-07, - "loss": 0.3374805748462677, - "mean_token_accuracy": 0.8766556978225708, - "num_tokens": 29927770.0, - "step": 3349 - }, - { - "epoch": 2.545592705167173, - "grad_norm": 1.7153595685958862, - "learning_rate": 3.082250862549671e-07, - "loss": 0.4149400293827057, - "mean_token_accuracy": 0.853299617767334, - "num_tokens": 29939361.0, - "step": 3350 - }, - { - "epoch": 2.5463525835866263, - "grad_norm": 2.676774740219116, - "learning_rate": 3.0721841089159823e-07, - "loss": 0.2004309445619583, - "mean_token_accuracy": 0.9245458841323853, - "num_tokens": 29943717.0, - "step": 3351 - }, - { - "epoch": 2.547112462006079, - "grad_norm": 3.0472381114959717, - "learning_rate": 3.0621327450675806e-07, - "loss": 0.31185799837112427, - "mean_token_accuracy": 0.8936638832092285, - "num_tokens": 29948613.0, - "step": 3352 - }, - { - "epoch": 2.547872340425532, - "grad_norm": 3.141087055206299, - "learning_rate": 3.0520967780588966e-07, - "loss": 0.34619835019111633, - "mean_token_accuracy": 0.8754764199256897, - "num_tokens": 29952477.0, - "step": 3353 - }, - { - "epoch": 2.548632218844985, - "grad_norm": 1.277807593345642, - "learning_rate": 3.0420762149335566e-07, - "loss": 0.41385579109191895, - "mean_token_accuracy": 0.8646053075790405, - "num_tokens": 29972620.0, - "step": 3354 - }, - { - "epoch": 2.5493920972644375, - "grad_norm": 1.8656301498413086, - "learning_rate": 3.0320710627243815e-07, - "loss": 0.33177047967910767, - "mean_token_accuracy": 0.884863018989563, - "num_tokens": 29980861.0, - "step": 3355 - }, - { - "epoch": 2.5501519756838906, - "grad_norm": 1.5590285062789917, - "learning_rate": 3.022081328453372e-07, - "loss": 0.35837340354919434, - "mean_token_accuracy": 0.8669678568840027, - "num_tokens": 29992920.0, - "step": 3356 - }, - { - "epoch": 2.5509118541033433, - "grad_norm": 1.3580808639526367, - "learning_rate": 3.0121070191317075e-07, - "loss": 0.30251336097717285, - "mean_token_accuracy": 0.891779363155365, - "num_tokens": 30006416.0, - "step": 3357 - }, - { - "epoch": 2.5516717325227964, - "grad_norm": 1.2978777885437012, - "learning_rate": 3.002148141759739e-07, - "loss": 0.3747216463088989, - "mean_token_accuracy": 0.8675031661987305, - "num_tokens": 30026730.0, - "step": 3358 - }, - { - "epoch": 2.552431610942249, - "grad_norm": 2.1855390071868896, - "learning_rate": 2.992204703326995e-07, - "loss": 0.25247129797935486, - "mean_token_accuracy": 0.9170730113983154, - "num_tokens": 30032920.0, - "step": 3359 - }, - { - "epoch": 2.5531914893617023, - "grad_norm": 1.46858811378479, - "learning_rate": 2.9822767108121623e-07, - "loss": 0.45840656757354736, - "mean_token_accuracy": 0.8472789525985718, - "num_tokens": 30046347.0, - "step": 3360 - }, - { - "epoch": 2.553951367781155, - "grad_norm": 1.7625445127487183, - "learning_rate": 2.9723641711830896e-07, - "loss": 0.34696075320243835, - "mean_token_accuracy": 0.8730940222740173, - "num_tokens": 30057264.0, - "step": 3361 - }, - { - "epoch": 2.5547112462006076, - "grad_norm": 2.3647844791412354, - "learning_rate": 2.96246709139677e-07, - "loss": 0.3888760209083557, - "mean_token_accuracy": 0.8829300403594971, - "num_tokens": 30064199.0, - "step": 3362 - }, - { - "epoch": 2.5554711246200608, - "grad_norm": 1.3508832454681396, - "learning_rate": 2.9525854783993696e-07, - "loss": 0.2998582720756531, - "mean_token_accuracy": 0.8910796642303467, - "num_tokens": 30078083.0, - "step": 3363 - }, - { - "epoch": 2.556231003039514, - "grad_norm": 1.8688349723815918, - "learning_rate": 2.942719339126171e-07, - "loss": 0.23044756054878235, - "mean_token_accuracy": 0.9150751233100891, - "num_tokens": 30086010.0, - "step": 3364 - }, - { - "epoch": 2.5569908814589666, - "grad_norm": 2.7221083641052246, - "learning_rate": 2.932868680501613e-07, - "loss": 0.30724483728408813, - "mean_token_accuracy": 0.9012277126312256, - "num_tokens": 30091524.0, - "step": 3365 - }, - { - "epoch": 2.5577507598784193, - "grad_norm": 2.5149598121643066, - "learning_rate": 2.92303350943928e-07, - "loss": 0.37096866965293884, - "mean_token_accuracy": 0.8573155403137207, - "num_tokens": 30097860.0, - "step": 3366 - }, - { - "epoch": 2.5585106382978724, - "grad_norm": 2.9985098838806152, - "learning_rate": 2.913213832841857e-07, - "loss": 0.3397367596626282, - "mean_token_accuracy": 0.8724661469459534, - "num_tokens": 30107543.0, - "step": 3367 - }, - { - "epoch": 2.559270516717325, - "grad_norm": 2.119527816772461, - "learning_rate": 2.9034096576011805e-07, - "loss": 0.34516414999961853, - "mean_token_accuracy": 0.8728296756744385, - "num_tokens": 30114737.0, - "step": 3368 - }, - { - "epoch": 2.560030395136778, - "grad_norm": 2.6809260845184326, - "learning_rate": 2.893620990598192e-07, - "loss": 0.4649572968482971, - "mean_token_accuracy": 0.8441047668457031, - "num_tokens": 30120640.0, - "step": 3369 - }, - { - "epoch": 2.560790273556231, - "grad_norm": 1.634458065032959, - "learning_rate": 2.8838478387029605e-07, - "loss": 0.3435993194580078, - "mean_token_accuracy": 0.8726693987846375, - "num_tokens": 30133091.0, - "step": 3370 - }, - { - "epoch": 2.561550151975684, - "grad_norm": 1.7352157831192017, - "learning_rate": 2.8740902087746604e-07, - "loss": 0.3171056807041168, - "mean_token_accuracy": 0.8962107300758362, - "num_tokens": 30141735.0, - "step": 3371 - }, - { - "epoch": 2.5623100303951367, - "grad_norm": 2.8209640979766846, - "learning_rate": 2.8643481076615717e-07, - "loss": 0.24519780278205872, - "mean_token_accuracy": 0.9098281860351562, - "num_tokens": 30146073.0, - "step": 3372 - }, - { - "epoch": 2.5630699088145894, - "grad_norm": 2.1111650466918945, - "learning_rate": 2.854621542201064e-07, - "loss": 0.34583622217178345, - "mean_token_accuracy": 0.8917075395584106, - "num_tokens": 30153104.0, - "step": 3373 - }, - { - "epoch": 2.5638297872340425, - "grad_norm": 1.5275969505310059, - "learning_rate": 2.844910519219632e-07, - "loss": 0.33743610978126526, - "mean_token_accuracy": 0.8789186477661133, - "num_tokens": 30166414.0, - "step": 3374 - }, - { - "epoch": 2.5645896656534957, - "grad_norm": 3.6885430812835693, - "learning_rate": 2.835215045532841e-07, - "loss": 0.3318662643432617, - "mean_token_accuracy": 0.880516767501831, - "num_tokens": 30170397.0, - "step": 3375 - }, - { - "epoch": 2.5653495440729484, - "grad_norm": 3.58422589302063, - "learning_rate": 2.8255351279453446e-07, - "loss": 0.24304428696632385, - "mean_token_accuracy": 0.911949634552002, - "num_tokens": 30173809.0, - "step": 3376 - }, - { - "epoch": 2.566109422492401, - "grad_norm": 2.180278778076172, - "learning_rate": 2.815870773250873e-07, - "loss": 0.2282833755016327, - "mean_token_accuracy": 0.9192917346954346, - "num_tokens": 30179431.0, - "step": 3377 - }, - { - "epoch": 2.566869300911854, - "grad_norm": 1.925766110420227, - "learning_rate": 2.8062219882322636e-07, - "loss": 0.38162487745285034, - "mean_token_accuracy": 0.8635650873184204, - "num_tokens": 30194252.0, - "step": 3378 - }, - { - "epoch": 2.567629179331307, - "grad_norm": 1.9528982639312744, - "learning_rate": 2.796588779661388e-07, - "loss": 0.3215118646621704, - "mean_token_accuracy": 0.8850376605987549, - "num_tokens": 30202341.0, - "step": 3379 - }, - { - "epoch": 2.56838905775076, - "grad_norm": 1.9466958045959473, - "learning_rate": 2.786971154299209e-07, - "loss": 0.3743375539779663, - "mean_token_accuracy": 0.8669804930686951, - "num_tokens": 30210657.0, - "step": 3380 - }, - { - "epoch": 2.5691489361702127, - "grad_norm": 1.0222121477127075, - "learning_rate": 2.777369118895745e-07, - "loss": 0.28801876306533813, - "mean_token_accuracy": 0.8983622789382935, - "num_tokens": 30232182.0, - "step": 3381 - }, - { - "epoch": 2.569908814589666, - "grad_norm": 1.5706082582473755, - "learning_rate": 2.767782680190073e-07, - "loss": 0.37556713819503784, - "mean_token_accuracy": 0.8659577369689941, - "num_tokens": 30244819.0, - "step": 3382 - }, - { - "epoch": 2.5706686930091185, - "grad_norm": 2.5092997550964355, - "learning_rate": 2.7582118449103273e-07, - "loss": 0.4440537691116333, - "mean_token_accuracy": 0.8627067804336548, - "num_tokens": 30251856.0, - "step": 3383 - }, - { - "epoch": 2.571428571428571, - "grad_norm": 2.2710351943969727, - "learning_rate": 2.748656619773687e-07, - "loss": 0.12478743493556976, - "mean_token_accuracy": 0.9581196904182434, - "num_tokens": 30255765.0, - "step": 3384 - }, - { - "epoch": 2.5721884498480243, - "grad_norm": 1.5596920251846313, - "learning_rate": 2.739117011486378e-07, - "loss": 0.23946957290172577, - "mean_token_accuracy": 0.9149091243743896, - "num_tokens": 30265134.0, - "step": 3385 - }, - { - "epoch": 2.5729483282674774, - "grad_norm": 2.5665597915649414, - "learning_rate": 2.729593026743668e-07, - "loss": 0.22638919949531555, - "mean_token_accuracy": 0.9160120487213135, - "num_tokens": 30269971.0, - "step": 3386 - }, - { - "epoch": 2.57370820668693, - "grad_norm": 2.1374216079711914, - "learning_rate": 2.7200846722298503e-07, - "loss": 0.3681026101112366, - "mean_token_accuracy": 0.8709797263145447, - "num_tokens": 30277792.0, - "step": 3387 - }, - { - "epoch": 2.574468085106383, - "grad_norm": 1.5955793857574463, - "learning_rate": 2.710591954618247e-07, - "loss": 0.3560969829559326, - "mean_token_accuracy": 0.8950826525688171, - "num_tokens": 30289038.0, - "step": 3388 - }, - { - "epoch": 2.575227963525836, - "grad_norm": 1.561316967010498, - "learning_rate": 2.701114880571232e-07, - "loss": 0.29359546303749084, - "mean_token_accuracy": 0.9007925987243652, - "num_tokens": 30298228.0, - "step": 3389 - }, - { - "epoch": 2.5759878419452886, - "grad_norm": 1.7596205472946167, - "learning_rate": 2.6916534567401675e-07, - "loss": 0.29790499806404114, - "mean_token_accuracy": 0.8907828330993652, - "num_tokens": 30306908.0, - "step": 3390 - }, - { - "epoch": 2.5767477203647418, - "grad_norm": 2.1243667602539062, - "learning_rate": 2.6822076897654453e-07, - "loss": 0.26356661319732666, - "mean_token_accuracy": 0.9012589454650879, - "num_tokens": 30312971.0, - "step": 3391 - }, - { - "epoch": 2.5775075987841944, - "grad_norm": 2.35373592376709, - "learning_rate": 2.6727775862764703e-07, - "loss": 0.3303247094154358, - "mean_token_accuracy": 0.876170814037323, - "num_tokens": 30319303.0, - "step": 3392 - }, - { - "epoch": 2.5782674772036476, - "grad_norm": 2.5983684062957764, - "learning_rate": 2.663363152891654e-07, - "loss": 0.3094015121459961, - "mean_token_accuracy": 0.9034996628761292, - "num_tokens": 30324454.0, - "step": 3393 - }, - { - "epoch": 2.5790273556231003, - "grad_norm": 2.264035940170288, - "learning_rate": 2.653964396218406e-07, - "loss": 0.42449623346328735, - "mean_token_accuracy": 0.8461374044418335, - "num_tokens": 30331213.0, - "step": 3394 - }, - { - "epoch": 2.579787234042553, - "grad_norm": 1.591833233833313, - "learning_rate": 2.64458132285314e-07, - "loss": 0.3518860340118408, - "mean_token_accuracy": 0.8751099705696106, - "num_tokens": 30341328.0, - "step": 3395 - }, - { - "epoch": 2.580547112462006, - "grad_norm": 2.4209396839141846, - "learning_rate": 2.635213939381248e-07, - "loss": 0.3116898238658905, - "mean_token_accuracy": 0.9000394344329834, - "num_tokens": 30346970.0, - "step": 3396 - }, - { - "epoch": 2.581306990881459, - "grad_norm": 2.373574733734131, - "learning_rate": 2.625862252377129e-07, - "loss": 0.2558296322822571, - "mean_token_accuracy": 0.9050976037979126, - "num_tokens": 30352535.0, - "step": 3397 - }, - { - "epoch": 2.582066869300912, - "grad_norm": 2.3691492080688477, - "learning_rate": 2.61652626840416e-07, - "loss": 0.34974297881126404, - "mean_token_accuracy": 0.880367636680603, - "num_tokens": 30359008.0, - "step": 3398 - }, - { - "epoch": 2.5828267477203646, - "grad_norm": 2.6194329261779785, - "learning_rate": 2.6072059940146775e-07, - "loss": 0.302560031414032, - "mean_token_accuracy": 0.9090637564659119, - "num_tokens": 30364545.0, - "step": 3399 - }, - { - "epoch": 2.5835866261398177, - "grad_norm": 1.8017159700393677, - "learning_rate": 2.597901435750025e-07, - "loss": 0.2855827212333679, - "mean_token_accuracy": 0.8930953741073608, - "num_tokens": 30372593.0, - "step": 3400 - }, - { - "epoch": 2.5843465045592704, - "grad_norm": 1.736401915550232, - "learning_rate": 2.5886126001405e-07, - "loss": 0.38662317395210266, - "mean_token_accuracy": 0.876146674156189, - "num_tokens": 30382097.0, - "step": 3401 - }, - { - "epoch": 2.5851063829787235, - "grad_norm": 1.174890398979187, - "learning_rate": 2.579339493705355e-07, - "loss": 0.351195752620697, - "mean_token_accuracy": 0.8636453747749329, - "num_tokens": 30399208.0, - "step": 3402 - }, - { - "epoch": 2.585866261398176, - "grad_norm": 1.9311470985412598, - "learning_rate": 2.5700821229528164e-07, - "loss": 0.3222745656967163, - "mean_token_accuracy": 0.9071283936500549, - "num_tokens": 30406648.0, - "step": 3403 - }, - { - "epoch": 2.5866261398176293, - "grad_norm": 1.7329829931259155, - "learning_rate": 2.5608404943800627e-07, - "loss": 0.25072571635246277, - "mean_token_accuracy": 0.9056229591369629, - "num_tokens": 30414836.0, - "step": 3404 - }, - { - "epoch": 2.587386018237082, - "grad_norm": 2.8609302043914795, - "learning_rate": 2.5516146144732273e-07, - "loss": 0.23907656967639923, - "mean_token_accuracy": 0.9093331694602966, - "num_tokens": 30419027.0, - "step": 3405 - }, - { - "epoch": 2.5881458966565347, - "grad_norm": 2.0544052124023438, - "learning_rate": 2.5424044897073895e-07, - "loss": 0.28297221660614014, - "mean_token_accuracy": 0.8845421075820923, - "num_tokens": 30426720.0, - "step": 3406 - }, - { - "epoch": 2.588905775075988, - "grad_norm": 2.0454416275024414, - "learning_rate": 2.533210126546565e-07, - "loss": 0.40411946177482605, - "mean_token_accuracy": 0.8890959024429321, - "num_tokens": 30434413.0, - "step": 3407 - }, - { - "epoch": 2.589665653495441, - "grad_norm": 2.5405404567718506, - "learning_rate": 2.52403153144371e-07, - "loss": 0.2860855460166931, - "mean_token_accuracy": 0.9015365242958069, - "num_tokens": 30439934.0, - "step": 3408 - }, - { - "epoch": 2.5904255319148937, - "grad_norm": 1.7092106342315674, - "learning_rate": 2.514868710840723e-07, - "loss": 0.36490949988365173, - "mean_token_accuracy": 0.8814249634742737, - "num_tokens": 30450150.0, - "step": 3409 - }, - { - "epoch": 2.5911854103343464, - "grad_norm": 2.2119903564453125, - "learning_rate": 2.505721671168426e-07, - "loss": 0.3019217848777771, - "mean_token_accuracy": 0.915499210357666, - "num_tokens": 30456068.0, - "step": 3410 - }, - { - "epoch": 2.5919452887537995, - "grad_norm": 2.1960413455963135, - "learning_rate": 2.496590418846545e-07, - "loss": 0.21500837802886963, - "mean_token_accuracy": 0.9256033897399902, - "num_tokens": 30461738.0, - "step": 3411 - }, - { - "epoch": 2.592705167173252, - "grad_norm": 2.9918036460876465, - "learning_rate": 2.48747496028377e-07, - "loss": 0.3921341001987457, - "mean_token_accuracy": 0.8748230934143066, - "num_tokens": 30466747.0, - "step": 3412 - }, - { - "epoch": 2.5934650455927053, - "grad_norm": 3.091017723083496, - "learning_rate": 2.478375301877664e-07, - "loss": 0.19394469261169434, - "mean_token_accuracy": 0.9396419525146484, - "num_tokens": 30470176.0, - "step": 3413 - }, - { - "epoch": 2.594224924012158, - "grad_norm": 1.3302737474441528, - "learning_rate": 2.4692914500147185e-07, - "loss": 0.43362653255462646, - "mean_token_accuracy": 0.844821572303772, - "num_tokens": 30486501.0, - "step": 3414 - }, - { - "epoch": 2.594984802431611, - "grad_norm": 1.7620038986206055, - "learning_rate": 2.460223411070337e-07, - "loss": 0.2638559937477112, - "mean_token_accuracy": 0.8977950215339661, - "num_tokens": 30495182.0, - "step": 3415 - }, - { - "epoch": 2.595744680851064, - "grad_norm": 2.4946224689483643, - "learning_rate": 2.451171191408813e-07, - "loss": 0.1953703612089157, - "mean_token_accuracy": 0.9253969192504883, - "num_tokens": 30499923.0, - "step": 3416 - }, - { - "epoch": 2.5965045592705165, - "grad_norm": 1.3302149772644043, - "learning_rate": 2.4421347973833443e-07, - "loss": 0.32972219586372375, - "mean_token_accuracy": 0.9001818299293518, - "num_tokens": 30515685.0, - "step": 3417 - }, - { - "epoch": 2.5972644376899696, - "grad_norm": 1.4542583227157593, - "learning_rate": 2.4331142353360206e-07, - "loss": 0.2967185378074646, - "mean_token_accuracy": 0.9060331583023071, - "num_tokens": 30527102.0, - "step": 3418 - }, - { - "epoch": 2.5980243161094227, - "grad_norm": 2.951036214828491, - "learning_rate": 2.424109511597822e-07, - "loss": 0.2523000240325928, - "mean_token_accuracy": 0.9064282774925232, - "num_tokens": 30531309.0, - "step": 3419 - }, - { - "epoch": 2.5987841945288754, - "grad_norm": 2.1855340003967285, - "learning_rate": 2.4151206324886047e-07, - "loss": 0.35201045870780945, - "mean_token_accuracy": 0.8787988424301147, - "num_tokens": 30538014.0, - "step": 3420 - }, - { - "epoch": 2.599544072948328, - "grad_norm": 2.579791307449341, - "learning_rate": 2.406147604317119e-07, - "loss": 0.24575991928577423, - "mean_token_accuracy": 0.9309802055358887, - "num_tokens": 30542437.0, - "step": 3421 - }, - { - "epoch": 2.6003039513677813, - "grad_norm": 2.758512496948242, - "learning_rate": 2.397190433380964e-07, - "loss": 0.3121788501739502, - "mean_token_accuracy": 0.8949024677276611, - "num_tokens": 30547171.0, - "step": 3422 - }, - { - "epoch": 2.601063829787234, - "grad_norm": 1.7849500179290771, - "learning_rate": 2.388249125966646e-07, - "loss": 0.3810131251811981, - "mean_token_accuracy": 0.8799927830696106, - "num_tokens": 30556368.0, - "step": 3423 - }, - { - "epoch": 2.601823708206687, - "grad_norm": 2.701768636703491, - "learning_rate": 2.3793236883495164e-07, - "loss": 0.2190743237733841, - "mean_token_accuracy": 0.9288224577903748, - "num_tokens": 30561367.0, - "step": 3424 - }, - { - "epoch": 2.6025835866261398, - "grad_norm": 2.0361149311065674, - "learning_rate": 2.3704141267937797e-07, - "loss": 0.37623006105422974, - "mean_token_accuracy": 0.8677272796630859, - "num_tokens": 30569589.0, - "step": 3425 - }, - { - "epoch": 2.603343465045593, - "grad_norm": 1.094288945198059, - "learning_rate": 2.3615204475525096e-07, - "loss": 0.3885940909385681, - "mean_token_accuracy": 0.8518509864807129, - "num_tokens": 30592538.0, - "step": 3426 - }, - { - "epoch": 2.6041033434650456, - "grad_norm": 3.1634905338287354, - "learning_rate": 2.3526426568676485e-07, - "loss": 0.14411768317222595, - "mean_token_accuracy": 0.9483509063720703, - "num_tokens": 30595768.0, - "step": 3427 - }, - { - "epoch": 2.6048632218844983, - "grad_norm": 1.642171859741211, - "learning_rate": 2.3437807609699575e-07, - "loss": 0.28384336829185486, - "mean_token_accuracy": 0.8940542936325073, - "num_tokens": 30605361.0, - "step": 3428 - }, - { - "epoch": 2.6056231003039514, - "grad_norm": 1.567029356956482, - "learning_rate": 2.3349347660790582e-07, - "loss": 0.373100221157074, - "mean_token_accuracy": 0.8695693016052246, - "num_tokens": 30616182.0, - "step": 3429 - }, - { - "epoch": 2.6063829787234045, - "grad_norm": 1.5392675399780273, - "learning_rate": 2.3261046784034154e-07, - "loss": 0.4163264036178589, - "mean_token_accuracy": 0.8596208095550537, - "num_tokens": 30628601.0, - "step": 3430 - }, - { - "epoch": 2.607142857142857, - "grad_norm": 1.5044162273406982, - "learning_rate": 2.3172905041403181e-07, - "loss": 0.3813124895095825, - "mean_token_accuracy": 0.8577728271484375, - "num_tokens": 30641924.0, - "step": 3431 - }, - { - "epoch": 2.60790273556231, - "grad_norm": 1.4375652074813843, - "learning_rate": 2.3084922494758965e-07, - "loss": 0.33166638016700745, - "mean_token_accuracy": 0.8825733661651611, - "num_tokens": 30653849.0, - "step": 3432 - }, - { - "epoch": 2.608662613981763, - "grad_norm": 2.5562593936920166, - "learning_rate": 2.299709920585108e-07, - "loss": 0.3969959616661072, - "mean_token_accuracy": 0.8612505197525024, - "num_tokens": 30659471.0, - "step": 3433 - }, - { - "epoch": 2.6094224924012157, - "grad_norm": 3.2285826206207275, - "learning_rate": 2.2909435236317224e-07, - "loss": 0.24361640214920044, - "mean_token_accuracy": 0.9103600978851318, - "num_tokens": 30664129.0, - "step": 3434 - }, - { - "epoch": 2.610182370820669, - "grad_norm": 2.702500343322754, - "learning_rate": 2.2821930647683427e-07, - "loss": 0.28006303310394287, - "mean_token_accuracy": 0.9067277908325195, - "num_tokens": 30668453.0, - "step": 3435 - }, - { - "epoch": 2.6109422492401215, - "grad_norm": 1.6491931676864624, - "learning_rate": 2.2734585501363676e-07, - "loss": 0.38273465633392334, - "mean_token_accuracy": 0.8603695631027222, - "num_tokens": 30680159.0, - "step": 3436 - }, - { - "epoch": 2.6117021276595747, - "grad_norm": 2.628532648086548, - "learning_rate": 2.2647399858660156e-07, - "loss": 0.28879645466804504, - "mean_token_accuracy": 0.9014753103256226, - "num_tokens": 30685706.0, - "step": 3437 - }, - { - "epoch": 2.6124620060790273, - "grad_norm": 1.515868067741394, - "learning_rate": 2.2560373780763256e-07, - "loss": 0.3872387707233429, - "mean_token_accuracy": 0.8627544641494751, - "num_tokens": 30696942.0, - "step": 3438 - }, - { - "epoch": 2.61322188449848, - "grad_norm": 2.4761857986450195, - "learning_rate": 2.2473507328751086e-07, - "loss": 0.3222554624080658, - "mean_token_accuracy": 0.8839071989059448, - "num_tokens": 30703089.0, - "step": 3439 - }, - { - "epoch": 2.613981762917933, - "grad_norm": 1.5424152612686157, - "learning_rate": 2.238680056358991e-07, - "loss": 0.24553638696670532, - "mean_token_accuracy": 0.9186095595359802, - "num_tokens": 30712643.0, - "step": 3440 - }, - { - "epoch": 2.6147416413373863, - "grad_norm": 2.1723358631134033, - "learning_rate": 2.2300253546133883e-07, - "loss": 0.22538061439990997, - "mean_token_accuracy": 0.914456844329834, - "num_tokens": 30719326.0, - "step": 3441 - }, - { - "epoch": 2.615501519756839, - "grad_norm": 2.0607242584228516, - "learning_rate": 2.2213866337125022e-07, - "loss": 0.40517157316207886, - "mean_token_accuracy": 0.8558610677719116, - "num_tokens": 30726700.0, - "step": 3442 - }, - { - "epoch": 2.6162613981762917, - "grad_norm": 1.3590739965438843, - "learning_rate": 2.2127638997193196e-07, - "loss": 0.3030068874359131, - "mean_token_accuracy": 0.904723048210144, - "num_tokens": 30739090.0, - "step": 3443 - }, - { - "epoch": 2.617021276595745, - "grad_norm": 1.3497486114501953, - "learning_rate": 2.2041571586856104e-07, - "loss": 0.33204561471939087, - "mean_token_accuracy": 0.8720648288726807, - "num_tokens": 30755883.0, - "step": 3444 - }, - { - "epoch": 2.6177811550151975, - "grad_norm": 4.2434515953063965, - "learning_rate": 2.1955664166519036e-07, - "loss": 0.16747456789016724, - "mean_token_accuracy": 0.9386751651763916, - "num_tokens": 30758472.0, - "step": 3445 - }, - { - "epoch": 2.6185410334346506, - "grad_norm": 2.629639148712158, - "learning_rate": 2.1869916796475294e-07, - "loss": 0.3494086265563965, - "mean_token_accuracy": 0.8652780055999756, - "num_tokens": 30765014.0, - "step": 3446 - }, - { - "epoch": 2.6193009118541033, - "grad_norm": 1.56986403465271, - "learning_rate": 2.1784329536905653e-07, - "loss": 0.288389652967453, - "mean_token_accuracy": 0.8911552429199219, - "num_tokens": 30783893.0, - "step": 3447 - }, - { - "epoch": 2.6200607902735564, - "grad_norm": 2.137489080429077, - "learning_rate": 2.1698902447878478e-07, - "loss": 0.32554084062576294, - "mean_token_accuracy": 0.8827146291732788, - "num_tokens": 30791222.0, - "step": 3448 - }, - { - "epoch": 2.620820668693009, - "grad_norm": 1.7718229293823242, - "learning_rate": 2.1613635589349756e-07, - "loss": 0.433074414730072, - "mean_token_accuracy": 0.8626075983047485, - "num_tokens": 30800909.0, - "step": 3449 - }, - { - "epoch": 2.621580547112462, - "grad_norm": 1.8075933456420898, - "learning_rate": 2.1528529021163203e-07, - "loss": 0.3695775270462036, - "mean_token_accuracy": 0.8898511528968811, - "num_tokens": 30809680.0, - "step": 3450 - }, - { - "epoch": 2.622340425531915, - "grad_norm": 2.61863374710083, - "learning_rate": 2.1443582803049757e-07, - "loss": 0.3161890506744385, - "mean_token_accuracy": 0.9073872566223145, - "num_tokens": 30814582.0, - "step": 3451 - }, - { - "epoch": 2.6231003039513676, - "grad_norm": 1.9178471565246582, - "learning_rate": 2.1358796994628005e-07, - "loss": 0.26871830224990845, - "mean_token_accuracy": 0.9038676023483276, - "num_tokens": 30822408.0, - "step": 3452 - }, - { - "epoch": 2.6238601823708207, - "grad_norm": 1.4968323707580566, - "learning_rate": 2.1274171655403852e-07, - "loss": 0.30813854932785034, - "mean_token_accuracy": 0.8859157562255859, - "num_tokens": 30833627.0, - "step": 3453 - }, - { - "epoch": 2.6246200607902734, - "grad_norm": 2.584803581237793, - "learning_rate": 2.118970684477062e-07, - "loss": 0.3794214129447937, - "mean_token_accuracy": 0.8644092679023743, - "num_tokens": 30839801.0, - "step": 3454 - }, - { - "epoch": 2.6253799392097266, - "grad_norm": 1.4426536560058594, - "learning_rate": 2.1105402622008996e-07, - "loss": 0.3904871344566345, - "mean_token_accuracy": 0.8601649403572083, - "num_tokens": 30854293.0, - "step": 3455 - }, - { - "epoch": 2.6261398176291793, - "grad_norm": 2.42291522026062, - "learning_rate": 2.1021259046286907e-07, - "loss": 0.2830442786216736, - "mean_token_accuracy": 0.918805718421936, - "num_tokens": 30859112.0, - "step": 3456 - }, - { - "epoch": 2.6268996960486324, - "grad_norm": 1.4296268224716187, - "learning_rate": 2.0937276176659553e-07, - "loss": 0.3172294497489929, - "mean_token_accuracy": 0.8801482915878296, - "num_tokens": 30871653.0, - "step": 3457 - }, - { - "epoch": 2.627659574468085, - "grad_norm": 2.1253740787506104, - "learning_rate": 2.0853454072069402e-07, - "loss": 0.39093419909477234, - "mean_token_accuracy": 0.9099202156066895, - "num_tokens": 30879156.0, - "step": 3458 - }, - { - "epoch": 2.628419452887538, - "grad_norm": 2.829529047012329, - "learning_rate": 2.0769792791345945e-07, - "loss": 0.35299739241600037, - "mean_token_accuracy": 0.8719742894172668, - "num_tokens": 30883839.0, - "step": 3459 - }, - { - "epoch": 2.629179331306991, - "grad_norm": 1.4410310983657837, - "learning_rate": 2.068629239320588e-07, - "loss": 0.3550579845905304, - "mean_token_accuracy": 0.8580837249755859, - "num_tokens": 30897204.0, - "step": 3460 - }, - { - "epoch": 2.6299392097264436, - "grad_norm": 1.8980296850204468, - "learning_rate": 2.0602952936253112e-07, - "loss": 0.33656907081604004, - "mean_token_accuracy": 0.9022824764251709, - "num_tokens": 30905304.0, - "step": 3461 - }, - { - "epoch": 2.6306990881458967, - "grad_norm": 2.6224915981292725, - "learning_rate": 2.0519774478978404e-07, - "loss": 0.383074015378952, - "mean_token_accuracy": 0.8744953870773315, - "num_tokens": 30911118.0, - "step": 3462 - }, - { - "epoch": 2.6314589665653494, - "grad_norm": 1.8675706386566162, - "learning_rate": 2.043675707975959e-07, - "loss": 0.36784154176712036, - "mean_token_accuracy": 0.8660717606544495, - "num_tokens": 30919786.0, - "step": 3463 - }, - { - "epoch": 2.6322188449848025, - "grad_norm": 1.7601722478866577, - "learning_rate": 2.0353900796861503e-07, - "loss": 0.4188779294490814, - "mean_token_accuracy": 0.8655462861061096, - "num_tokens": 30930882.0, - "step": 3464 - }, - { - "epoch": 2.632978723404255, - "grad_norm": 1.760291337966919, - "learning_rate": 2.027120568843588e-07, - "loss": 0.31421059370040894, - "mean_token_accuracy": 0.8876073360443115, - "num_tokens": 30940524.0, - "step": 3465 - }, - { - "epoch": 2.6337386018237083, - "grad_norm": 2.0120749473571777, - "learning_rate": 2.0188671812521293e-07, - "loss": 0.4053173065185547, - "mean_token_accuracy": 0.855548620223999, - "num_tokens": 30949577.0, - "step": 3466 - }, - { - "epoch": 2.634498480243161, - "grad_norm": 1.1353741884231567, - "learning_rate": 2.0106299227043298e-07, - "loss": 0.24654456973075867, - "mean_token_accuracy": 0.905929684638977, - "num_tokens": 30965797.0, - "step": 3467 - }, - { - "epoch": 2.6352583586626137, - "grad_norm": 2.011974811553955, - "learning_rate": 2.002408798981395e-07, - "loss": 0.37494587898254395, - "mean_token_accuracy": 0.8785897493362427, - "num_tokens": 30974271.0, - "step": 3468 - }, - { - "epoch": 2.636018237082067, - "grad_norm": 1.3929005861282349, - "learning_rate": 1.9942038158532407e-07, - "loss": 0.43479201197624207, - "mean_token_accuracy": 0.8481380939483643, - "num_tokens": 30992451.0, - "step": 3469 - }, - { - "epoch": 2.63677811550152, - "grad_norm": 2.2714993953704834, - "learning_rate": 1.9860149790784432e-07, - "loss": 0.36299505829811096, - "mean_token_accuracy": 0.8728935718536377, - "num_tokens": 30999180.0, - "step": 3470 - }, - { - "epoch": 2.6375379939209727, - "grad_norm": 1.722923755645752, - "learning_rate": 1.977842294404228e-07, - "loss": 0.2461910843849182, - "mean_token_accuracy": 0.9146148562431335, - "num_tokens": 31008617.0, - "step": 3471 - }, - { - "epoch": 2.6382978723404253, - "grad_norm": 1.4508280754089355, - "learning_rate": 1.9696857675665122e-07, - "loss": 0.3511884808540344, - "mean_token_accuracy": 0.869759202003479, - "num_tokens": 31022158.0, - "step": 3472 - }, - { - "epoch": 2.6390577507598785, - "grad_norm": 2.5803074836730957, - "learning_rate": 1.9615454042898635e-07, - "loss": 0.27785009145736694, - "mean_token_accuracy": 0.9075050354003906, - "num_tokens": 31027176.0, - "step": 3473 - }, - { - "epoch": 2.639817629179331, - "grad_norm": 3.2428712844848633, - "learning_rate": 1.95342121028749e-07, - "loss": 0.30596673488616943, - "mean_token_accuracy": 0.8934510946273804, - "num_tokens": 31031140.0, - "step": 3474 - }, - { - "epoch": 2.6405775075987843, - "grad_norm": 1.5055527687072754, - "learning_rate": 1.9453131912612694e-07, - "loss": 0.3586134612560272, - "mean_token_accuracy": 0.87983238697052, - "num_tokens": 31041878.0, - "step": 3475 - }, - { - "epoch": 2.641337386018237, - "grad_norm": 2.8457231521606445, - "learning_rate": 1.9372213529017192e-07, - "loss": 0.314262330532074, - "mean_token_accuracy": 0.8857930302619934, - "num_tokens": 31046670.0, - "step": 3476 - }, - { - "epoch": 2.64209726443769, - "grad_norm": 2.661770820617676, - "learning_rate": 1.9291457008880077e-07, - "loss": 0.3096502125263214, - "mean_token_accuracy": 0.9015626907348633, - "num_tokens": 31052419.0, - "step": 3477 - }, - { - "epoch": 2.642857142857143, - "grad_norm": 1.9692156314849854, - "learning_rate": 1.9210862408879373e-07, - "loss": 0.33081287145614624, - "mean_token_accuracy": 0.8793413639068604, - "num_tokens": 31060462.0, - "step": 3478 - }, - { - "epoch": 2.6436170212765955, - "grad_norm": 2.454256772994995, - "learning_rate": 1.9130429785579441e-07, - "loss": 0.486195832490921, - "mean_token_accuracy": 0.8472193479537964, - "num_tokens": 31066537.0, - "step": 3479 - }, - { - "epoch": 2.6443768996960486, - "grad_norm": 3.121835470199585, - "learning_rate": 1.9050159195431017e-07, - "loss": 0.28688520193099976, - "mean_token_accuracy": 0.8916707038879395, - "num_tokens": 31071061.0, - "step": 3480 - }, - { - "epoch": 2.6451367781155017, - "grad_norm": 2.0197176933288574, - "learning_rate": 1.8970050694771064e-07, - "loss": 0.2587219774723053, - "mean_token_accuracy": 0.9204096794128418, - "num_tokens": 31077438.0, - "step": 3481 - }, - { - "epoch": 2.6458966565349544, - "grad_norm": 2.305452585220337, - "learning_rate": 1.8890104339822913e-07, - "loss": 0.3234187960624695, - "mean_token_accuracy": 0.8695623874664307, - "num_tokens": 31084445.0, - "step": 3482 - }, - { - "epoch": 2.646656534954407, - "grad_norm": 2.671178102493286, - "learning_rate": 1.881032018669579e-07, - "loss": 0.31658151745796204, - "mean_token_accuracy": 0.9211946725845337, - "num_tokens": 31090229.0, - "step": 3483 - }, - { - "epoch": 2.6474164133738602, - "grad_norm": 1.9448342323303223, - "learning_rate": 1.8730698291385518e-07, - "loss": 0.4380547106266022, - "mean_token_accuracy": 0.8881628513336182, - "num_tokens": 31098328.0, - "step": 3484 - }, - { - "epoch": 2.648176291793313, - "grad_norm": 2.00927734375, - "learning_rate": 1.8651238709773646e-07, - "loss": 0.30627715587615967, - "mean_token_accuracy": 0.9037996530532837, - "num_tokens": 31106114.0, - "step": 3485 - }, - { - "epoch": 2.648936170212766, - "grad_norm": 1.800561547279358, - "learning_rate": 1.8571941497627976e-07, - "loss": 0.3352568745613098, - "mean_token_accuracy": 0.8773363828659058, - "num_tokens": 31114962.0, - "step": 3486 - }, - { - "epoch": 2.6496960486322187, - "grad_norm": 1.2112451791763306, - "learning_rate": 1.8492806710602495e-07, - "loss": 0.30349305272102356, - "mean_token_accuracy": 0.8948603272438049, - "num_tokens": 31131202.0, - "step": 3487 - }, - { - "epoch": 2.650455927051672, - "grad_norm": 1.241676926612854, - "learning_rate": 1.8413834404236857e-07, - "loss": 0.33237409591674805, - "mean_token_accuracy": 0.8674747943878174, - "num_tokens": 31146087.0, - "step": 3488 - }, - { - "epoch": 2.6512158054711246, - "grad_norm": 1.7932970523834229, - "learning_rate": 1.8335024633956977e-07, - "loss": 0.2946045696735382, - "mean_token_accuracy": 0.9197652339935303, - "num_tokens": 31153539.0, - "step": 3489 - }, - { - "epoch": 2.6519756838905773, - "grad_norm": 1.4799917936325073, - "learning_rate": 1.8256377455074526e-07, - "loss": 0.41131776571273804, - "mean_token_accuracy": 0.859546422958374, - "num_tokens": 31165330.0, - "step": 3490 - }, - { - "epoch": 2.6527355623100304, - "grad_norm": 1.196844458580017, - "learning_rate": 1.8177892922787154e-07, - "loss": 0.3251150846481323, - "mean_token_accuracy": 0.8738864660263062, - "num_tokens": 31182815.0, - "step": 3491 - }, - { - "epoch": 2.6534954407294835, - "grad_norm": 1.954189419746399, - "learning_rate": 1.809957109217833e-07, - "loss": 0.31352269649505615, - "mean_token_accuracy": 0.8898859024047852, - "num_tokens": 31190907.0, - "step": 3492 - }, - { - "epoch": 2.654255319148936, - "grad_norm": 2.5248095989227295, - "learning_rate": 1.802141201821736e-07, - "loss": 0.29824098944664, - "mean_token_accuracy": 0.9073196053504944, - "num_tokens": 31196077.0, - "step": 3493 - }, - { - "epoch": 2.655015197568389, - "grad_norm": 2.163174629211426, - "learning_rate": 1.7943415755759168e-07, - "loss": 0.3291153311729431, - "mean_token_accuracy": 0.8850691318511963, - "num_tokens": 31202843.0, - "step": 3494 - }, - { - "epoch": 2.655775075987842, - "grad_norm": 1.1075550317764282, - "learning_rate": 1.7865582359544664e-07, - "loss": 0.3335857093334198, - "mean_token_accuracy": 0.877744197845459, - "num_tokens": 31224407.0, - "step": 3495 - }, - { - "epoch": 2.6565349544072947, - "grad_norm": 3.600712299346924, - "learning_rate": 1.7787911884200314e-07, - "loss": 0.24402567744255066, - "mean_token_accuracy": 0.9030617475509644, - "num_tokens": 31228150.0, - "step": 3496 - }, - { - "epoch": 2.657294832826748, - "grad_norm": 2.5282156467437744, - "learning_rate": 1.7710404384238156e-07, - "loss": 0.3065975606441498, - "mean_token_accuracy": 0.8894387483596802, - "num_tokens": 31233676.0, - "step": 3497 - }, - { - "epoch": 2.6580547112462005, - "grad_norm": 5.057322025299072, - "learning_rate": 1.7633059914055976e-07, - "loss": 0.3121221661567688, - "mean_token_accuracy": 0.8697853088378906, - "num_tokens": 31241436.0, - "step": 3498 - }, - { - "epoch": 2.6588145896656536, - "grad_norm": 2.3506245613098145, - "learning_rate": 1.7555878527937164e-07, - "loss": 0.3100275993347168, - "mean_token_accuracy": 0.8860085010528564, - "num_tokens": 31249589.0, - "step": 3499 - }, - { - "epoch": 2.6595744680851063, - "grad_norm": 1.352675199508667, - "learning_rate": 1.7478860280050525e-07, - "loss": 0.3743774890899658, - "mean_token_accuracy": 0.8581909537315369, - "num_tokens": 31264177.0, - "step": 3500 - }, - { - "epoch": 2.660334346504559, - "grad_norm": 1.4283853769302368, - "learning_rate": 1.740200522445043e-07, - "loss": 0.3012605905532837, - "mean_token_accuracy": 0.8875954151153564, - "num_tokens": 31278104.0, - "step": 3501 - }, - { - "epoch": 2.661094224924012, - "grad_norm": 1.2291043996810913, - "learning_rate": 1.7325313415076705e-07, - "loss": 0.28256118297576904, - "mean_token_accuracy": 0.8932200074195862, - "num_tokens": 31295863.0, - "step": 3502 - }, - { - "epoch": 2.6618541033434653, - "grad_norm": 1.4281202554702759, - "learning_rate": 1.7248784905754656e-07, - "loss": 0.17757278680801392, - "mean_token_accuracy": 0.9204857349395752, - "num_tokens": 31304203.0, - "step": 3503 - }, - { - "epoch": 2.662613981762918, - "grad_norm": 1.369604229927063, - "learning_rate": 1.717241975019493e-07, - "loss": 0.35701876878738403, - "mean_token_accuracy": 0.8924071192741394, - "num_tokens": 31317585.0, - "step": 3504 - }, - { - "epoch": 2.6633738601823707, - "grad_norm": 1.8434638977050781, - "learning_rate": 1.7096218001993514e-07, - "loss": 0.2783927619457245, - "mean_token_accuracy": 0.9073910713195801, - "num_tokens": 31325380.0, - "step": 3505 - }, - { - "epoch": 2.664133738601824, - "grad_norm": 1.946325421333313, - "learning_rate": 1.702017971463174e-07, - "loss": 0.2873200476169586, - "mean_token_accuracy": 0.8956313133239746, - "num_tokens": 31333366.0, - "step": 3506 - }, - { - "epoch": 2.6648936170212765, - "grad_norm": 2.468369960784912, - "learning_rate": 1.6944304941476224e-07, - "loss": 0.2589072287082672, - "mean_token_accuracy": 0.9237367510795593, - "num_tokens": 31337721.0, - "step": 3507 - }, - { - "epoch": 2.6656534954407296, - "grad_norm": 1.1283265352249146, - "learning_rate": 1.686859373577876e-07, - "loss": 0.3271624445915222, - "mean_token_accuracy": 0.8839015960693359, - "num_tokens": 31355493.0, - "step": 3508 - }, - { - "epoch": 2.6664133738601823, - "grad_norm": 1.9863340854644775, - "learning_rate": 1.679304615067634e-07, - "loss": 0.24140994250774384, - "mean_token_accuracy": 0.9161529541015625, - "num_tokens": 31362707.0, - "step": 3509 - }, - { - "epoch": 2.6671732522796354, - "grad_norm": 1.8522552251815796, - "learning_rate": 1.671766223919133e-07, - "loss": 0.3312528133392334, - "mean_token_accuracy": 0.8730556964874268, - "num_tokens": 31371077.0, - "step": 3510 - }, - { - "epoch": 2.667933130699088, - "grad_norm": 2.4215502738952637, - "learning_rate": 1.6642442054230935e-07, - "loss": 0.3685656189918518, - "mean_token_accuracy": 0.8850007653236389, - "num_tokens": 31378208.0, - "step": 3511 - }, - { - "epoch": 2.668693009118541, - "grad_norm": 2.1833741664886475, - "learning_rate": 1.6567385648587563e-07, - "loss": 0.34506508708000183, - "mean_token_accuracy": 0.8798409104347229, - "num_tokens": 31384364.0, - "step": 3512 - }, - { - "epoch": 2.669452887537994, - "grad_norm": 1.5749074220657349, - "learning_rate": 1.6492493074938777e-07, - "loss": 0.426993191242218, - "mean_token_accuracy": 0.8461192846298218, - "num_tokens": 31399653.0, - "step": 3513 - }, - { - "epoch": 2.670212765957447, - "grad_norm": 1.782159686088562, - "learning_rate": 1.6417764385846996e-07, - "loss": 0.43299031257629395, - "mean_token_accuracy": 0.8456183075904846, - "num_tokens": 31410255.0, - "step": 3514 - }, - { - "epoch": 2.6709726443768997, - "grad_norm": 1.3696199655532837, - "learning_rate": 1.6343199633759715e-07, - "loss": 0.24636408686637878, - "mean_token_accuracy": 0.8885586261749268, - "num_tokens": 31422388.0, - "step": 3515 - }, - { - "epoch": 2.6717325227963524, - "grad_norm": 1.9061282873153687, - "learning_rate": 1.6268798871009405e-07, - "loss": 0.4061458706855774, - "mean_token_accuracy": 0.8875166177749634, - "num_tokens": 31431610.0, - "step": 3516 - }, - { - "epoch": 2.6724924012158056, - "grad_norm": 1.906085729598999, - "learning_rate": 1.6194562149813241e-07, - "loss": 0.4171827435493469, - "mean_token_accuracy": 0.848915159702301, - "num_tokens": 31440612.0, - "step": 3517 - }, - { - "epoch": 2.6732522796352582, - "grad_norm": 1.7384947538375854, - "learning_rate": 1.6120489522273548e-07, - "loss": 0.38559412956237793, - "mean_token_accuracy": 0.860315203666687, - "num_tokens": 31451002.0, - "step": 3518 - }, - { - "epoch": 2.6740121580547114, - "grad_norm": 3.150087356567383, - "learning_rate": 1.6046581040377317e-07, - "loss": 0.17975735664367676, - "mean_token_accuracy": 0.9390251636505127, - "num_tokens": 31454609.0, - "step": 3519 - }, - { - "epoch": 2.674772036474164, - "grad_norm": 1.9782978296279907, - "learning_rate": 1.5972836755996286e-07, - "loss": 0.4016202688217163, - "mean_token_accuracy": 0.8536617755889893, - "num_tokens": 31463351.0, - "step": 3520 - }, - { - "epoch": 2.675531914893617, - "grad_norm": 1.459272861480713, - "learning_rate": 1.589925672088713e-07, - "loss": 0.32752668857574463, - "mean_token_accuracy": 0.8932114839553833, - "num_tokens": 31475029.0, - "step": 3521 - }, - { - "epoch": 2.67629179331307, - "grad_norm": 1.5019307136535645, - "learning_rate": 1.5825840986691155e-07, - "loss": 0.47891637682914734, - "mean_token_accuracy": 0.8196566700935364, - "num_tokens": 31489340.0, - "step": 3522 - }, - { - "epoch": 2.6770516717325226, - "grad_norm": 1.9832415580749512, - "learning_rate": 1.5752589604934255e-07, - "loss": 0.3787233829498291, - "mean_token_accuracy": 0.8592989444732666, - "num_tokens": 31498173.0, - "step": 3523 - }, - { - "epoch": 2.6778115501519757, - "grad_norm": 1.6112871170043945, - "learning_rate": 1.567950262702714e-07, - "loss": 0.394833505153656, - "mean_token_accuracy": 0.8762246370315552, - "num_tokens": 31509701.0, - "step": 3524 - }, - { - "epoch": 2.678571428571429, - "grad_norm": 2.542189598083496, - "learning_rate": 1.560658010426505e-07, - "loss": 0.344679057598114, - "mean_token_accuracy": 0.8738337159156799, - "num_tokens": 31516174.0, - "step": 3525 - }, - { - "epoch": 2.6793313069908815, - "grad_norm": 1.6784722805023193, - "learning_rate": 1.5533822087827805e-07, - "loss": 0.2981395423412323, - "mean_token_accuracy": 0.9238042831420898, - "num_tokens": 31526373.0, - "step": 3526 - }, - { - "epoch": 2.680091185410334, - "grad_norm": 2.1711673736572266, - "learning_rate": 1.54612286287798e-07, - "loss": 0.32182997465133667, - "mean_token_accuracy": 0.8804676532745361, - "num_tokens": 31532221.0, - "step": 3527 - }, - { - "epoch": 2.6808510638297873, - "grad_norm": 2.920492172241211, - "learning_rate": 1.5388799778069896e-07, - "loss": 0.42035239934921265, - "mean_token_accuracy": 0.8616809844970703, - "num_tokens": 31537349.0, - "step": 3528 - }, - { - "epoch": 2.68161094224924, - "grad_norm": 1.6369318962097168, - "learning_rate": 1.5316535586531483e-07, - "loss": 0.3083080053329468, - "mean_token_accuracy": 0.8857955932617188, - "num_tokens": 31548063.0, - "step": 3529 - }, - { - "epoch": 2.682370820668693, - "grad_norm": 1.745784044265747, - "learning_rate": 1.5244436104882327e-07, - "loss": 0.3295830190181732, - "mean_token_accuracy": 0.8790948390960693, - "num_tokens": 31557297.0, - "step": 3530 - }, - { - "epoch": 2.683130699088146, - "grad_norm": 2.933802843093872, - "learning_rate": 1.5172501383724668e-07, - "loss": 0.20540538430213928, - "mean_token_accuracy": 0.9353891611099243, - "num_tokens": 31561267.0, - "step": 3531 - }, - { - "epoch": 2.683890577507599, - "grad_norm": 1.1792415380477905, - "learning_rate": 1.5100731473544932e-07, - "loss": 0.2857414484024048, - "mean_token_accuracy": 0.8919717073440552, - "num_tokens": 31577364.0, - "step": 3532 - }, - { - "epoch": 2.6846504559270516, - "grad_norm": 1.5752356052398682, - "learning_rate": 1.5029126424714186e-07, - "loss": 0.42933136224746704, - "mean_token_accuracy": 0.8738011717796326, - "num_tokens": 31593255.0, - "step": 3533 - }, - { - "epoch": 2.6854103343465043, - "grad_norm": 1.4097353219985962, - "learning_rate": 1.495768628748745e-07, - "loss": 0.41403159499168396, - "mean_token_accuracy": 0.8538030385971069, - "num_tokens": 31606689.0, - "step": 3534 - }, - { - "epoch": 2.6861702127659575, - "grad_norm": 1.3788182735443115, - "learning_rate": 1.4886411112004258e-07, - "loss": 0.3825019299983978, - "mean_token_accuracy": 0.870381236076355, - "num_tokens": 31623528.0, - "step": 3535 - }, - { - "epoch": 2.6869300911854106, - "grad_norm": 2.3032004833221436, - "learning_rate": 1.481530094828823e-07, - "loss": 0.28886643052101135, - "mean_token_accuracy": 0.9053950905799866, - "num_tokens": 31629949.0, - "step": 3536 - }, - { - "epoch": 2.6876899696048633, - "grad_norm": 1.8950154781341553, - "learning_rate": 1.4744355846247254e-07, - "loss": 0.3261764645576477, - "mean_token_accuracy": 0.8882689476013184, - "num_tokens": 31639482.0, - "step": 3537 - }, - { - "epoch": 2.688449848024316, - "grad_norm": 2.8152518272399902, - "learning_rate": 1.4673575855673278e-07, - "loss": 0.19367718696594238, - "mean_token_accuracy": 0.948776364326477, - "num_tokens": 31643354.0, - "step": 3538 - }, - { - "epoch": 2.689209726443769, - "grad_norm": 2.1745874881744385, - "learning_rate": 1.460296102624248e-07, - "loss": 0.3250897526741028, - "mean_token_accuracy": 0.8834096193313599, - "num_tokens": 31651085.0, - "step": 3539 - }, - { - "epoch": 2.689969604863222, - "grad_norm": 2.5239014625549316, - "learning_rate": 1.4532511407515022e-07, - "loss": 0.3069056570529938, - "mean_token_accuracy": 0.8939725160598755, - "num_tokens": 31656790.0, - "step": 3540 - }, - { - "epoch": 2.690729483282675, - "grad_norm": 2.19575572013855, - "learning_rate": 1.4462227048935185e-07, - "loss": 0.38596993684768677, - "mean_token_accuracy": 0.8545209169387817, - "num_tokens": 31664577.0, - "step": 3541 - }, - { - "epoch": 2.6914893617021276, - "grad_norm": 2.4618618488311768, - "learning_rate": 1.439210799983126e-07, - "loss": 0.43490833044052124, - "mean_token_accuracy": 0.8452163338661194, - "num_tokens": 31670328.0, - "step": 3542 - }, - { - "epoch": 2.6922492401215807, - "grad_norm": 1.6371922492980957, - "learning_rate": 1.4322154309415387e-07, - "loss": 0.36862409114837646, - "mean_token_accuracy": 0.8575112819671631, - "num_tokens": 31680342.0, - "step": 3543 - }, - { - "epoch": 2.6930091185410334, - "grad_norm": 3.311603546142578, - "learning_rate": 1.425236602678387e-07, - "loss": 0.3098670542240143, - "mean_token_accuracy": 0.8895800113677979, - "num_tokens": 31686819.0, - "step": 3544 - }, - { - "epoch": 2.693768996960486, - "grad_norm": 2.246453285217285, - "learning_rate": 1.4182743200916839e-07, - "loss": 0.2145545780658722, - "mean_token_accuracy": 0.9456803798675537, - "num_tokens": 31692024.0, - "step": 3545 - }, - { - "epoch": 2.6945288753799392, - "grad_norm": 2.962627410888672, - "learning_rate": 1.4113285880678145e-07, - "loss": 0.22648683190345764, - "mean_token_accuracy": 0.9368027448654175, - "num_tokens": 31696292.0, - "step": 3546 - }, - { - "epoch": 2.6952887537993924, - "grad_norm": 2.3828611373901367, - "learning_rate": 1.4043994114815663e-07, - "loss": 0.28031831979751587, - "mean_token_accuracy": 0.8995643854141235, - "num_tokens": 31701896.0, - "step": 3547 - }, - { - "epoch": 2.696048632218845, - "grad_norm": 2.749218463897705, - "learning_rate": 1.3974867951961097e-07, - "loss": 0.31309080123901367, - "mean_token_accuracy": 0.8827601671218872, - "num_tokens": 31707434.0, - "step": 3548 - }, - { - "epoch": 2.6968085106382977, - "grad_norm": 1.5682415962219238, - "learning_rate": 1.3905907440629752e-07, - "loss": 0.2794681191444397, - "mean_token_accuracy": 0.9000695943832397, - "num_tokens": 31718923.0, - "step": 3549 - }, - { - "epoch": 2.697568389057751, - "grad_norm": 2.2193145751953125, - "learning_rate": 1.38371126292208e-07, - "loss": 0.31643980741500854, - "mean_token_accuracy": 0.8916857242584229, - "num_tokens": 31724566.0, - "step": 3550 - }, - { - "epoch": 2.6983282674772036, - "grad_norm": 2.14003324508667, - "learning_rate": 1.3768483566017093e-07, - "loss": 0.3225042521953583, - "mean_token_accuracy": 0.8810629844665527, - "num_tokens": 31731363.0, - "step": 3551 - }, - { - "epoch": 2.6990881458966567, - "grad_norm": 2.594632863998413, - "learning_rate": 1.3700020299185156e-07, - "loss": 0.28227928280830383, - "mean_token_accuracy": 0.8986451625823975, - "num_tokens": 31736574.0, - "step": 3552 - }, - { - "epoch": 2.6998480243161094, - "grad_norm": 1.8695379495620728, - "learning_rate": 1.3631722876775137e-07, - "loss": 0.46631208062171936, - "mean_token_accuracy": 0.8425353765487671, - "num_tokens": 31746568.0, - "step": 3553 - }, - { - "epoch": 2.7006079027355625, - "grad_norm": 2.1246798038482666, - "learning_rate": 1.3563591346720806e-07, - "loss": 0.3978712260723114, - "mean_token_accuracy": 0.85677170753479, - "num_tokens": 31755499.0, - "step": 3554 - }, - { - "epoch": 2.701367781155015, - "grad_norm": 1.9348199367523193, - "learning_rate": 1.3495625756839464e-07, - "loss": 0.4381856620311737, - "mean_token_accuracy": 0.8389089107513428, - "num_tokens": 31765267.0, - "step": 3555 - }, - { - "epoch": 2.702127659574468, - "grad_norm": 3.3802061080932617, - "learning_rate": 1.342782615483204e-07, - "loss": 0.2558897137641907, - "mean_token_accuracy": 0.9038383960723877, - "num_tokens": 31769169.0, - "step": 3556 - }, - { - "epoch": 2.702887537993921, - "grad_norm": 1.8666874170303345, - "learning_rate": 1.3360192588282832e-07, - "loss": 0.3420698642730713, - "mean_token_accuracy": 0.8731567859649658, - "num_tokens": 31778500.0, - "step": 3557 - }, - { - "epoch": 2.7036474164133737, - "grad_norm": 2.2502217292785645, - "learning_rate": 1.3292725104659676e-07, - "loss": 0.33352571725845337, - "mean_token_accuracy": 0.889266848564148, - "num_tokens": 31786245.0, - "step": 3558 - }, - { - "epoch": 2.704407294832827, - "grad_norm": 1.7217984199523926, - "learning_rate": 1.3225423751313942e-07, - "loss": 0.3671357035636902, - "mean_token_accuracy": 0.8806703686714172, - "num_tokens": 31796242.0, - "step": 3559 - }, - { - "epoch": 2.7051671732522795, - "grad_norm": 2.5113964080810547, - "learning_rate": 1.315828857548024e-07, - "loss": 0.24104978144168854, - "mean_token_accuracy": 0.9279846549034119, - "num_tokens": 31801005.0, - "step": 3560 - }, - { - "epoch": 2.7059270516717326, - "grad_norm": 2.0345516204833984, - "learning_rate": 1.309131962427662e-07, - "loss": 0.3277859687805176, - "mean_token_accuracy": 0.8744111061096191, - "num_tokens": 31810184.0, - "step": 3561 - }, - { - "epoch": 2.7066869300911853, - "grad_norm": 1.2103748321533203, - "learning_rate": 1.3024516944704495e-07, - "loss": 0.34378400444984436, - "mean_token_accuracy": 0.8734696507453918, - "num_tokens": 31830255.0, - "step": 3562 - }, - { - "epoch": 2.7074468085106385, - "grad_norm": 2.3213655948638916, - "learning_rate": 1.2957880583648525e-07, - "loss": 0.38547977805137634, - "mean_token_accuracy": 0.8699804544448853, - "num_tokens": 31836624.0, - "step": 3563 - }, - { - "epoch": 2.708206686930091, - "grad_norm": 1.3899281024932861, - "learning_rate": 1.2891410587876714e-07, - "loss": 0.38521939516067505, - "mean_token_accuracy": 0.8629069924354553, - "num_tokens": 31851201.0, - "step": 3564 - }, - { - "epoch": 2.7089665653495443, - "grad_norm": 1.9310930967330933, - "learning_rate": 1.2825107004040272e-07, - "loss": 0.26716265082359314, - "mean_token_accuracy": 0.9009085893630981, - "num_tokens": 31858683.0, - "step": 3565 - }, - { - "epoch": 2.709726443768997, - "grad_norm": 2.839961290359497, - "learning_rate": 1.2758969878673504e-07, - "loss": 0.3741273880004883, - "mean_token_accuracy": 0.8934653997421265, - "num_tokens": 31864354.0, - "step": 3566 - }, - { - "epoch": 2.7104863221884496, - "grad_norm": 1.374866247177124, - "learning_rate": 1.269299925819409e-07, - "loss": 0.43979907035827637, - "mean_token_accuracy": 0.8200695514678955, - "num_tokens": 31879875.0, - "step": 3567 - }, - { - "epoch": 2.711246200607903, - "grad_norm": 1.149755597114563, - "learning_rate": 1.262719518890279e-07, - "loss": 0.375344842672348, - "mean_token_accuracy": 0.8663579225540161, - "num_tokens": 31902014.0, - "step": 3568 - }, - { - "epoch": 2.7120060790273555, - "grad_norm": 1.5612202882766724, - "learning_rate": 1.2561557716983308e-07, - "loss": 0.3224652409553528, - "mean_token_accuracy": 0.8762812614440918, - "num_tokens": 31913496.0, - "step": 3569 - }, - { - "epoch": 2.7127659574468086, - "grad_norm": 2.291853666305542, - "learning_rate": 1.2496086888502595e-07, - "loss": 0.299552321434021, - "mean_token_accuracy": 0.8792698383331299, - "num_tokens": 31919505.0, - "step": 3570 - }, - { - "epoch": 2.7135258358662613, - "grad_norm": 2.799447536468506, - "learning_rate": 1.2430782749410676e-07, - "loss": 0.16546699404716492, - "mean_token_accuracy": 0.943824052810669, - "num_tokens": 31923154.0, - "step": 3571 - }, - { - "epoch": 2.7142857142857144, - "grad_norm": 1.4593926668167114, - "learning_rate": 1.2365645345540383e-07, - "loss": 0.35158461332321167, - "mean_token_accuracy": 0.8825424909591675, - "num_tokens": 31936316.0, - "step": 3572 - }, - { - "epoch": 2.715045592705167, - "grad_norm": 1.3870587348937988, - "learning_rate": 1.2300674722607735e-07, - "loss": 0.25250178575515747, - "mean_token_accuracy": 0.900173544883728, - "num_tokens": 31948979.0, - "step": 3573 - }, - { - "epoch": 2.71580547112462, - "grad_norm": 1.8494576215744019, - "learning_rate": 1.223587092621162e-07, - "loss": 0.36176151037216187, - "mean_token_accuracy": 0.8696292638778687, - "num_tokens": 31957512.0, - "step": 3574 - }, - { - "epoch": 2.716565349544073, - "grad_norm": 2.2320656776428223, - "learning_rate": 1.2171234001833788e-07, - "loss": 0.3317434787750244, - "mean_token_accuracy": 0.8897237777709961, - "num_tokens": 31964788.0, - "step": 3575 - }, - { - "epoch": 2.717325227963526, - "grad_norm": 2.424726963043213, - "learning_rate": 1.2106763994838954e-07, - "loss": 0.2880811095237732, - "mean_token_accuracy": 0.8983594179153442, - "num_tokens": 31970888.0, - "step": 3576 - }, - { - "epoch": 2.7180851063829787, - "grad_norm": 1.7122806310653687, - "learning_rate": 1.204246095047465e-07, - "loss": 0.4846091568470001, - "mean_token_accuracy": 0.8358923196792603, - "num_tokens": 31981891.0, - "step": 3577 - }, - { - "epoch": 2.7188449848024314, - "grad_norm": 2.3445510864257812, - "learning_rate": 1.1978324913871214e-07, - "loss": 0.28702512383461, - "mean_token_accuracy": 0.8942852020263672, - "num_tokens": 31987375.0, - "step": 3578 - }, - { - "epoch": 2.7196048632218845, - "grad_norm": 2.418414831161499, - "learning_rate": 1.1914355930041838e-07, - "loss": 0.27506208419799805, - "mean_token_accuracy": 0.9329943656921387, - "num_tokens": 31992517.0, - "step": 3579 - }, - { - "epoch": 2.7203647416413372, - "grad_norm": 2.363285541534424, - "learning_rate": 1.1850554043882329e-07, - "loss": 0.32415682077407837, - "mean_token_accuracy": 0.9004105925559998, - "num_tokens": 31998223.0, - "step": 3580 - }, - { - "epoch": 2.7211246200607904, - "grad_norm": 1.5022046566009521, - "learning_rate": 1.178691930017134e-07, - "loss": 0.2446850836277008, - "mean_token_accuracy": 0.9055813550949097, - "num_tokens": 32008038.0, - "step": 3581 - }, - { - "epoch": 2.721884498480243, - "grad_norm": 1.7016842365264893, - "learning_rate": 1.172345174357023e-07, - "loss": 0.356515109539032, - "mean_token_accuracy": 0.876318097114563, - "num_tokens": 32018738.0, - "step": 3582 - }, - { - "epoch": 2.722644376899696, - "grad_norm": 2.113873243331909, - "learning_rate": 1.1660151418622923e-07, - "loss": 0.24748530983924866, - "mean_token_accuracy": 0.9214030504226685, - "num_tokens": 32025225.0, - "step": 3583 - }, - { - "epoch": 2.723404255319149, - "grad_norm": 1.6737921237945557, - "learning_rate": 1.159701836975602e-07, - "loss": 0.30180150270462036, - "mean_token_accuracy": 0.9211363792419434, - "num_tokens": 32034579.0, - "step": 3584 - }, - { - "epoch": 2.7241641337386016, - "grad_norm": 1.4193580150604248, - "learning_rate": 1.153405264127877e-07, - "loss": 0.2939320504665375, - "mean_token_accuracy": 0.9005526304244995, - "num_tokens": 32046461.0, - "step": 3585 - }, - { - "epoch": 2.7249240121580547, - "grad_norm": 2.273599863052368, - "learning_rate": 1.1471254277382882e-07, - "loss": 0.3552356958389282, - "mean_token_accuracy": 0.8682018518447876, - "num_tokens": 32056210.0, - "step": 3586 - }, - { - "epoch": 2.725683890577508, - "grad_norm": 2.242373466491699, - "learning_rate": 1.1408623322142736e-07, - "loss": 0.37924283742904663, - "mean_token_accuracy": 0.8833099603652954, - "num_tokens": 32063545.0, - "step": 3587 - }, - { - "epoch": 2.7264437689969605, - "grad_norm": 2.039243459701538, - "learning_rate": 1.134615981951509e-07, - "loss": 0.29171228408813477, - "mean_token_accuracy": 0.8961814641952515, - "num_tokens": 32070870.0, - "step": 3588 - }, - { - "epoch": 2.727203647416413, - "grad_norm": 1.8081161975860596, - "learning_rate": 1.1283863813339263e-07, - "loss": 0.34568479657173157, - "mean_token_accuracy": 0.9093149900436401, - "num_tokens": 32078829.0, - "step": 3589 - }, - { - "epoch": 2.7279635258358663, - "grad_norm": 2.301534414291382, - "learning_rate": 1.1221735347336976e-07, - "loss": 0.32527366280555725, - "mean_token_accuracy": 0.8894226551055908, - "num_tokens": 32084533.0, - "step": 3590 - }, - { - "epoch": 2.728723404255319, - "grad_norm": 1.9389806985855103, - "learning_rate": 1.1159774465112433e-07, - "loss": 0.39770618081092834, - "mean_token_accuracy": 0.8613806962966919, - "num_tokens": 32092713.0, - "step": 3591 - }, - { - "epoch": 2.729483282674772, - "grad_norm": 1.6589549779891968, - "learning_rate": 1.1097981210152042e-07, - "loss": 0.3170590400695801, - "mean_token_accuracy": 0.8901652097702026, - "num_tokens": 32102904.0, - "step": 3592 - }, - { - "epoch": 2.730243161094225, - "grad_norm": 1.8090909719467163, - "learning_rate": 1.1036355625824808e-07, - "loss": 0.274291455745697, - "mean_token_accuracy": 0.9074428081512451, - "num_tokens": 32111009.0, - "step": 3593 - }, - { - "epoch": 2.731003039513678, - "grad_norm": 2.431757688522339, - "learning_rate": 1.0974897755381936e-07, - "loss": 0.30703026056289673, - "mean_token_accuracy": 0.9109988808631897, - "num_tokens": 32116173.0, - "step": 3594 - }, - { - "epoch": 2.7317629179331306, - "grad_norm": 1.8828567266464233, - "learning_rate": 1.0913607641956842e-07, - "loss": 0.34009286761283875, - "mean_token_accuracy": 0.8761146068572998, - "num_tokens": 32124273.0, - "step": 3595 - }, - { - "epoch": 2.7325227963525833, - "grad_norm": 1.2194745540618896, - "learning_rate": 1.0852485328565337e-07, - "loss": 0.2432229220867157, - "mean_token_accuracy": 0.8984386920928955, - "num_tokens": 32137430.0, - "step": 3596 - }, - { - "epoch": 2.7332826747720365, - "grad_norm": 2.3038880825042725, - "learning_rate": 1.0791530858105387e-07, - "loss": 0.2546696066856384, - "mean_token_accuracy": 0.9092214107513428, - "num_tokens": 32145207.0, - "step": 3597 - }, - { - "epoch": 2.7340425531914896, - "grad_norm": 2.807394504547119, - "learning_rate": 1.0730744273357213e-07, - "loss": 0.33576664328575134, - "mean_token_accuracy": 0.8793773651123047, - "num_tokens": 32150161.0, - "step": 3598 - }, - { - "epoch": 2.7348024316109423, - "grad_norm": 1.8207601308822632, - "learning_rate": 1.067012561698319e-07, - "loss": 0.43848833441734314, - "mean_token_accuracy": 0.8729845285415649, - "num_tokens": 32160180.0, - "step": 3599 - }, - { - "epoch": 2.735562310030395, - "grad_norm": 1.5954468250274658, - "learning_rate": 1.0609674931527786e-07, - "loss": 0.3471013307571411, - "mean_token_accuracy": 0.889906644821167, - "num_tokens": 32172442.0, - "step": 3600 - }, - { - "epoch": 2.736322188449848, - "grad_norm": 1.2474297285079956, - "learning_rate": 1.0549392259417646e-07, - "loss": 0.2967996299266815, - "mean_token_accuracy": 0.887985110282898, - "num_tokens": 32187624.0, - "step": 3601 - }, - { - "epoch": 2.737082066869301, - "grad_norm": 1.4285695552825928, - "learning_rate": 1.0489277642961481e-07, - "loss": 0.2793816924095154, - "mean_token_accuracy": 0.8948850631713867, - "num_tokens": 32199904.0, - "step": 3602 - }, - { - "epoch": 2.737841945288754, - "grad_norm": 1.4096852540969849, - "learning_rate": 1.0429331124350045e-07, - "loss": 0.39516502618789673, - "mean_token_accuracy": 0.8942514657974243, - "num_tokens": 32213145.0, - "step": 3603 - }, - { - "epoch": 2.7386018237082066, - "grad_norm": 1.4818166494369507, - "learning_rate": 1.0369552745656014e-07, - "loss": 0.3851013779640198, - "mean_token_accuracy": 0.8604148626327515, - "num_tokens": 32225576.0, - "step": 3604 - }, - { - "epoch": 2.7393617021276597, - "grad_norm": 2.0186386108398438, - "learning_rate": 1.0309942548834329e-07, - "loss": 0.2715086340904236, - "mean_token_accuracy": 0.9169677495956421, - "num_tokens": 32232808.0, - "step": 3605 - }, - { - "epoch": 2.7401215805471124, - "grad_norm": 2.3498101234436035, - "learning_rate": 1.0250500575721578e-07, - "loss": 0.2616893947124481, - "mean_token_accuracy": 0.9052878618240356, - "num_tokens": 32239209.0, - "step": 3606 - }, - { - "epoch": 2.740881458966565, - "grad_norm": 2.3760416507720947, - "learning_rate": 1.0191226868036419e-07, - "loss": 0.3654823303222656, - "mean_token_accuracy": 0.9066962003707886, - "num_tokens": 32245690.0, - "step": 3607 - }, - { - "epoch": 2.7416413373860182, - "grad_norm": 1.9187121391296387, - "learning_rate": 1.0132121467379574e-07, - "loss": 0.2764931321144104, - "mean_token_accuracy": 0.9288564920425415, - "num_tokens": 32252804.0, - "step": 3608 - }, - { - "epoch": 2.7424012158054714, - "grad_norm": 2.57564115524292, - "learning_rate": 1.0073184415233334e-07, - "loss": 0.2813187837600708, - "mean_token_accuracy": 0.890303909778595, - "num_tokens": 32258534.0, - "step": 3609 - }, - { - "epoch": 2.743161094224924, - "grad_norm": 2.0758004188537598, - "learning_rate": 1.0014415752962081e-07, - "loss": 0.29847270250320435, - "mean_token_accuracy": 0.8947038054466248, - "num_tokens": 32265373.0, - "step": 3610 - }, - { - "epoch": 2.7439209726443767, - "grad_norm": 3.005535840988159, - "learning_rate": 9.955815521811852e-08, - "loss": 0.2781291604042053, - "mean_token_accuracy": 0.899482250213623, - "num_tokens": 32269487.0, - "step": 3611 - }, - { - "epoch": 2.74468085106383, - "grad_norm": 2.131834030151367, - "learning_rate": 9.897383762910606e-08, - "loss": 0.2915271520614624, - "mean_token_accuracy": 0.8984331488609314, - "num_tokens": 32276242.0, - "step": 3612 - }, - { - "epoch": 2.7454407294832825, - "grad_norm": 2.048445463180542, - "learning_rate": 9.839120517267986e-08, - "loss": 0.38389909267425537, - "mean_token_accuracy": 0.8720065951347351, - "num_tokens": 32284956.0, - "step": 3613 - }, - { - "epoch": 2.7462006079027357, - "grad_norm": 3.3529200553894043, - "learning_rate": 9.781025825775392e-08, - "loss": 0.29694801568984985, - "mean_token_accuracy": 0.8991866111755371, - "num_tokens": 32289109.0, - "step": 3614 - }, - { - "epoch": 2.7469604863221884, - "grad_norm": 2.5099470615386963, - "learning_rate": 9.72309972920582e-08, - "loss": 0.2015802264213562, - "mean_token_accuracy": 0.9364612102508545, - "num_tokens": 32294163.0, - "step": 3615 - }, - { - "epoch": 2.7477203647416415, - "grad_norm": 1.7144349813461304, - "learning_rate": 9.665342268214167e-08, - "loss": 0.42185109853744507, - "mean_token_accuracy": 0.8469204902648926, - "num_tokens": 32304034.0, - "step": 3616 - }, - { - "epoch": 2.748480243161094, - "grad_norm": 1.7306944131851196, - "learning_rate": 9.607753483336812e-08, - "loss": 0.294491708278656, - "mean_token_accuracy": 0.8831486701965332, - "num_tokens": 32314079.0, - "step": 3617 - }, - { - "epoch": 2.749240121580547, - "grad_norm": 1.5339795351028442, - "learning_rate": 9.55033341499173e-08, - "loss": 0.4163019359111786, - "mean_token_accuracy": 0.8496603965759277, - "num_tokens": 32325707.0, - "step": 3618 - }, - { - "epoch": 2.75, - "grad_norm": 1.878015398979187, - "learning_rate": 9.493082103478519e-08, - "loss": 0.2632361650466919, - "mean_token_accuracy": 0.8944116830825806, - "num_tokens": 32333710.0, - "step": 3619 - }, - { - "epoch": 2.750759878419453, - "grad_norm": 1.771299958229065, - "learning_rate": 9.43599958897845e-08, - "loss": 0.3327634334564209, - "mean_token_accuracy": 0.8778671026229858, - "num_tokens": 32343311.0, - "step": 3620 - }, - { - "epoch": 2.751519756838906, - "grad_norm": 1.358282208442688, - "learning_rate": 9.379085911554148e-08, - "loss": 0.3822714686393738, - "mean_token_accuracy": 0.8586339354515076, - "num_tokens": 32361435.0, - "step": 3621 - }, - { - "epoch": 2.7522796352583585, - "grad_norm": 1.9158512353897095, - "learning_rate": 9.322341111149852e-08, - "loss": 0.23024609684944153, - "mean_token_accuracy": 0.9222040176391602, - "num_tokens": 32368371.0, - "step": 3622 - }, - { - "epoch": 2.7530395136778116, - "grad_norm": 1.247992753982544, - "learning_rate": 9.265765227591261e-08, - "loss": 0.3436150550842285, - "mean_token_accuracy": 0.8803039789199829, - "num_tokens": 32388723.0, - "step": 3623 - }, - { - "epoch": 2.7537993920972643, - "grad_norm": 1.913124918937683, - "learning_rate": 9.209358300585474e-08, - "loss": 0.35059863328933716, - "mean_token_accuracy": 0.875072717666626, - "num_tokens": 32397011.0, - "step": 3624 - }, - { - "epoch": 2.7545592705167175, - "grad_norm": 2.487434148788452, - "learning_rate": 9.153120369721047e-08, - "loss": 0.2234063446521759, - "mean_token_accuracy": 0.904019832611084, - "num_tokens": 32402316.0, - "step": 3625 - }, - { - "epoch": 2.75531914893617, - "grad_norm": 2.188255548477173, - "learning_rate": 9.09705147446796e-08, - "loss": 0.19389624893665314, - "mean_token_accuracy": 0.9302033185958862, - "num_tokens": 32408031.0, - "step": 3626 - }, - { - "epoch": 2.7560790273556233, - "grad_norm": 2.892735004425049, - "learning_rate": 9.041151654177488e-08, - "loss": 0.24316613376140594, - "mean_token_accuracy": 0.9222840070724487, - "num_tokens": 32412498.0, - "step": 3627 - }, - { - "epoch": 2.756838905775076, - "grad_norm": 2.6814024448394775, - "learning_rate": 8.985420948082329e-08, - "loss": 0.2725716233253479, - "mean_token_accuracy": 0.9069510698318481, - "num_tokens": 32417717.0, - "step": 3628 - }, - { - "epoch": 2.7575987841945286, - "grad_norm": 2.8956947326660156, - "learning_rate": 8.929859395296365e-08, - "loss": 0.3466540575027466, - "mean_token_accuracy": 0.8771743774414062, - "num_tokens": 32422425.0, - "step": 3629 - }, - { - "epoch": 2.7583586626139818, - "grad_norm": 2.393306016921997, - "learning_rate": 8.874467034814816e-08, - "loss": 0.40261518955230713, - "mean_token_accuracy": 0.8902627229690552, - "num_tokens": 32428512.0, - "step": 3630 - }, - { - "epoch": 2.759118541033435, - "grad_norm": 2.201388359069824, - "learning_rate": 8.819243905514308e-08, - "loss": 0.28923481702804565, - "mean_token_accuracy": 0.8948091268539429, - "num_tokens": 32434316.0, - "step": 3631 - }, - { - "epoch": 2.7598784194528876, - "grad_norm": 1.9007173776626587, - "learning_rate": 8.764190046152421e-08, - "loss": 0.3775410056114197, - "mean_token_accuracy": 0.8737541437149048, - "num_tokens": 32442785.0, - "step": 3632 - }, - { - "epoch": 2.7606382978723403, - "grad_norm": 1.0914241075515747, - "learning_rate": 8.709305495368137e-08, - "loss": 0.27528852224349976, - "mean_token_accuracy": 0.8981513977050781, - "num_tokens": 32462749.0, - "step": 3633 - }, - { - "epoch": 2.7613981762917934, - "grad_norm": 2.024019718170166, - "learning_rate": 8.654590291681531e-08, - "loss": 0.3178071677684784, - "mean_token_accuracy": 0.8825376033782959, - "num_tokens": 32470041.0, - "step": 3634 - }, - { - "epoch": 2.762158054711246, - "grad_norm": 1.038554072380066, - "learning_rate": 8.600044473493856e-08, - "loss": 0.26435115933418274, - "mean_token_accuracy": 0.9002813100814819, - "num_tokens": 32492633.0, - "step": 3635 - }, - { - "epoch": 2.762917933130699, - "grad_norm": 3.143336057662964, - "learning_rate": 8.545668079087438e-08, - "loss": 0.356077641248703, - "mean_token_accuracy": 0.890540361404419, - "num_tokens": 32497085.0, - "step": 3636 - }, - { - "epoch": 2.763677811550152, - "grad_norm": 1.8176860809326172, - "learning_rate": 8.491461146625774e-08, - "loss": 0.42660102248191833, - "mean_token_accuracy": 0.8467463254928589, - "num_tokens": 32506375.0, - "step": 3637 - }, - { - "epoch": 2.764437689969605, - "grad_norm": 1.7116483449935913, - "learning_rate": 8.437423714153292e-08, - "loss": 0.3794213533401489, - "mean_token_accuracy": 0.8674054145812988, - "num_tokens": 32517443.0, - "step": 3638 - }, - { - "epoch": 2.7651975683890577, - "grad_norm": 3.004796266555786, - "learning_rate": 8.383555819595601e-08, - "loss": 0.3199142515659332, - "mean_token_accuracy": 0.8825819492340088, - "num_tokens": 32527003.0, - "step": 3639 - }, - { - "epoch": 2.7659574468085104, - "grad_norm": 2.6139073371887207, - "learning_rate": 8.329857500759291e-08, - "loss": 0.4262070059776306, - "mean_token_accuracy": 0.8643308281898499, - "num_tokens": 32533227.0, - "step": 3640 - }, - { - "epoch": 2.7667173252279635, - "grad_norm": 1.4850772619247437, - "learning_rate": 8.2763287953318e-08, - "loss": 0.4211199879646301, - "mean_token_accuracy": 0.8522083759307861, - "num_tokens": 32546463.0, - "step": 3641 - }, - { - "epoch": 2.7674772036474167, - "grad_norm": 2.1967451572418213, - "learning_rate": 8.22296974088177e-08, - "loss": 0.32154369354248047, - "mean_token_accuracy": 0.9058319926261902, - "num_tokens": 32554292.0, - "step": 3642 - }, - { - "epoch": 2.7682370820668694, - "grad_norm": 1.4377225637435913, - "learning_rate": 8.169780374858577e-08, - "loss": 0.34665489196777344, - "mean_token_accuracy": 0.8763554096221924, - "num_tokens": 32567357.0, - "step": 3643 - }, - { - "epoch": 2.768996960486322, - "grad_norm": 1.8216571807861328, - "learning_rate": 8.116760734592527e-08, - "loss": 0.39765921235084534, - "mean_token_accuracy": 0.8595637679100037, - "num_tokens": 32577681.0, - "step": 3644 - }, - { - "epoch": 2.769756838905775, - "grad_norm": 3.732693672180176, - "learning_rate": 8.063910857294881e-08, - "loss": 0.16449159383773804, - "mean_token_accuracy": 0.9406331777572632, - "num_tokens": 32580792.0, - "step": 3645 - }, - { - "epoch": 2.770516717325228, - "grad_norm": 1.4248076677322388, - "learning_rate": 8.011230780057749e-08, - "loss": 0.43648213148117065, - "mean_token_accuracy": 0.8409627676010132, - "num_tokens": 32596950.0, - "step": 3646 - }, - { - "epoch": 2.771276595744681, - "grad_norm": 1.5802161693572998, - "learning_rate": 7.958720539853971e-08, - "loss": 0.41201114654541016, - "mean_token_accuracy": 0.8678973913192749, - "num_tokens": 32608870.0, - "step": 3647 - }, - { - "epoch": 2.7720364741641337, - "grad_norm": 1.864032506942749, - "learning_rate": 7.906380173537315e-08, - "loss": 0.3839274048805237, - "mean_token_accuracy": 0.863370418548584, - "num_tokens": 32619357.0, - "step": 3648 - }, - { - "epoch": 2.772796352583587, - "grad_norm": 2.0040485858917236, - "learning_rate": 7.854209717842231e-08, - "loss": 0.4682219624519348, - "mean_token_accuracy": 0.8341292142868042, - "num_tokens": 32628659.0, - "step": 3649 - }, - { - "epoch": 2.7735562310030395, - "grad_norm": 3.2517287731170654, - "learning_rate": 7.80220920938396e-08, - "loss": 0.3697377145290375, - "mean_token_accuracy": 0.8937886357307434, - "num_tokens": 32632724.0, - "step": 3650 - }, - { - "epoch": 2.774316109422492, - "grad_norm": 1.437434434890747, - "learning_rate": 7.750378684658444e-08, - "loss": 0.21713104844093323, - "mean_token_accuracy": 0.9223493337631226, - "num_tokens": 32643085.0, - "step": 3651 - }, - { - "epoch": 2.7750759878419453, - "grad_norm": 1.3312400579452515, - "learning_rate": 7.698718180042392e-08, - "loss": 0.3078494668006897, - "mean_token_accuracy": 0.8865747451782227, - "num_tokens": 32657205.0, - "step": 3652 - }, - { - "epoch": 2.7758358662613984, - "grad_norm": 1.3009766340255737, - "learning_rate": 7.647227731793078e-08, - "loss": 0.33374494314193726, - "mean_token_accuracy": 0.8755972385406494, - "num_tokens": 32670785.0, - "step": 3653 - }, - { - "epoch": 2.776595744680851, - "grad_norm": 1.7956385612487793, - "learning_rate": 7.595907376048512e-08, - "loss": 0.3185005486011505, - "mean_token_accuracy": 0.896104097366333, - "num_tokens": 32679376.0, - "step": 3654 - }, - { - "epoch": 2.777355623100304, - "grad_norm": 1.9820408821105957, - "learning_rate": 7.544757148827297e-08, - "loss": 0.34602630138397217, - "mean_token_accuracy": 0.9006669521331787, - "num_tokens": 32687327.0, - "step": 3655 - }, - { - "epoch": 2.778115501519757, - "grad_norm": 1.447498083114624, - "learning_rate": 7.493777086028608e-08, - "loss": 0.29633957147598267, - "mean_token_accuracy": 0.8827477693557739, - "num_tokens": 32698669.0, - "step": 3656 - }, - { - "epoch": 2.7788753799392096, - "grad_norm": 1.195237159729004, - "learning_rate": 7.442967223432212e-08, - "loss": 0.25846078991889954, - "mean_token_accuracy": 0.932551920413971, - "num_tokens": 32713411.0, - "step": 3657 - }, - { - "epoch": 2.7796352583586628, - "grad_norm": 1.4306368827819824, - "learning_rate": 7.392327596698474e-08, - "loss": 0.22794288396835327, - "mean_token_accuracy": 0.9128783941268921, - "num_tokens": 32724629.0, - "step": 3658 - }, - { - "epoch": 2.7803951367781155, - "grad_norm": 3.5105903148651123, - "learning_rate": 7.341858241368182e-08, - "loss": 0.21695205569267273, - "mean_token_accuracy": 0.9189575910568237, - "num_tokens": 32728392.0, - "step": 3659 - }, - { - "epoch": 2.7811550151975686, - "grad_norm": 2.8782589435577393, - "learning_rate": 7.291559192862701e-08, - "loss": 0.3374413847923279, - "mean_token_accuracy": 0.9080451726913452, - "num_tokens": 32733126.0, - "step": 3660 - }, - { - "epoch": 2.7819148936170213, - "grad_norm": 1.9232850074768066, - "learning_rate": 7.24143048648382e-08, - "loss": 0.2707790732383728, - "mean_token_accuracy": 0.9045628309249878, - "num_tokens": 32741378.0, - "step": 3661 - }, - { - "epoch": 2.782674772036474, - "grad_norm": 1.1166657209396362, - "learning_rate": 7.19147215741381e-08, - "loss": 0.2668237090110779, - "mean_token_accuracy": 0.8920862674713135, - "num_tokens": 32760317.0, - "step": 3662 - }, - { - "epoch": 2.783434650455927, - "grad_norm": 3.9177591800689697, - "learning_rate": 7.141684240715374e-08, - "loss": 0.18272298574447632, - "mean_token_accuracy": 0.94575434923172, - "num_tokens": 32763663.0, - "step": 3663 - }, - { - "epoch": 2.78419452887538, - "grad_norm": 1.9616899490356445, - "learning_rate": 7.092066771331507e-08, - "loss": 0.20110884308815002, - "mean_token_accuracy": 0.9169102907180786, - "num_tokens": 32770243.0, - "step": 3664 - }, - { - "epoch": 2.784954407294833, - "grad_norm": 3.5950927734375, - "learning_rate": 7.042619784085741e-08, - "loss": 0.24979421496391296, - "mean_token_accuracy": 0.9095007181167603, - "num_tokens": 32773985.0, - "step": 3665 - }, - { - "epoch": 2.7857142857142856, - "grad_norm": 1.8824433088302612, - "learning_rate": 6.993343313681872e-08, - "loss": 0.32540541887283325, - "mean_token_accuracy": 0.8754172921180725, - "num_tokens": 32782040.0, - "step": 3666 - }, - { - "epoch": 2.7864741641337387, - "grad_norm": 1.7720941305160522, - "learning_rate": 6.944237394703985e-08, - "loss": 0.2930932641029358, - "mean_token_accuracy": 0.8913610577583313, - "num_tokens": 32790338.0, - "step": 3667 - }, - { - "epoch": 2.7872340425531914, - "grad_norm": 1.6130414009094238, - "learning_rate": 6.895302061616483e-08, - "loss": 0.35470184683799744, - "mean_token_accuracy": 0.8745495676994324, - "num_tokens": 32801160.0, - "step": 3668 - }, - { - "epoch": 2.7879939209726445, - "grad_norm": 1.315376877784729, - "learning_rate": 6.846537348764116e-08, - "loss": 0.33905792236328125, - "mean_token_accuracy": 0.8629679679870605, - "num_tokens": 32816508.0, - "step": 3669 - }, - { - "epoch": 2.788753799392097, - "grad_norm": 1.9508394002914429, - "learning_rate": 6.797943290371839e-08, - "loss": 0.27722638845443726, - "mean_token_accuracy": 0.8903636932373047, - "num_tokens": 32824029.0, - "step": 3670 - }, - { - "epoch": 2.7895136778115504, - "grad_norm": 0.9335530996322632, - "learning_rate": 6.74951992054479e-08, - "loss": 0.3004249632358551, - "mean_token_accuracy": 0.887278139591217, - "num_tokens": 32849091.0, - "step": 3671 - }, - { - "epoch": 2.790273556231003, - "grad_norm": 1.8353229761123657, - "learning_rate": 6.701267273268392e-08, - "loss": 0.3471749424934387, - "mean_token_accuracy": 0.8823778629302979, - "num_tokens": 32858285.0, - "step": 3672 - }, - { - "epoch": 2.7910334346504557, - "grad_norm": 2.469905138015747, - "learning_rate": 6.653185382408195e-08, - "loss": 0.27492985129356384, - "mean_token_accuracy": 0.898033857345581, - "num_tokens": 32863568.0, - "step": 3673 - }, - { - "epoch": 2.791793313069909, - "grad_norm": 1.861342430114746, - "learning_rate": 6.605274281709929e-08, - "loss": 0.4201383590698242, - "mean_token_accuracy": 0.8511666655540466, - "num_tokens": 32873794.0, - "step": 3674 - }, - { - "epoch": 2.7925531914893615, - "grad_norm": 1.6716010570526123, - "learning_rate": 6.557534004799443e-08, - "loss": 0.31345364451408386, - "mean_token_accuracy": 0.8953241109848022, - "num_tokens": 32883515.0, - "step": 3675 - }, - { - "epoch": 2.7933130699088147, - "grad_norm": 1.566288709640503, - "learning_rate": 6.509964585182688e-08, - "loss": 0.36333587765693665, - "mean_token_accuracy": 0.866706132888794, - "num_tokens": 32895232.0, - "step": 3676 - }, - { - "epoch": 2.7940729483282674, - "grad_norm": 1.5501067638397217, - "learning_rate": 6.462566056245761e-08, - "loss": 0.2846035957336426, - "mean_token_accuracy": 0.9041277766227722, - "num_tokens": 32903854.0, - "step": 3677 - }, - { - "epoch": 2.7948328267477205, - "grad_norm": 2.15285325050354, - "learning_rate": 6.415338451254722e-08, - "loss": 0.35233989357948303, - "mean_token_accuracy": 0.8840795159339905, - "num_tokens": 32911633.0, - "step": 3678 - }, - { - "epoch": 2.795592705167173, - "grad_norm": 1.3108829259872437, - "learning_rate": 6.368281803355692e-08, - "loss": 0.3379764258861542, - "mean_token_accuracy": 0.9114458560943604, - "num_tokens": 32925455.0, - "step": 3679 - }, - { - "epoch": 2.7963525835866263, - "grad_norm": 1.818579912185669, - "learning_rate": 6.321396145574948e-08, - "loss": 0.32847997546195984, - "mean_token_accuracy": 0.8970182538032532, - "num_tokens": 32935029.0, - "step": 3680 - }, - { - "epoch": 2.797112462006079, - "grad_norm": 3.7173373699188232, - "learning_rate": 6.274681510818587e-08, - "loss": 0.18795353174209595, - "mean_token_accuracy": 0.9429396986961365, - "num_tokens": 32938652.0, - "step": 3681 - }, - { - "epoch": 2.797872340425532, - "grad_norm": 2.2997212409973145, - "learning_rate": 6.228137931872713e-08, - "loss": 0.34515100717544556, - "mean_token_accuracy": 0.878103494644165, - "num_tokens": 32945409.0, - "step": 3682 - }, - { - "epoch": 2.798632218844985, - "grad_norm": 2.424675941467285, - "learning_rate": 6.18176544140342e-08, - "loss": 0.2552722990512848, - "mean_token_accuracy": 0.9087961316108704, - "num_tokens": 32950721.0, - "step": 3683 - }, - { - "epoch": 2.7993920972644375, - "grad_norm": 2.662060022354126, - "learning_rate": 6.135564071956729e-08, - "loss": 0.2554262578487396, - "mean_token_accuracy": 0.9034075736999512, - "num_tokens": 32955891.0, - "step": 3684 - }, - { - "epoch": 2.8001519756838906, - "grad_norm": 1.1945017576217651, - "learning_rate": 6.089533855958508e-08, - "loss": 0.36223694682121277, - "mean_token_accuracy": 0.8567380905151367, - "num_tokens": 32971543.0, - "step": 3685 - }, - { - "epoch": 2.8009118541033433, - "grad_norm": 1.2724100351333618, - "learning_rate": 6.043674825714607e-08, - "loss": 0.35224610567092896, - "mean_token_accuracy": 0.8696926832199097, - "num_tokens": 32986452.0, - "step": 3686 - }, - { - "epoch": 2.8016717325227964, - "grad_norm": 1.3042409420013428, - "learning_rate": 5.997987013410533e-08, - "loss": 0.38680803775787354, - "mean_token_accuracy": 0.8600257635116577, - "num_tokens": 33005534.0, - "step": 3687 - }, - { - "epoch": 2.802431610942249, - "grad_norm": 2.448430299758911, - "learning_rate": 5.9524704511118305e-08, - "loss": 0.13345648348331451, - "mean_token_accuracy": 0.9592865705490112, - "num_tokens": 33009403.0, - "step": 3688 - }, - { - "epoch": 2.8031914893617023, - "grad_norm": 1.1455037593841553, - "learning_rate": 5.9071251707638056e-08, - "loss": 0.3144465982913971, - "mean_token_accuracy": 0.8841190338134766, - "num_tokens": 33028129.0, - "step": 3689 - }, - { - "epoch": 2.803951367781155, - "grad_norm": 2.0947425365448, - "learning_rate": 5.861951204191446e-08, - "loss": 0.36041027307510376, - "mean_token_accuracy": 0.8605015873908997, - "num_tokens": 33036379.0, - "step": 3690 - }, - { - "epoch": 2.8047112462006076, - "grad_norm": 3.1552155017852783, - "learning_rate": 5.8169485830996134e-08, - "loss": 0.32727721333503723, - "mean_token_accuracy": 0.9110068678855896, - "num_tokens": 33040276.0, - "step": 3691 - }, - { - "epoch": 2.8054711246200608, - "grad_norm": 2.5555851459503174, - "learning_rate": 5.772117339072902e-08, - "loss": 0.23542895913124084, - "mean_token_accuracy": 0.91229647397995, - "num_tokens": 33045308.0, - "step": 3692 - }, - { - "epoch": 2.806231003039514, - "grad_norm": 2.4970197677612305, - "learning_rate": 5.7274575035755896e-08, - "loss": 0.13501018285751343, - "mean_token_accuracy": 0.9495668411254883, - "num_tokens": 33049012.0, - "step": 3693 - }, - { - "epoch": 2.8069908814589666, - "grad_norm": 3.25179123878479, - "learning_rate": 5.68296910795163e-08, - "loss": 0.39757871627807617, - "mean_token_accuracy": 0.8692524433135986, - "num_tokens": 33053004.0, - "step": 3694 - }, - { - "epoch": 2.8077507598784193, - "grad_norm": 2.4152987003326416, - "learning_rate": 5.6386521834247696e-08, - "loss": 0.3562552332878113, - "mean_token_accuracy": 0.8817118406295776, - "num_tokens": 33059557.0, - "step": 3695 - }, - { - "epoch": 2.8085106382978724, - "grad_norm": 2.051687002182007, - "learning_rate": 5.5945067610982395e-08, - "loss": 0.5281018018722534, - "mean_token_accuracy": 0.8174080848693848, - "num_tokens": 33068691.0, - "step": 3696 - }, - { - "epoch": 2.809270516717325, - "grad_norm": 3.8002891540527344, - "learning_rate": 5.550532871955061e-08, - "loss": 0.20866292715072632, - "mean_token_accuracy": 0.9262990951538086, - "num_tokens": 33072085.0, - "step": 3697 - }, - { - "epoch": 2.810030395136778, - "grad_norm": 2.3774707317352295, - "learning_rate": 5.506730546857797e-08, - "loss": 0.2632027566432953, - "mean_token_accuracy": 0.9251352548599243, - "num_tokens": 33078720.0, - "step": 3698 - }, - { - "epoch": 2.810790273556231, - "grad_norm": 1.3897415399551392, - "learning_rate": 5.463099816548578e-08, - "loss": 0.3936246931552887, - "mean_token_accuracy": 0.8637404441833496, - "num_tokens": 33092660.0, - "step": 3699 - }, - { - "epoch": 2.811550151975684, - "grad_norm": 1.5614900588989258, - "learning_rate": 5.419640711649188e-08, - "loss": 0.44372743368148804, - "mean_token_accuracy": 0.8500189185142517, - "num_tokens": 33104431.0, - "step": 3700 - }, - { - "epoch": 2.8123100303951367, - "grad_norm": 1.466921329498291, - "learning_rate": 5.376353262660811e-08, - "loss": 0.3102647066116333, - "mean_token_accuracy": 0.8741628527641296, - "num_tokens": 33115290.0, - "step": 3701 - }, - { - "epoch": 2.8130699088145894, - "grad_norm": 1.6993112564086914, - "learning_rate": 5.333237499964283e-08, - "loss": 0.4017091989517212, - "mean_token_accuracy": 0.865143358707428, - "num_tokens": 33126710.0, - "step": 3702 - }, - { - "epoch": 2.8138297872340425, - "grad_norm": 2.2112064361572266, - "learning_rate": 5.290293453819956e-08, - "loss": 0.3109806776046753, - "mean_token_accuracy": 0.9097060561180115, - "num_tokens": 33133186.0, - "step": 3703 - }, - { - "epoch": 2.8145896656534957, - "grad_norm": 1.9934327602386475, - "learning_rate": 5.247521154367552e-08, - "loss": 0.35044047236442566, - "mean_token_accuracy": 0.874421238899231, - "num_tokens": 33140329.0, - "step": 3704 - }, - { - "epoch": 2.8153495440729484, - "grad_norm": 2.815687656402588, - "learning_rate": 5.2049206316263366e-08, - "loss": 0.2516332268714905, - "mean_token_accuracy": 0.9180612564086914, - "num_tokens": 33144861.0, - "step": 3705 - }, - { - "epoch": 2.816109422492401, - "grad_norm": 1.7479608058929443, - "learning_rate": 5.162491915495005e-08, - "loss": 0.16342511773109436, - "mean_token_accuracy": 0.9410310983657837, - "num_tokens": 33151936.0, - "step": 3706 - }, - { - "epoch": 2.816869300911854, - "grad_norm": 1.3695951700210571, - "learning_rate": 5.120235035751653e-08, - "loss": 0.2908460199832916, - "mean_token_accuracy": 0.9211517572402954, - "num_tokens": 33164151.0, - "step": 3707 - }, - { - "epoch": 2.817629179331307, - "grad_norm": 2.370861768722534, - "learning_rate": 5.0781500220537797e-08, - "loss": 0.26081186532974243, - "mean_token_accuracy": 0.9090365171432495, - "num_tokens": 33169551.0, - "step": 3708 - }, - { - "epoch": 2.81838905775076, - "grad_norm": 1.627031922340393, - "learning_rate": 5.036236903938285e-08, - "loss": 0.2977932393550873, - "mean_token_accuracy": 0.9078235626220703, - "num_tokens": 33179586.0, - "step": 3709 - }, - { - "epoch": 2.8191489361702127, - "grad_norm": 1.830381155014038, - "learning_rate": 4.9944957108213896e-08, - "loss": 0.2239128053188324, - "mean_token_accuracy": 0.9216980934143066, - "num_tokens": 33186754.0, - "step": 3710 - }, - { - "epoch": 2.819908814589666, - "grad_norm": 2.419703245162964, - "learning_rate": 4.952926471998687e-08, - "loss": 0.3302939832210541, - "mean_token_accuracy": 0.9000803232192993, - "num_tokens": 33192512.0, - "step": 3711 - }, - { - "epoch": 2.8206686930091185, - "grad_norm": 2.2166857719421387, - "learning_rate": 4.911529216645089e-08, - "loss": 0.2880767285823822, - "mean_token_accuracy": 0.9058420658111572, - "num_tokens": 33198274.0, - "step": 3712 - }, - { - "epoch": 2.821428571428571, - "grad_norm": 1.357695460319519, - "learning_rate": 4.8703039738147165e-08, - "loss": 0.38549065589904785, - "mean_token_accuracy": 0.8689560890197754, - "num_tokens": 33213015.0, - "step": 3713 - }, - { - "epoch": 2.8221884498480243, - "grad_norm": 1.3445006608963013, - "learning_rate": 4.829250772441091e-08, - "loss": 0.28673315048217773, - "mean_token_accuracy": 0.8871713876724243, - "num_tokens": 33226895.0, - "step": 3714 - }, - { - "epoch": 2.8229483282674774, - "grad_norm": 2.043430805206299, - "learning_rate": 4.788369641336943e-08, - "loss": 0.27235424518585205, - "mean_token_accuracy": 0.9001829624176025, - "num_tokens": 33233991.0, - "step": 3715 - }, - { - "epoch": 2.82370820668693, - "grad_norm": 3.290034294128418, - "learning_rate": 4.7476606091941544e-08, - "loss": 0.3277619481086731, - "mean_token_accuracy": 0.9064863324165344, - "num_tokens": 33238393.0, - "step": 3716 - }, - { - "epoch": 2.824468085106383, - "grad_norm": 3.1663918495178223, - "learning_rate": 4.707123704583927e-08, - "loss": 0.2841528058052063, - "mean_token_accuracy": 0.9187209606170654, - "num_tokens": 33242428.0, - "step": 3717 - }, - { - "epoch": 2.825227963525836, - "grad_norm": 1.2812966108322144, - "learning_rate": 4.6667589559566405e-08, - "loss": 0.4020092785358429, - "mean_token_accuracy": 0.8751412630081177, - "num_tokens": 33257996.0, - "step": 3718 - }, - { - "epoch": 2.8259878419452886, - "grad_norm": 1.4390029907226562, - "learning_rate": 4.626566391641774e-08, - "loss": 0.44845378398895264, - "mean_token_accuracy": 0.8416492938995361, - "num_tokens": 33271661.0, - "step": 3719 - }, - { - "epoch": 2.8267477203647418, - "grad_norm": 1.5283807516098022, - "learning_rate": 4.586546039848094e-08, - "loss": 0.28856372833251953, - "mean_token_accuracy": 0.8961426019668579, - "num_tokens": 33282969.0, - "step": 3720 - }, - { - "epoch": 2.8275075987841944, - "grad_norm": 1.5666929483413696, - "learning_rate": 4.546697928663357e-08, - "loss": 0.3489445149898529, - "mean_token_accuracy": 0.8704522848129272, - "num_tokens": 33293549.0, - "step": 3721 - }, - { - "epoch": 2.8282674772036476, - "grad_norm": 1.6343169212341309, - "learning_rate": 4.5070220860545244e-08, - "loss": 0.3505254089832306, - "mean_token_accuracy": 0.8735896348953247, - "num_tokens": 33304821.0, - "step": 3722 - }, - { - "epoch": 2.8290273556231003, - "grad_norm": 1.963257074356079, - "learning_rate": 4.467518539867655e-08, - "loss": 0.3180759847164154, - "mean_token_accuracy": 0.8902066946029663, - "num_tokens": 33312313.0, - "step": 3723 - }, - { - "epoch": 2.829787234042553, - "grad_norm": 3.3562021255493164, - "learning_rate": 4.428187317827848e-08, - "loss": 0.23085635900497437, - "mean_token_accuracy": 0.9242620468139648, - "num_tokens": 33315831.0, - "step": 3724 - }, - { - "epoch": 2.830547112462006, - "grad_norm": 1.7402317523956299, - "learning_rate": 4.3890284475392175e-08, - "loss": 0.27766430377960205, - "mean_token_accuracy": 0.8943138122558594, - "num_tokens": 33324982.0, - "step": 3725 - }, - { - "epoch": 2.831306990881459, - "grad_norm": 1.6835107803344727, - "learning_rate": 4.350041956485029e-08, - "loss": 0.35358738899230957, - "mean_token_accuracy": 0.8683137893676758, - "num_tokens": 33334979.0, - "step": 3726 - }, - { - "epoch": 2.832066869300912, - "grad_norm": 2.232856035232544, - "learning_rate": 4.311227872027479e-08, - "loss": 0.3305876851081848, - "mean_token_accuracy": 0.885346531867981, - "num_tokens": 33341951.0, - "step": 3727 - }, - { - "epoch": 2.8328267477203646, - "grad_norm": 1.763230800628662, - "learning_rate": 4.272586221407776e-08, - "loss": 0.3677369952201843, - "mean_token_accuracy": 0.8810771703720093, - "num_tokens": 33351110.0, - "step": 3728 - }, - { - "epoch": 2.8335866261398177, - "grad_norm": 1.3161970376968384, - "learning_rate": 4.2341170317461433e-08, - "loss": 0.4191834628582001, - "mean_token_accuracy": 0.8625809550285339, - "num_tokens": 33368231.0, - "step": 3729 - }, - { - "epoch": 2.8343465045592704, - "grad_norm": 2.151383399963379, - "learning_rate": 4.1958203300417056e-08, - "loss": 0.30521994829177856, - "mean_token_accuracy": 0.8904989957809448, - "num_tokens": 33374755.0, - "step": 3730 - }, - { - "epoch": 2.8351063829787235, - "grad_norm": 1.2751890420913696, - "learning_rate": 4.1576961431726016e-08, - "loss": 0.2024286538362503, - "mean_token_accuracy": 0.9254995584487915, - "num_tokens": 33385820.0, - "step": 3731 - }, - { - "epoch": 2.835866261398176, - "grad_norm": 2.5229005813598633, - "learning_rate": 4.119744497895817e-08, - "loss": 0.2631904184818268, - "mean_token_accuracy": 0.9213854074478149, - "num_tokens": 33390577.0, - "step": 3732 - }, - { - "epoch": 2.8366261398176293, - "grad_norm": 1.3829402923583984, - "learning_rate": 4.0819654208472947e-08, - "loss": 0.3373589813709259, - "mean_token_accuracy": 0.8810330629348755, - "num_tokens": 33404300.0, - "step": 3733 - }, - { - "epoch": 2.837386018237082, - "grad_norm": 1.395129919052124, - "learning_rate": 4.044358938541853e-08, - "loss": 0.27040547132492065, - "mean_token_accuracy": 0.8935626745223999, - "num_tokens": 33418071.0, - "step": 3734 - }, - { - "epoch": 2.8381458966565347, - "grad_norm": 2.4185354709625244, - "learning_rate": 4.006925077373158e-08, - "loss": 0.2641582489013672, - "mean_token_accuracy": 0.9196245670318604, - "num_tokens": 33423213.0, - "step": 3735 - }, - { - "epoch": 2.838905775075988, - "grad_norm": 1.9432255029678345, - "learning_rate": 3.969663863613721e-08, - "loss": 0.31337353587150574, - "mean_token_accuracy": 0.886800229549408, - "num_tokens": 33432442.0, - "step": 3736 - }, - { - "epoch": 2.839665653495441, - "grad_norm": 1.1473867893218994, - "learning_rate": 3.9325753234149276e-08, - "loss": 0.3156060576438904, - "mean_token_accuracy": 0.8809531331062317, - "num_tokens": 33452184.0, - "step": 3737 - }, - { - "epoch": 2.8404255319148937, - "grad_norm": 2.233121633529663, - "learning_rate": 3.8956594828069295e-08, - "loss": 0.31154608726501465, - "mean_token_accuracy": 0.8883147239685059, - "num_tokens": 33458643.0, - "step": 3738 - }, - { - "epoch": 2.8411854103343464, - "grad_norm": 2.165466070175171, - "learning_rate": 3.8589163676986674e-08, - "loss": 0.38480815291404724, - "mean_token_accuracy": 0.8609665036201477, - "num_tokens": 33466465.0, - "step": 3739 - }, - { - "epoch": 2.8419452887537995, - "grad_norm": 3.5072174072265625, - "learning_rate": 3.822346003877875e-08, - "loss": 0.45201557874679565, - "mean_token_accuracy": 0.8519665002822876, - "num_tokens": 33470826.0, - "step": 3740 - }, - { - "epoch": 2.842705167173252, - "grad_norm": 2.2038586139678955, - "learning_rate": 3.785948417011076e-08, - "loss": 0.34780675172805786, - "mean_token_accuracy": 0.8806177377700806, - "num_tokens": 33478706.0, - "step": 3741 - }, - { - "epoch": 2.8434650455927053, - "grad_norm": 1.8423243761062622, - "learning_rate": 3.749723632643476e-08, - "loss": 0.2681577205657959, - "mean_token_accuracy": 0.9055651426315308, - "num_tokens": 33486200.0, - "step": 3742 - }, - { - "epoch": 2.844224924012158, - "grad_norm": 1.3372201919555664, - "learning_rate": 3.713671676199016e-08, - "loss": 0.3277212381362915, - "mean_token_accuracy": 0.8801225423812866, - "num_tokens": 33499465.0, - "step": 3743 - }, - { - "epoch": 2.844984802431611, - "grad_norm": 2.303901195526123, - "learning_rate": 3.677792572980371e-08, - "loss": 0.2349717617034912, - "mean_token_accuracy": 0.9109916090965271, - "num_tokens": 33505491.0, - "step": 3744 - }, - { - "epoch": 2.845744680851064, - "grad_norm": 2.1374688148498535, - "learning_rate": 3.642086348168844e-08, - "loss": 0.3567136526107788, - "mean_token_accuracy": 0.8669205904006958, - "num_tokens": 33512665.0, - "step": 3745 - }, - { - "epoch": 2.8465045592705165, - "grad_norm": 3.476426362991333, - "learning_rate": 3.6065530268244445e-08, - "loss": 0.3189643621444702, - "mean_token_accuracy": 0.882624626159668, - "num_tokens": 33516449.0, - "step": 3746 - }, - { - "epoch": 2.8472644376899696, - "grad_norm": 1.094572901725769, - "learning_rate": 3.5711926338858335e-08, - "loss": 0.25354239344596863, - "mean_token_accuracy": 0.9008959531784058, - "num_tokens": 33536298.0, - "step": 3747 - }, - { - "epoch": 2.8480243161094227, - "grad_norm": 1.375033974647522, - "learning_rate": 3.536005194170328e-08, - "loss": 0.2859119772911072, - "mean_token_accuracy": 0.8998885154724121, - "num_tokens": 33548861.0, - "step": 3748 - }, - { - "epoch": 2.8487841945288754, - "grad_norm": 1.96660578250885, - "learning_rate": 3.5009907323737826e-08, - "loss": 0.35728299617767334, - "mean_token_accuracy": 0.8976923227310181, - "num_tokens": 33556270.0, - "step": 3749 - }, - { - "epoch": 2.849544072948328, - "grad_norm": 2.8434062004089355, - "learning_rate": 3.466149273070707e-08, - "loss": 0.25592705607414246, - "mean_token_accuracy": 0.9228044748306274, - "num_tokens": 33560603.0, - "step": 3750 - }, - { - "epoch": 2.8503039513677813, - "grad_norm": 2.7658159732818604, - "learning_rate": 3.431480840714152e-08, - "loss": 0.33110958337783813, - "mean_token_accuracy": 0.8761162161827087, - "num_tokens": 33565428.0, - "step": 3751 - }, - { - "epoch": 2.851063829787234, - "grad_norm": 3.696040391921997, - "learning_rate": 3.396985459635821e-08, - "loss": 0.29301607608795166, - "mean_token_accuracy": 0.9034254550933838, - "num_tokens": 33568866.0, - "step": 3752 - }, - { - "epoch": 2.851823708206687, - "grad_norm": 1.8923646211624146, - "learning_rate": 3.3626631540458754e-08, - "loss": 0.3817586600780487, - "mean_token_accuracy": 0.8635997772216797, - "num_tokens": 33578141.0, - "step": 3753 - }, - { - "epoch": 2.8525835866261398, - "grad_norm": 1.6717027425765991, - "learning_rate": 3.328513948032991e-08, - "loss": 0.37302929162979126, - "mean_token_accuracy": 0.8670454025268555, - "num_tokens": 33588694.0, - "step": 3754 - }, - { - "epoch": 2.853343465045593, - "grad_norm": 3.2549097537994385, - "learning_rate": 3.29453786556444e-08, - "loss": 0.27366238832473755, - "mean_token_accuracy": 0.9079047441482544, - "num_tokens": 33592813.0, - "step": 3755 - }, - { - "epoch": 2.8541033434650456, - "grad_norm": 1.3533412218093872, - "learning_rate": 3.260734930485926e-08, - "loss": 0.4412471652030945, - "mean_token_accuracy": 0.839799165725708, - "num_tokens": 33609765.0, - "step": 3756 - }, - { - "epoch": 2.8548632218844983, - "grad_norm": 2.876262903213501, - "learning_rate": 3.227105166521638e-08, - "loss": 0.3382536768913269, - "mean_token_accuracy": 0.879544734954834, - "num_tokens": 33614131.0, - "step": 3757 - }, - { - "epoch": 2.8556231003039514, - "grad_norm": 1.9969818592071533, - "learning_rate": 3.193648597274279e-08, - "loss": 0.24406743049621582, - "mean_token_accuracy": 0.9072264432907104, - "num_tokens": 33621867.0, - "step": 3758 - }, - { - "epoch": 2.8563829787234045, - "grad_norm": 2.934230089187622, - "learning_rate": 3.1603652462249e-08, - "loss": 0.0893428698182106, - "mean_token_accuracy": 0.9702994227409363, - "num_tokens": 33625133.0, - "step": 3759 - }, - { - "epoch": 2.857142857142857, - "grad_norm": 3.162353038787842, - "learning_rate": 3.127255136733093e-08, - "loss": 0.2535284459590912, - "mean_token_accuracy": 0.8997728824615479, - "num_tokens": 33629391.0, - "step": 3760 - }, - { - "epoch": 2.85790273556231, - "grad_norm": 1.3975396156311035, - "learning_rate": 3.094318292036824e-08, - "loss": 0.37006449699401855, - "mean_token_accuracy": 0.8666602373123169, - "num_tokens": 33644831.0, - "step": 3761 - }, - { - "epoch": 2.858662613981763, - "grad_norm": 1.4016542434692383, - "learning_rate": 3.061554735252325e-08, - "loss": 0.33619073033332825, - "mean_token_accuracy": 0.8836570978164673, - "num_tokens": 33660918.0, - "step": 3762 - }, - { - "epoch": 2.8594224924012157, - "grad_norm": 1.3213437795639038, - "learning_rate": 3.028964489374453e-08, - "loss": 0.29083719849586487, - "mean_token_accuracy": 0.9077234864234924, - "num_tokens": 33672778.0, - "step": 3763 - }, - { - "epoch": 2.860182370820669, - "grad_norm": 2.157179594039917, - "learning_rate": 2.9965475772762154e-08, - "loss": 0.35480785369873047, - "mean_token_accuracy": 0.8937191367149353, - "num_tokens": 33680991.0, - "step": 3764 - }, - { - "epoch": 2.8609422492401215, - "grad_norm": 3.584878921508789, - "learning_rate": 2.96430402170908e-08, - "loss": 0.34448280930519104, - "mean_token_accuracy": 0.878994345664978, - "num_tokens": 33685137.0, - "step": 3765 - }, - { - "epoch": 2.8617021276595747, - "grad_norm": 1.7320963144302368, - "learning_rate": 2.9322338453028066e-08, - "loss": 0.40042293071746826, - "mean_token_accuracy": 0.8563319444656372, - "num_tokens": 33694591.0, - "step": 3766 - }, - { - "epoch": 2.8624620060790273, - "grad_norm": 1.6684232950210571, - "learning_rate": 2.900337070565473e-08, - "loss": 0.4402884542942047, - "mean_token_accuracy": 0.8600190877914429, - "num_tokens": 33708467.0, - "step": 3767 - }, - { - "epoch": 2.86322188449848, - "grad_norm": 1.9484777450561523, - "learning_rate": 2.8686137198834784e-08, - "loss": 0.2297988086938858, - "mean_token_accuracy": 0.9216253161430359, - "num_tokens": 33715825.0, - "step": 3768 - }, - { - "epoch": 2.863981762917933, - "grad_norm": 1.4594624042510986, - "learning_rate": 2.8370638155215125e-08, - "loss": 0.2471354901790619, - "mean_token_accuracy": 0.9343935251235962, - "num_tokens": 33726774.0, - "step": 3769 - }, - { - "epoch": 2.8647416413373863, - "grad_norm": 1.75857412815094, - "learning_rate": 2.805687379622446e-08, - "loss": 0.3599606156349182, - "mean_token_accuracy": 0.8600481748580933, - "num_tokens": 33738487.0, - "step": 3770 - }, - { - "epoch": 2.865501519756839, - "grad_norm": 2.5933029651641846, - "learning_rate": 2.774484434207525e-08, - "loss": 0.3705040514469147, - "mean_token_accuracy": 0.8960624933242798, - "num_tokens": 33743954.0, - "step": 3771 - }, - { - "epoch": 2.8662613981762917, - "grad_norm": 2.339298963546753, - "learning_rate": 2.7434550011761763e-08, - "loss": 0.35568612813949585, - "mean_token_accuracy": 0.8733487725257874, - "num_tokens": 33750214.0, - "step": 3772 - }, - { - "epoch": 2.867021276595745, - "grad_norm": 2.2959485054016113, - "learning_rate": 2.712599102306035e-08, - "loss": 0.2672561705112457, - "mean_token_accuracy": 0.9030044078826904, - "num_tokens": 33756736.0, - "step": 3773 - }, - { - "epoch": 2.8677811550151975, - "grad_norm": 1.500349521636963, - "learning_rate": 2.681916759252917e-08, - "loss": 0.41401299834251404, - "mean_token_accuracy": 0.8844438195228577, - "num_tokens": 33769268.0, - "step": 3774 - }, - { - "epoch": 2.8685410334346506, - "grad_norm": 1.7837727069854736, - "learning_rate": 2.6514079935509586e-08, - "loss": 0.2668437957763672, - "mean_token_accuracy": 0.8956533670425415, - "num_tokens": 33777122.0, - "step": 3775 - }, - { - "epoch": 2.8693009118541033, - "grad_norm": 1.717192530632019, - "learning_rate": 2.6210728266123364e-08, - "loss": 0.25972551107406616, - "mean_token_accuracy": 0.883383572101593, - "num_tokens": 33785044.0, - "step": 3776 - }, - { - "epoch": 2.8700607902735564, - "grad_norm": 1.9367283582687378, - "learning_rate": 2.5909112797274093e-08, - "loss": 0.44500526785850525, - "mean_token_accuracy": 0.8556182980537415, - "num_tokens": 33794610.0, - "step": 3777 - }, - { - "epoch": 2.870820668693009, - "grad_norm": 1.4821012020111084, - "learning_rate": 2.560923374064772e-08, - "loss": 0.3385273218154907, - "mean_token_accuracy": 0.873454749584198, - "num_tokens": 33807602.0, - "step": 3778 - }, - { - "epoch": 2.871580547112462, - "grad_norm": 2.105130195617676, - "learning_rate": 2.531109130671061e-08, - "loss": 0.2996317446231842, - "mean_token_accuracy": 0.8943172693252563, - "num_tokens": 33814280.0, - "step": 3779 - }, - { - "epoch": 2.872340425531915, - "grad_norm": 2.1374971866607666, - "learning_rate": 2.501468570471066e-08, - "loss": 0.3201690912246704, - "mean_token_accuracy": 0.8778494596481323, - "num_tokens": 33821842.0, - "step": 3780 - }, - { - "epoch": 2.8731003039513676, - "grad_norm": 2.2370989322662354, - "learning_rate": 2.4720017142676745e-08, - "loss": 0.4030833840370178, - "mean_token_accuracy": 0.8520782589912415, - "num_tokens": 33830051.0, - "step": 3781 - }, - { - "epoch": 2.8738601823708207, - "grad_norm": 2.3659868240356445, - "learning_rate": 2.4427085827418706e-08, - "loss": 0.2570466697216034, - "mean_token_accuracy": 0.9111968874931335, - "num_tokens": 33835753.0, - "step": 3782 - }, - { - "epoch": 2.8746200607902734, - "grad_norm": 2.252115249633789, - "learning_rate": 2.4135891964526535e-08, - "loss": 0.373632550239563, - "mean_token_accuracy": 0.8691182136535645, - "num_tokens": 33842183.0, - "step": 3783 - }, - { - "epoch": 2.8753799392097266, - "grad_norm": 1.216013789176941, - "learning_rate": 2.3846435758372034e-08, - "loss": 0.3572605848312378, - "mean_token_accuracy": 0.8590090274810791, - "num_tokens": 33860538.0, - "step": 3784 - }, - { - "epoch": 2.8761398176291793, - "grad_norm": 2.739243268966675, - "learning_rate": 2.3558717412106025e-08, - "loss": 0.3257160782814026, - "mean_token_accuracy": 0.8806333541870117, - "num_tokens": 33866134.0, - "step": 3785 - }, - { - "epoch": 2.8768996960486324, - "grad_norm": 1.683767557144165, - "learning_rate": 2.3272737127660595e-08, - "loss": 0.3267333507537842, - "mean_token_accuracy": 0.9005235433578491, - "num_tokens": 33875630.0, - "step": 3786 - }, - { - "epoch": 2.877659574468085, - "grad_norm": 1.4830154180526733, - "learning_rate": 2.2988495105748245e-08, - "loss": 0.28507307171821594, - "mean_token_accuracy": 0.9133665561676025, - "num_tokens": 33887110.0, - "step": 3787 - }, - { - "epoch": 2.878419452887538, - "grad_norm": 1.7522467374801636, - "learning_rate": 2.2705991545859953e-08, - "loss": 0.45354849100112915, - "mean_token_accuracy": 0.8899869918823242, - "num_tokens": 33898735.0, - "step": 3788 - }, - { - "epoch": 2.879179331306991, - "grad_norm": 2.4311375617980957, - "learning_rate": 2.242522664626823e-08, - "loss": 0.3417064845561981, - "mean_token_accuracy": 0.8724955320358276, - "num_tokens": 33906031.0, - "step": 3789 - }, - { - "epoch": 2.8799392097264436, - "grad_norm": 2.44846510887146, - "learning_rate": 2.2146200604024614e-08, - "loss": 0.3186315596103668, - "mean_token_accuracy": 0.8888083696365356, - "num_tokens": 33911248.0, - "step": 3790 - }, - { - "epoch": 2.8806990881458967, - "grad_norm": 1.4528448581695557, - "learning_rate": 2.1868913614959963e-08, - "loss": 0.34161821007728577, - "mean_token_accuracy": 0.87728351354599, - "num_tokens": 33923786.0, - "step": 3791 - }, - { - "epoch": 2.8814589665653494, - "grad_norm": 1.8418529033660889, - "learning_rate": 2.1593365873685544e-08, - "loss": 0.2751237452030182, - "mean_token_accuracy": 0.9060730338096619, - "num_tokens": 33930983.0, - "step": 3792 - }, - { - "epoch": 2.8822188449848025, - "grad_norm": 1.5349152088165283, - "learning_rate": 2.131955757359111e-08, - "loss": 0.31487759947776794, - "mean_token_accuracy": 0.8839719891548157, - "num_tokens": 33942600.0, - "step": 3793 - }, - { - "epoch": 2.882978723404255, - "grad_norm": 2.317296266555786, - "learning_rate": 2.1047488906845715e-08, - "loss": 0.22481049597263336, - "mean_token_accuracy": 0.9269076585769653, - "num_tokens": 33947548.0, - "step": 3794 - }, - { - "epoch": 2.8837386018237083, - "grad_norm": 1.9512174129486084, - "learning_rate": 2.0777160064397727e-08, - "loss": 0.33469653129577637, - "mean_token_accuracy": 0.8800324201583862, - "num_tokens": 33955699.0, - "step": 3795 - }, - { - "epoch": 2.884498480243161, - "grad_norm": 1.3140486478805542, - "learning_rate": 2.050857123597455e-08, - "loss": 0.3801634609699249, - "mean_token_accuracy": 0.8677546977996826, - "num_tokens": 33972033.0, - "step": 3796 - }, - { - "epoch": 2.8852583586626137, - "grad_norm": 2.4413559436798096, - "learning_rate": 2.024172261008178e-08, - "loss": 0.4444601535797119, - "mean_token_accuracy": 0.8535408973693848, - "num_tokens": 33978859.0, - "step": 3797 - }, - { - "epoch": 2.886018237082067, - "grad_norm": 1.8970952033996582, - "learning_rate": 1.997661437400461e-08, - "loss": 0.29712194204330444, - "mean_token_accuracy": 0.8958410024642944, - "num_tokens": 33988416.0, - "step": 3798 - }, - { - "epoch": 2.88677811550152, - "grad_norm": 1.4225033521652222, - "learning_rate": 1.9713246713805588e-08, - "loss": 0.2251742035150528, - "mean_token_accuracy": 0.9059432744979858, - "num_tokens": 33998579.0, - "step": 3799 - }, - { - "epoch": 2.8875379939209727, - "grad_norm": 3.485994338989258, - "learning_rate": 1.9451619814326307e-08, - "loss": 0.2449614405632019, - "mean_token_accuracy": 0.9136157035827637, - "num_tokens": 34002108.0, - "step": 3800 - }, - { - "epoch": 2.8882978723404253, - "grad_norm": 1.7383781671524048, - "learning_rate": 1.91917338591871e-08, - "loss": 0.3420751690864563, - "mean_token_accuracy": 0.8810985088348389, - "num_tokens": 34010102.0, - "step": 3801 - }, - { - "epoch": 2.8890577507598785, - "grad_norm": 2.587632894515991, - "learning_rate": 1.893358903078568e-08, - "loss": 0.38646167516708374, - "mean_token_accuracy": 0.8570578098297119, - "num_tokens": 34016684.0, - "step": 3802 - }, - { - "epoch": 2.889817629179331, - "grad_norm": 1.2580358982086182, - "learning_rate": 1.867718551029768e-08, - "loss": 0.23658394813537598, - "mean_token_accuracy": 0.9092692136764526, - "num_tokens": 34029808.0, - "step": 3803 - }, - { - "epoch": 2.8905775075987843, - "grad_norm": 1.808404803276062, - "learning_rate": 1.842252347767748e-08, - "loss": 0.2760203778743744, - "mean_token_accuracy": 0.8876132965087891, - "num_tokens": 34038138.0, - "step": 3804 - }, - { - "epoch": 2.891337386018237, - "grad_norm": 1.6140836477279663, - "learning_rate": 1.8169603111656554e-08, - "loss": 0.3449614346027374, - "mean_token_accuracy": 0.8777539730072021, - "num_tokens": 34048093.0, - "step": 3805 - }, - { - "epoch": 2.89209726443769, - "grad_norm": 3.784487724304199, - "learning_rate": 1.791842458974402e-08, - "loss": 0.3181925415992737, - "mean_token_accuracy": 0.8902693390846252, - "num_tokens": 34051903.0, - "step": 3806 - }, - { - "epoch": 2.892857142857143, - "grad_norm": 1.726521372795105, - "learning_rate": 1.7668988088226922e-08, - "loss": 0.3940914273262024, - "mean_token_accuracy": 0.8877660036087036, - "num_tokens": 34062433.0, - "step": 3807 - }, - { - "epoch": 2.8936170212765955, - "grad_norm": 2.1862759590148926, - "learning_rate": 1.7421293782168837e-08, - "loss": 0.2806234061717987, - "mean_token_accuracy": 0.9004480838775635, - "num_tokens": 34068835.0, - "step": 3808 - }, - { - "epoch": 2.8943768996960486, - "grad_norm": 1.8618063926696777, - "learning_rate": 1.717534184541153e-08, - "loss": 0.3391259014606476, - "mean_token_accuracy": 0.8807502388954163, - "num_tokens": 34078044.0, - "step": 3809 - }, - { - "epoch": 2.8951367781155017, - "grad_norm": 2.19085431098938, - "learning_rate": 1.6931132450573873e-08, - "loss": 0.34228384494781494, - "mean_token_accuracy": 0.8653440475463867, - "num_tokens": 34084925.0, - "step": 3810 - }, - { - "epoch": 2.8958966565349544, - "grad_norm": 2.0328660011291504, - "learning_rate": 1.6688665769050704e-08, - "loss": 0.3773893117904663, - "mean_token_accuracy": 0.8646367788314819, - "num_tokens": 34092740.0, - "step": 3811 - }, - { - "epoch": 2.896656534954407, - "grad_norm": 2.489732265472412, - "learning_rate": 1.644794197101507e-08, - "loss": 0.2722119688987732, - "mean_token_accuracy": 0.9241745471954346, - "num_tokens": 34097475.0, - "step": 3812 - }, - { - "epoch": 2.8974164133738602, - "grad_norm": 2.709529161453247, - "learning_rate": 1.620896122541571e-08, - "loss": 0.2608666718006134, - "mean_token_accuracy": 0.9132722020149231, - "num_tokens": 34101961.0, - "step": 3813 - }, - { - "epoch": 2.898176291793313, - "grad_norm": 2.089813709259033, - "learning_rate": 1.5971723699979015e-08, - "loss": 0.3234292268753052, - "mean_token_accuracy": 0.9032332897186279, - "num_tokens": 34109427.0, - "step": 3814 - }, - { - "epoch": 2.898936170212766, - "grad_norm": 1.3891119956970215, - "learning_rate": 1.5736229561207072e-08, - "loss": 0.2506135404109955, - "mean_token_accuracy": 0.8997396230697632, - "num_tokens": 34121770.0, - "step": 3815 - }, - { - "epoch": 2.8996960486322187, - "grad_norm": 1.9386579990386963, - "learning_rate": 1.5502478974378788e-08, - "loss": 0.29841434955596924, - "mean_token_accuracy": 0.8915755748748779, - "num_tokens": 34130111.0, - "step": 3816 - }, - { - "epoch": 2.900455927051672, - "grad_norm": 1.601960301399231, - "learning_rate": 1.5270472103549317e-08, - "loss": 0.34736987948417664, - "mean_token_accuracy": 0.876467227935791, - "num_tokens": 34142053.0, - "step": 3817 - }, - { - "epoch": 2.9012158054711246, - "grad_norm": 2.42319393157959, - "learning_rate": 1.5040209111550075e-08, - "loss": 0.24774286150932312, - "mean_token_accuracy": 0.9127346873283386, - "num_tokens": 34146627.0, - "step": 3818 - }, - { - "epoch": 2.9019756838905773, - "grad_norm": 2.711033582687378, - "learning_rate": 1.4811690159988456e-08, - "loss": 0.30365103483200073, - "mean_token_accuracy": 0.8981214165687561, - "num_tokens": 34151735.0, - "step": 3819 - }, - { - "epoch": 2.9027355623100304, - "grad_norm": 3.105949640274048, - "learning_rate": 1.4584915409248113e-08, - "loss": 0.38369080424308777, - "mean_token_accuracy": 0.8762385845184326, - "num_tokens": 34156484.0, - "step": 3820 - }, - { - "epoch": 2.9034954407294835, - "grad_norm": 1.9705839157104492, - "learning_rate": 1.435988501848784e-08, - "loss": 0.33529043197631836, - "mean_token_accuracy": 0.8921652436256409, - "num_tokens": 34164241.0, - "step": 3821 - }, - { - "epoch": 2.904255319148936, - "grad_norm": 2.084878921508789, - "learning_rate": 1.413659914564297e-08, - "loss": 0.24922935664653778, - "mean_token_accuracy": 0.9262560606002808, - "num_tokens": 34169898.0, - "step": 3822 - }, - { - "epoch": 2.905015197568389, - "grad_norm": 1.593758225440979, - "learning_rate": 1.3915057947423705e-08, - "loss": 0.3691917657852173, - "mean_token_accuracy": 0.8785613775253296, - "num_tokens": 34181419.0, - "step": 3823 - }, - { - "epoch": 2.905775075987842, - "grad_norm": 1.772596001625061, - "learning_rate": 1.3695261579316776e-08, - "loss": 0.358150839805603, - "mean_token_accuracy": 0.8747056722640991, - "num_tokens": 34190872.0, - "step": 3824 - }, - { - "epoch": 2.9065349544072947, - "grad_norm": 2.1670494079589844, - "learning_rate": 1.3477210195583234e-08, - "loss": 0.30586451292037964, - "mean_token_accuracy": 0.8851495981216431, - "num_tokens": 34197353.0, - "step": 3825 - }, - { - "epoch": 2.907294832826748, - "grad_norm": 2.7168121337890625, - "learning_rate": 1.3260903949260107e-08, - "loss": 0.2924152612686157, - "mean_token_accuracy": 0.8947597146034241, - "num_tokens": 34201889.0, - "step": 3826 - }, - { - "epoch": 2.9080547112462005, - "grad_norm": 1.576528787612915, - "learning_rate": 1.3046342992159567e-08, - "loss": 0.33903738856315613, - "mean_token_accuracy": 0.8710857033729553, - "num_tokens": 34212640.0, - "step": 3827 - }, - { - "epoch": 2.9088145896656536, - "grad_norm": 1.3831605911254883, - "learning_rate": 1.2833527474868657e-08, - "loss": 0.2891062796115875, - "mean_token_accuracy": 0.8909540176391602, - "num_tokens": 34223917.0, - "step": 3828 - }, - { - "epoch": 2.9095744680851063, - "grad_norm": 2.075225353240967, - "learning_rate": 1.2622457546749567e-08, - "loss": 0.14886733889579773, - "mean_token_accuracy": 0.9509548544883728, - "num_tokens": 34228609.0, - "step": 3829 - }, - { - "epoch": 2.910334346504559, - "grad_norm": 2.658463478088379, - "learning_rate": 1.2413133355939356e-08, - "loss": 0.13472142815589905, - "mean_token_accuracy": 0.957228422164917, - "num_tokens": 34232011.0, - "step": 3830 - }, - { - "epoch": 2.911094224924012, - "grad_norm": 1.8684933185577393, - "learning_rate": 1.2205555049349394e-08, - "loss": 0.13954663276672363, - "mean_token_accuracy": 0.953221321105957, - "num_tokens": 34237643.0, - "step": 3831 - }, - { - "epoch": 2.9118541033434653, - "grad_norm": 1.799784779548645, - "learning_rate": 1.1999722772666478e-08, - "loss": 0.28668212890625, - "mean_token_accuracy": 0.9035641551017761, - "num_tokens": 34246593.0, - "step": 3832 - }, - { - "epoch": 2.912613981762918, - "grad_norm": 1.3970232009887695, - "learning_rate": 1.1795636670351718e-08, - "loss": 0.2589891254901886, - "mean_token_accuracy": 0.9162927865982056, - "num_tokens": 34257535.0, - "step": 3833 - }, - { - "epoch": 2.9133738601823707, - "grad_norm": 2.5260443687438965, - "learning_rate": 1.1593296885640259e-08, - "loss": 0.452729195356369, - "mean_token_accuracy": 0.8569157123565674, - "num_tokens": 34263834.0, - "step": 3834 - }, - { - "epoch": 2.914133738601824, - "grad_norm": 1.879526972770691, - "learning_rate": 1.1392703560542118e-08, - "loss": 0.3608126640319824, - "mean_token_accuracy": 0.8750635385513306, - "num_tokens": 34272156.0, - "step": 3835 - }, - { - "epoch": 2.9148936170212765, - "grad_norm": 1.9857182502746582, - "learning_rate": 1.1193856835841344e-08, - "loss": 0.28058698773384094, - "mean_token_accuracy": 0.8984638452529907, - "num_tokens": 34280438.0, - "step": 3836 - }, - { - "epoch": 2.9156534954407296, - "grad_norm": 1.9187198877334595, - "learning_rate": 1.0996756851096579e-08, - "loss": 0.3203415870666504, - "mean_token_accuracy": 0.8920673131942749, - "num_tokens": 34288330.0, - "step": 3837 - }, - { - "epoch": 2.9164133738601823, - "grad_norm": 1.6627569198608398, - "learning_rate": 1.0801403744639672e-08, - "loss": 0.30393654108047485, - "mean_token_accuracy": 0.8877602815628052, - "num_tokens": 34297701.0, - "step": 3838 - }, - { - "epoch": 2.9171732522796354, - "grad_norm": 1.4527947902679443, - "learning_rate": 1.0607797653577333e-08, - "loss": 0.33950865268707275, - "mean_token_accuracy": 0.8850067853927612, - "num_tokens": 34311995.0, - "step": 3839 - }, - { - "epoch": 2.917933130699088, - "grad_norm": 1.694217324256897, - "learning_rate": 1.0415938713789487e-08, - "loss": 0.33595266938209534, - "mean_token_accuracy": 0.878333568572998, - "num_tokens": 34322095.0, - "step": 3840 - }, - { - "epoch": 2.918693009118541, - "grad_norm": 2.3357045650482178, - "learning_rate": 1.0225827059930082e-08, - "loss": 0.2966959476470947, - "mean_token_accuracy": 0.893630862236023, - "num_tokens": 34328400.0, - "step": 3841 - }, - { - "epoch": 2.919452887537994, - "grad_norm": 1.9848041534423828, - "learning_rate": 1.0037462825427113e-08, - "loss": 0.4187622368335724, - "mean_token_accuracy": 0.8641717433929443, - "num_tokens": 34337203.0, - "step": 3842 - }, - { - "epoch": 2.920212765957447, - "grad_norm": 1.7696800231933594, - "learning_rate": 9.850846142481773e-09, - "loss": 0.34298282861709595, - "mean_token_accuracy": 0.8812298774719238, - "num_tokens": 34346584.0, - "step": 3843 - }, - { - "epoch": 2.9209726443768997, - "grad_norm": 2.6058225631713867, - "learning_rate": 9.665977142068738e-09, - "loss": 0.2776247262954712, - "mean_token_accuracy": 0.908215343952179, - "num_tokens": 34351472.0, - "step": 3844 - }, - { - "epoch": 2.9217325227963524, - "grad_norm": 2.4990663528442383, - "learning_rate": 9.482855953936443e-09, - "loss": 0.2577187418937683, - "mean_token_accuracy": 0.9113357663154602, - "num_tokens": 34357101.0, - "step": 3845 - }, - { - "epoch": 2.9224924012158056, - "grad_norm": 3.2842514514923096, - "learning_rate": 9.30148270660569e-09, - "loss": 0.23392081260681152, - "mean_token_accuracy": 0.9370708465576172, - "num_tokens": 34360674.0, - "step": 3846 - }, - { - "epoch": 2.9232522796352582, - "grad_norm": 2.2124083042144775, - "learning_rate": 9.121857527372157e-09, - "loss": 0.3026091456413269, - "mean_token_accuracy": 0.886944055557251, - "num_tokens": 34367471.0, - "step": 3847 - }, - { - "epoch": 2.9240121580547114, - "grad_norm": 1.6130470037460327, - "learning_rate": 8.943980542302777e-09, - "loss": 0.33204811811447144, - "mean_token_accuracy": 0.8805426359176636, - "num_tokens": 34377461.0, - "step": 3848 - }, - { - "epoch": 2.924772036474164, - "grad_norm": 1.6536617279052734, - "learning_rate": 8.767851876239075e-09, - "loss": 0.33671748638153076, - "mean_token_accuracy": 0.8811848163604736, - "num_tokens": 34386732.0, - "step": 3849 - }, - { - "epoch": 2.925531914893617, - "grad_norm": 1.9558135271072388, - "learning_rate": 8.59347165279495e-09, - "loss": 0.3325084447860718, - "mean_token_accuracy": 0.8823798894882202, - "num_tokens": 34395705.0, - "step": 3850 - }, - { - "epoch": 2.92629179331307, - "grad_norm": 2.2350621223449707, - "learning_rate": 8.420839994356666e-09, - "loss": 0.28383463621139526, - "mean_token_accuracy": 0.8957310914993286, - "num_tokens": 34402470.0, - "step": 3851 - }, - { - "epoch": 2.9270516717325226, - "grad_norm": 1.9859482049942017, - "learning_rate": 8.249957022084254e-09, - "loss": 0.2720850110054016, - "mean_token_accuracy": 0.9078607559204102, - "num_tokens": 34410536.0, - "step": 3852 - }, - { - "epoch": 2.9278115501519757, - "grad_norm": 1.3174400329589844, - "learning_rate": 8.080822855909832e-09, - "loss": 0.330660879611969, - "mean_token_accuracy": 0.8777779936790466, - "num_tokens": 34425639.0, - "step": 3853 - }, - { - "epoch": 2.928571428571429, - "grad_norm": 1.0108131170272827, - "learning_rate": 7.913437614538166e-09, - "loss": 0.3833892345428467, - "mean_token_accuracy": 0.8571817874908447, - "num_tokens": 34451572.0, - "step": 3854 - }, - { - "epoch": 2.9293313069908815, - "grad_norm": 1.347409725189209, - "learning_rate": 7.747801415446677e-09, - "loss": 0.3100135028362274, - "mean_token_accuracy": 0.9087803363800049, - "num_tokens": 34465396.0, - "step": 3855 - }, - { - "epoch": 2.930091185410334, - "grad_norm": 1.4636729955673218, - "learning_rate": 7.583914374885426e-09, - "loss": 0.32699912786483765, - "mean_token_accuracy": 0.87745201587677, - "num_tokens": 34477359.0, - "step": 3856 - }, - { - "epoch": 2.9308510638297873, - "grad_norm": 2.9707272052764893, - "learning_rate": 7.4217766078760185e-09, - "loss": 0.18189990520477295, - "mean_token_accuracy": 0.9301153421401978, - "num_tokens": 34481356.0, - "step": 3857 - }, - { - "epoch": 2.93161094224924, - "grad_norm": 2.3689684867858887, - "learning_rate": 7.261388228213817e-09, - "loss": 0.3382490873336792, - "mean_token_accuracy": 0.9132488965988159, - "num_tokens": 34487525.0, - "step": 3858 - }, - { - "epoch": 2.932370820668693, - "grad_norm": 2.3896703720092773, - "learning_rate": 7.102749348465166e-09, - "loss": 0.3891000747680664, - "mean_token_accuracy": 0.8888499736785889, - "num_tokens": 34493053.0, - "step": 3859 - }, - { - "epoch": 2.933130699088146, - "grad_norm": 3.2713520526885986, - "learning_rate": 6.945860079969058e-09, - "loss": 0.26146358251571655, - "mean_token_accuracy": 0.9090266227722168, - "num_tokens": 34496938.0, - "step": 3860 - }, - { - "epoch": 2.933890577507599, - "grad_norm": 2.9600296020507812, - "learning_rate": 6.790720532836026e-09, - "loss": 0.3506978750228882, - "mean_token_accuracy": 0.8768079280853271, - "num_tokens": 34501615.0, - "step": 3861 - }, - { - "epoch": 2.9346504559270516, - "grad_norm": 2.640066146850586, - "learning_rate": 6.6373308159495275e-09, - "loss": 0.39720577001571655, - "mean_token_accuracy": 0.8565619587898254, - "num_tokens": 34507438.0, - "step": 3862 - }, - { - "epoch": 2.9354103343465043, - "grad_norm": 1.9988795518875122, - "learning_rate": 6.485691036964003e-09, - "loss": 0.18736782670021057, - "mean_token_accuracy": 0.92908775806427, - "num_tokens": 34513939.0, - "step": 3863 - }, - { - "epoch": 2.9361702127659575, - "grad_norm": 1.0155757665634155, - "learning_rate": 6.3358013023062656e-09, - "loss": 0.2876095771789551, - "mean_token_accuracy": 0.8892107009887695, - "num_tokens": 34537761.0, - "step": 3864 - }, - { - "epoch": 2.9369300911854106, - "grad_norm": 1.1695115566253662, - "learning_rate": 6.1876617171743865e-09, - "loss": 0.2740359306335449, - "mean_token_accuracy": 0.9104942083358765, - "num_tokens": 34557302.0, - "step": 3865 - }, - { - "epoch": 2.9376899696048633, - "grad_norm": 2.36651349067688, - "learning_rate": 6.04127238553881e-09, - "loss": 0.2734505534172058, - "mean_token_accuracy": 0.897555947303772, - "num_tokens": 34563225.0, - "step": 3866 - }, - { - "epoch": 2.938449848024316, - "grad_norm": 3.6499621868133545, - "learning_rate": 5.896633410141239e-09, - "loss": 0.30723029375076294, - "mean_token_accuracy": 0.9058237075805664, - "num_tokens": 34567037.0, - "step": 3867 - }, - { - "epoch": 2.939209726443769, - "grad_norm": 2.1518232822418213, - "learning_rate": 5.753744892494639e-09, - "loss": 0.46499863266944885, - "mean_token_accuracy": 0.8726839423179626, - "num_tokens": 34576470.0, - "step": 3868 - }, - { - "epoch": 2.939969604863222, - "grad_norm": 2.8443753719329834, - "learning_rate": 5.612606932883513e-09, - "loss": 0.33730241656303406, - "mean_token_accuracy": 0.8874512910842896, - "num_tokens": 34582531.0, - "step": 3869 - }, - { - "epoch": 2.940729483282675, - "grad_norm": 1.732109546661377, - "learning_rate": 5.473219630364457e-09, - "loss": 0.28559115529060364, - "mean_token_accuracy": 0.894468367099762, - "num_tokens": 34591376.0, - "step": 3870 - }, - { - "epoch": 2.9414893617021276, - "grad_norm": 2.533249855041504, - "learning_rate": 5.335583082764495e-09, - "loss": 0.23319819569587708, - "mean_token_accuracy": 0.9174623489379883, - "num_tokens": 34596426.0, - "step": 3871 - }, - { - "epoch": 2.9422492401215807, - "grad_norm": 2.3505852222442627, - "learning_rate": 5.19969738668219e-09, - "loss": 0.3506584167480469, - "mean_token_accuracy": 0.8692278861999512, - "num_tokens": 34603058.0, - "step": 3872 - }, - { - "epoch": 2.9430091185410334, - "grad_norm": 1.9322142601013184, - "learning_rate": 5.065562637487365e-09, - "loss": 0.2503264546394348, - "mean_token_accuracy": 0.9093045592308044, - "num_tokens": 34610330.0, - "step": 3873 - }, - { - "epoch": 2.943768996960486, - "grad_norm": 2.398416519165039, - "learning_rate": 4.933178929321103e-09, - "loss": 0.3825327157974243, - "mean_token_accuracy": 0.8750842809677124, - "num_tokens": 34617198.0, - "step": 3874 - }, - { - "epoch": 2.9445288753799392, - "grad_norm": 2.3529703617095947, - "learning_rate": 4.802546355095472e-09, - "loss": 0.3877553343772888, - "mean_token_accuracy": 0.8654624223709106, - "num_tokens": 34624459.0, - "step": 3875 - }, - { - "epoch": 2.9452887537993924, - "grad_norm": 1.4786031246185303, - "learning_rate": 4.673665006492967e-09, - "loss": 0.42244911193847656, - "mean_token_accuracy": 0.8566311597824097, - "num_tokens": 34639359.0, - "step": 3876 - }, - { - "epoch": 2.946048632218845, - "grad_norm": 2.091810703277588, - "learning_rate": 4.546534973968175e-09, - "loss": 0.27733317017555237, - "mean_token_accuracy": 0.9033011794090271, - "num_tokens": 34646046.0, - "step": 3877 - }, - { - "epoch": 2.9468085106382977, - "grad_norm": 1.900180459022522, - "learning_rate": 4.4211563467452814e-09, - "loss": 0.44815146923065186, - "mean_token_accuracy": 0.8379489183425903, - "num_tokens": 34656329.0, - "step": 3878 - }, - { - "epoch": 2.947568389057751, - "grad_norm": 1.3837320804595947, - "learning_rate": 4.297529212820006e-09, - "loss": 0.33357739448547363, - "mean_token_accuracy": 0.8788042068481445, - "num_tokens": 34671303.0, - "step": 3879 - }, - { - "epoch": 2.9483282674772036, - "grad_norm": 1.3475737571716309, - "learning_rate": 4.175653658958501e-09, - "loss": 0.30217933654785156, - "mean_token_accuracy": 0.8932538032531738, - "num_tokens": 34685080.0, - "step": 3880 - }, - { - "epoch": 2.9490881458966567, - "grad_norm": 1.6425048112869263, - "learning_rate": 4.055529770698175e-09, - "loss": 0.4368054270744324, - "mean_token_accuracy": 0.8392083644866943, - "num_tokens": 34695104.0, - "step": 3881 - }, - { - "epoch": 2.9498480243161094, - "grad_norm": 1.729368805885315, - "learning_rate": 3.937157632346311e-09, - "loss": 0.28259193897247314, - "mean_token_accuracy": 0.9338148236274719, - "num_tokens": 34706664.0, - "step": 3882 - }, - { - "epoch": 2.9506079027355625, - "grad_norm": 1.8707934617996216, - "learning_rate": 3.820537326980622e-09, - "loss": 0.40049535036087036, - "mean_token_accuracy": 0.8617393374443054, - "num_tokens": 34715401.0, - "step": 3883 - }, - { - "epoch": 2.951367781155015, - "grad_norm": 1.7935676574707031, - "learning_rate": 3.7056689364503574e-09, - "loss": 0.3386167585849762, - "mean_token_accuracy": 0.8947521448135376, - "num_tokens": 34724093.0, - "step": 3884 - }, - { - "epoch": 2.952127659574468, - "grad_norm": 2.6346704959869385, - "learning_rate": 3.592552541374361e-09, - "loss": 0.1505163311958313, - "mean_token_accuracy": 0.9515544176101685, - "num_tokens": 34727908.0, - "step": 3885 - }, - { - "epoch": 2.952887537993921, - "grad_norm": 2.2813618183135986, - "learning_rate": 3.481188221142184e-09, - "loss": 0.3014339506626129, - "mean_token_accuracy": 0.8985507488250732, - "num_tokens": 34734037.0, - "step": 3886 - }, - { - "epoch": 2.9536474164133737, - "grad_norm": 2.482675313949585, - "learning_rate": 3.37157605391325e-09, - "loss": 0.3489428758621216, - "mean_token_accuracy": 0.8771353960037231, - "num_tokens": 34739874.0, - "step": 3887 - }, - { - "epoch": 2.954407294832827, - "grad_norm": 2.3721418380737305, - "learning_rate": 3.2637161166179654e-09, - "loss": 0.3582353889942169, - "mean_token_accuracy": 0.861088752746582, - "num_tokens": 34747007.0, - "step": 3888 - }, - { - "epoch": 2.9551671732522795, - "grad_norm": 2.1871862411499023, - "learning_rate": 3.1576084849563315e-09, - "loss": 0.30689212679862976, - "mean_token_accuracy": 0.8910759687423706, - "num_tokens": 34753361.0, - "step": 3889 - }, - { - "epoch": 2.9559270516717326, - "grad_norm": 2.1797537803649902, - "learning_rate": 3.0532532333987785e-09, - "loss": 0.3343493938446045, - "mean_token_accuracy": 0.880067765712738, - "num_tokens": 34760824.0, - "step": 3890 - }, - { - "epoch": 2.9566869300911853, - "grad_norm": 2.6021335124969482, - "learning_rate": 2.9506504351861644e-09, - "loss": 0.34991219639778137, - "mean_token_accuracy": 0.8728436231613159, - "num_tokens": 34766783.0, - "step": 3891 - }, - { - "epoch": 2.9574468085106385, - "grad_norm": 2.202974319458008, - "learning_rate": 2.849800162328664e-09, - "loss": 0.3138400912284851, - "mean_token_accuracy": 0.8995538949966431, - "num_tokens": 34773174.0, - "step": 3892 - }, - { - "epoch": 2.958206686930091, - "grad_norm": 1.687474250793457, - "learning_rate": 2.7507024856071595e-09, - "loss": 0.40479594469070435, - "mean_token_accuracy": 0.8831138610839844, - "num_tokens": 34785142.0, - "step": 3893 - }, - { - "epoch": 2.9589665653495443, - "grad_norm": 1.960195779800415, - "learning_rate": 2.6533574745718493e-09, - "loss": 0.3259456157684326, - "mean_token_accuracy": 0.8871631622314453, - "num_tokens": 34793001.0, - "step": 3894 - }, - { - "epoch": 2.959726443768997, - "grad_norm": 2.89237904548645, - "learning_rate": 2.557765197543638e-09, - "loss": 0.32338041067123413, - "mean_token_accuracy": 0.9038220047950745, - "num_tokens": 34797424.0, - "step": 3895 - }, - { - "epoch": 2.9604863221884496, - "grad_norm": 2.655599594116211, - "learning_rate": 2.4639257216127476e-09, - "loss": 0.2710941731929779, - "mean_token_accuracy": 0.8990030884742737, - "num_tokens": 34802629.0, - "step": 3896 - }, - { - "epoch": 2.961246200607903, - "grad_norm": 2.8153562545776367, - "learning_rate": 2.3718391126392735e-09, - "loss": 0.4303235411643982, - "mean_token_accuracy": 0.8491297960281372, - "num_tokens": 34807870.0, - "step": 3897 - }, - { - "epoch": 2.9620060790273555, - "grad_norm": 1.4196341037750244, - "learning_rate": 2.2815054352531842e-09, - "loss": 0.38827845454216003, - "mean_token_accuracy": 0.8595222234725952, - "num_tokens": 34823597.0, - "step": 3898 - }, - { - "epoch": 2.9627659574468086, - "grad_norm": 2.9653196334838867, - "learning_rate": 2.192924752854042e-09, - "loss": 0.2555926442146301, - "mean_token_accuracy": 0.9074755907058716, - "num_tokens": 34827781.0, - "step": 3899 - }, - { - "epoch": 2.9635258358662613, - "grad_norm": 1.4998196363449097, - "learning_rate": 2.106097127611284e-09, - "loss": 0.36219048500061035, - "mean_token_accuracy": 0.885735273361206, - "num_tokens": 34839234.0, - "step": 3900 - }, - { - "epoch": 2.9642857142857144, - "grad_norm": 1.718245029449463, - "learning_rate": 2.0210226204639414e-09, - "loss": 0.26162803173065186, - "mean_token_accuracy": 0.8963354825973511, - "num_tokens": 34848059.0, - "step": 3901 - }, - { - "epoch": 2.965045592705167, - "grad_norm": 2.0226235389709473, - "learning_rate": 1.9377012911203642e-09, - "loss": 0.3657612204551697, - "mean_token_accuracy": 0.8982006311416626, - "num_tokens": 34854617.0, - "step": 3902 - }, - { - "epoch": 2.96580547112462, - "grad_norm": 2.6306872367858887, - "learning_rate": 1.8561331980587738e-09, - "loss": 0.19888967275619507, - "mean_token_accuracy": 0.9320937395095825, - "num_tokens": 34859412.0, - "step": 3903 - }, - { - "epoch": 2.966565349544073, - "grad_norm": 1.2558201551437378, - "learning_rate": 1.7763183985269882e-09, - "loss": 0.39052504301071167, - "mean_token_accuracy": 0.8551221489906311, - "num_tokens": 34875603.0, - "step": 3904 - }, - { - "epoch": 2.967325227963526, - "grad_norm": 1.7441751956939697, - "learning_rate": 1.6982569485415879e-09, - "loss": 0.3208625912666321, - "mean_token_accuracy": 0.8973188996315002, - "num_tokens": 34884309.0, - "step": 3905 - }, - { - "epoch": 2.9680851063829787, - "grad_norm": 1.6294625997543335, - "learning_rate": 1.6219489028895808e-09, - "loss": 0.2818947732448578, - "mean_token_accuracy": 0.9170717000961304, - "num_tokens": 34894045.0, - "step": 3906 - }, - { - "epoch": 2.9688449848024314, - "grad_norm": 1.9129183292388916, - "learning_rate": 1.5473943151270155e-09, - "loss": 0.3932931423187256, - "mean_token_accuracy": 0.8591724038124084, - "num_tokens": 34903057.0, - "step": 3907 - }, - { - "epoch": 2.9696048632218845, - "grad_norm": 2.125586748123169, - "learning_rate": 1.474593237578703e-09, - "loss": 0.4141325056552887, - "mean_token_accuracy": 0.855269193649292, - "num_tokens": 34911138.0, - "step": 3908 - }, - { - "epoch": 2.9703647416413372, - "grad_norm": 2.039323329925537, - "learning_rate": 1.4035457213393278e-09, - "loss": 0.30452996492385864, - "mean_token_accuracy": 0.8897982835769653, - "num_tokens": 34918685.0, - "step": 3909 - }, - { - "epoch": 2.9711246200607904, - "grad_norm": 1.213478446006775, - "learning_rate": 1.3342518162728913e-09, - "loss": 0.3703617751598358, - "mean_token_accuracy": 0.8672454357147217, - "num_tokens": 34936658.0, - "step": 3910 - }, - { - "epoch": 2.971884498480243, - "grad_norm": 1.2648811340332031, - "learning_rate": 1.2667115710127131e-09, - "loss": 0.4004117250442505, - "mean_token_accuracy": 0.8572319149971008, - "num_tokens": 34955480.0, - "step": 3911 - }, - { - "epoch": 2.972644376899696, - "grad_norm": 2.34121036529541, - "learning_rate": 1.2009250329608757e-09, - "loss": 0.12352144718170166, - "mean_token_accuracy": 0.9538272619247437, - "num_tokens": 34959942.0, - "step": 3912 - }, - { - "epoch": 2.973404255319149, - "grad_norm": 1.5843939781188965, - "learning_rate": 1.1368922482887789e-09, - "loss": 0.27862548828125, - "mean_token_accuracy": 0.8930153846740723, - "num_tokens": 34969425.0, - "step": 3913 - }, - { - "epoch": 2.9741641337386016, - "grad_norm": 1.2919771671295166, - "learning_rate": 1.0746132619374184e-09, - "loss": 0.38437312841415405, - "mean_token_accuracy": 0.8620239496231079, - "num_tokens": 34987289.0, - "step": 3914 - }, - { - "epoch": 2.9749240121580547, - "grad_norm": 2.299374580383301, - "learning_rate": 1.0140881176165517e-09, - "loss": 0.3482919931411743, - "mean_token_accuracy": 0.8766785860061646, - "num_tokens": 34993701.0, - "step": 3915 - }, - { - "epoch": 2.975683890577508, - "grad_norm": 2.1415762901306152, - "learning_rate": 9.553168578049776e-10, - "loss": 0.3619397282600403, - "mean_token_accuracy": 0.8685888051986694, - "num_tokens": 35000430.0, - "step": 3916 - }, - { - "epoch": 2.9764437689969605, - "grad_norm": 1.1967521905899048, - "learning_rate": 8.982995237505343e-10, - "loss": 0.289741188287735, - "mean_token_accuracy": 0.9111574292182922, - "num_tokens": 35015151.0, - "step": 3917 - }, - { - "epoch": 2.977203647416413, - "grad_norm": 2.4301388263702393, - "learning_rate": 8.430361554701005e-10, - "loss": 0.3439575433731079, - "mean_token_accuracy": 0.8783204555511475, - "num_tokens": 35020729.0, - "step": 3918 - }, - { - "epoch": 2.9779635258358663, - "grad_norm": 1.7229973077774048, - "learning_rate": 7.895267917501503e-10, - "loss": 0.379913330078125, - "mean_token_accuracy": 0.8735131025314331, - "num_tokens": 35031484.0, - "step": 3919 - }, - { - "epoch": 2.978723404255319, - "grad_norm": 1.468673825263977, - "learning_rate": 7.377714701450877e-10, - "loss": 0.369578093290329, - "mean_token_accuracy": 0.8639857172966003, - "num_tokens": 35044582.0, - "step": 3920 - }, - { - "epoch": 2.979483282674772, - "grad_norm": 1.45562744140625, - "learning_rate": 6.877702269786346e-10, - "loss": 0.33700788021087646, - "mean_token_accuracy": 0.8805566430091858, - "num_tokens": 35058539.0, - "step": 3921 - }, - { - "epoch": 2.980243161094225, - "grad_norm": 1.483021855354309, - "learning_rate": 6.395230973443856e-10, - "loss": 0.4657078981399536, - "mean_token_accuracy": 0.8335970640182495, - "num_tokens": 35072770.0, - "step": 3922 - }, - { - "epoch": 2.981003039513678, - "grad_norm": 2.2210497856140137, - "learning_rate": 5.930301151033102e-10, - "loss": 0.3754214644432068, - "mean_token_accuracy": 0.8667312264442444, - "num_tokens": 35079930.0, - "step": 3923 - }, - { - "epoch": 2.9817629179331306, - "grad_norm": 1.8546303510665894, - "learning_rate": 5.48291312886251e-10, - "loss": 0.27907687425613403, - "mean_token_accuracy": 0.9037660360336304, - "num_tokens": 35089005.0, - "step": 3924 - }, - { - "epoch": 2.9825227963525833, - "grad_norm": 2.201045513153076, - "learning_rate": 5.053067220925356e-10, - "loss": 0.27560052275657654, - "mean_token_accuracy": 0.9001410603523254, - "num_tokens": 35095726.0, - "step": 3925 - }, - { - "epoch": 2.9832826747720365, - "grad_norm": 1.4042561054229736, - "learning_rate": 4.640763728908093e-10, - "loss": 0.33435091376304626, - "mean_token_accuracy": 0.9042688608169556, - "num_tokens": 35108469.0, - "step": 3926 - }, - { - "epoch": 2.9840425531914896, - "grad_norm": 1.213336706161499, - "learning_rate": 4.246002942173699e-10, - "loss": 0.28249555826187134, - "mean_token_accuracy": 0.8767675161361694, - "num_tokens": 35124765.0, - "step": 3927 - }, - { - "epoch": 2.9848024316109423, - "grad_norm": 1.9213181734085083, - "learning_rate": 3.868785137786657e-10, - "loss": 0.22949065268039703, - "mean_token_accuracy": 0.9307032823562622, - "num_tokens": 35131459.0, - "step": 3928 - }, - { - "epoch": 2.985562310030395, - "grad_norm": 1.7959866523742676, - "learning_rate": 3.509110580490749e-10, - "loss": 0.2500322461128235, - "mean_token_accuracy": 0.9240894913673401, - "num_tokens": 35138772.0, - "step": 3929 - }, - { - "epoch": 2.986322188449848, - "grad_norm": 1.6845020055770874, - "learning_rate": 3.166979522717384e-10, - "loss": 0.3233460485935211, - "mean_token_accuracy": 0.8901629447937012, - "num_tokens": 35148787.0, - "step": 3930 - }, - { - "epoch": 2.987082066869301, - "grad_norm": 1.60831618309021, - "learning_rate": 2.842392204591149e-10, - "loss": 0.28861671686172485, - "mean_token_accuracy": 0.8791468143463135, - "num_tokens": 35158830.0, - "step": 3931 - }, - { - "epoch": 2.987841945288754, - "grad_norm": 2.2622485160827637, - "learning_rate": 2.5353488539187066e-10, - "loss": 0.35594597458839417, - "mean_token_accuracy": 0.8696492910385132, - "num_tokens": 35165616.0, - "step": 3932 - }, - { - "epoch": 2.9886018237082066, - "grad_norm": 1.8257495164871216, - "learning_rate": 2.24584968619157e-10, - "loss": 0.35592517256736755, - "mean_token_accuracy": 0.8911738395690918, - "num_tokens": 35174892.0, - "step": 3933 - }, - { - "epoch": 2.9893617021276597, - "grad_norm": 1.8350274562835693, - "learning_rate": 1.9738949045972068e-10, - "loss": 0.1599535346031189, - "mean_token_accuracy": 0.9352525472640991, - "num_tokens": 35181249.0, - "step": 3934 - }, - { - "epoch": 2.9901215805471124, - "grad_norm": 1.3198978900909424, - "learning_rate": 1.7194846999996073e-10, - "loss": 0.24172186851501465, - "mean_token_accuracy": 0.9095510840415955, - "num_tokens": 35192328.0, - "step": 3935 - }, - { - "epoch": 2.990881458966565, - "grad_norm": 1.6335922479629517, - "learning_rate": 1.4826192509559412e-10, - "loss": 0.4396868348121643, - "mean_token_accuracy": 0.8414689898490906, - "num_tokens": 35204736.0, - "step": 3936 - }, - { - "epoch": 2.9916413373860182, - "grad_norm": 2.0894503593444824, - "learning_rate": 1.2632987237054527e-10, - "loss": 0.2892245948314667, - "mean_token_accuracy": 0.9252790212631226, - "num_tokens": 35211186.0, - "step": 3937 - }, - { - "epoch": 2.9924012158054714, - "grad_norm": 2.221811294555664, - "learning_rate": 1.061523272177789e-10, - "loss": 0.40185099840164185, - "mean_token_accuracy": 0.8510832190513611, - "num_tokens": 35220326.0, - "step": 3938 - }, - { - "epoch": 2.993161094224924, - "grad_norm": 1.7605009078979492, - "learning_rate": 8.772930379846723e-11, - "loss": 0.38544684648513794, - "mean_token_accuracy": 0.8694577217102051, - "num_tokens": 35229889.0, - "step": 3939 - }, - { - "epoch": 2.9939209726443767, - "grad_norm": 2.6683199405670166, - "learning_rate": 7.106081504254514e-11, - "loss": 0.16490477323532104, - "mean_token_accuracy": 0.9414010047912598, - "num_tokens": 35233835.0, - "step": 3940 - }, - { - "epoch": 2.99468085106383, - "grad_norm": 2.2280800342559814, - "learning_rate": 5.6146872648987774e-11, - "loss": 0.41871631145477295, - "mean_token_accuracy": 0.8475867509841919, - "num_tokens": 35241042.0, - "step": 3941 - }, - { - "epoch": 2.9954407294832825, - "grad_norm": 2.169602870941162, - "learning_rate": 4.298748708470024e-11, - "loss": 0.3991228938102722, - "mean_token_accuracy": 0.8692910671234131, - "num_tokens": 35248427.0, - "step": 3942 - }, - { - "epoch": 2.9962006079027357, - "grad_norm": 2.665966033935547, - "learning_rate": 3.158266758562789e-11, - "loss": 0.25984981656074524, - "mean_token_accuracy": 0.9204732179641724, - "num_tokens": 35253086.0, - "step": 3943 - }, - { - "epoch": 2.9969604863221884, - "grad_norm": 1.8087493181228638, - "learning_rate": 2.1932422155923618e-11, - "loss": 0.41246354579925537, - "mean_token_accuracy": 0.8548201322555542, - "num_tokens": 35263360.0, - "step": 3944 - }, - { - "epoch": 2.9977203647416415, - "grad_norm": 2.6384191513061523, - "learning_rate": 1.4036757568502978e-11, - "loss": 0.32927870750427246, - "mean_token_accuracy": 0.8796735405921936, - "num_tokens": 35269214.0, - "step": 3945 - }, - { - "epoch": 2.998480243161094, - "grad_norm": 1.2011899948120117, - "learning_rate": 7.89567936476665e-12, - "loss": 0.2989211678504944, - "mean_token_accuracy": 0.8949509859085083, - "num_tokens": 35283851.0, - "step": 3946 - }, - { - "epoch": 2.999240121580547, - "grad_norm": 1.6725144386291504, - "learning_rate": 3.509191854877969e-12, - "loss": 0.30066749453544617, - "mean_token_accuracy": 0.9032993316650391, - "num_tokens": 35300894.0, - "step": 3947 - }, - { - "epoch": 3.0, - "grad_norm": 2.00422739982605, - "learning_rate": 8.77298117762937e-13, - "loss": 0.4101974368095398, - "mean_token_accuracy": 0.8702684640884399, - "num_tokens": 35309034.0, - "step": 3948 - } - ], - "logging_steps": 1.0, - "max_steps": 3948, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 1000, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 3.848873914830684e+17, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoint-3948/training_args.bin b/checkpoint-3948/training_args.bin deleted file mode 100644 index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000 --- a/checkpoint-3948/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 -size 6225 diff --git a/config.json b/config.json deleted file mode 100644 index c351e5fb52f50ea6e07b40981aef81c80f9df7e4..0000000000000000000000000000000000000000 --- a/config.json +++ /dev/null @@ -1,71 +0,0 @@ -{ - "architectures": [ - "Qwen3ForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": null, - "dtype": "float32", - "eos_token_id": 151645, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 9728, - "layer_types": [ - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention", - "full_attention" - ], - "max_position_embeddings": 262144, - "max_window_layers": 36, - "model_type": "qwen3", - "num_attention_heads": 32, - "num_hidden_layers": 36, - "num_key_value_heads": 8, - "pad_token_id": 151662, - "rms_norm_eps": 1e-06, - "rope_parameters": { - "rope_theta": 5000000, - "rope_type": "default" - }, - "sliding_window": null, - "tie_word_embeddings": true, - "transformers_version": "5.5.3", - "use_cache": false, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/generation_config.json b/generation_config.json deleted file mode 100644 index 2104b83493c2833855e8fe32a7a784805ab5c2ee..0000000000000000000000000000000000000000 --- a/generation_config.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "do_sample": true, - "eos_token_id": [ - 151645, - 151643 - ], - "pad_token_id": 151662, - "temperature": 0.7, - "top_k": 20, - "top_p": 0.8, - "transformers_version": "5.5.3" -} diff --git a/model.safetensors b/model.safetensors deleted file mode 100644 index f787ad62bc7ccc577c324b6d71689c0739123f0c..0000000000000000000000000000000000000000 --- a/model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7db19800bbcf792dcb25dea9b5ae39f4e934a0d56f64ed6f74d7d89e87ae928 -size 17645743048 diff --git a/tokenizer.json b/tokenizer.json deleted file mode 100644 index c7afbed2efcdf019f88ab0572ec29d3bf595dfe2..0000000000000000000000000000000000000000 --- a/tokenizer.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 -size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json deleted file mode 100644 index 4e47e52c4e7f0b2bcf2103a878790216f3f6436d..0000000000000000000000000000000000000000 --- a/tokenizer_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "add_prefix_space": false, - "backend": "tokenizers", - "bos_token": null, - "clean_up_tokenization_spaces": false, - "eos_token": "<|im_end|>", - "errors": "replace", - "extra_special_tokens": [ - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>" - ], - "is_local": false, - "model_max_length": 1010000, - "pad_token": "<|fim_pad|>", - "split_special_tokens": false, - "tokenizer_class": "Qwen2Tokenizer", - "unk_token": null -} diff --git a/training_args.bin b/training_args.bin deleted file mode 100644 index 2fc4f538d721f958cdceda5408f2f4e1a35f4210..0000000000000000000000000000000000000000 --- a/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb9e429a6dba8782c1beb1411b31fa91f0c01ec6e0b1441e21d679f8a8b2c021 -size 6225